In [25]:
# Cell 01: Setup and Imports

# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (10, 6)

In [26]:
# Cell 02: Data Loading and Basic Inspection

print("\nLOADING DATASET...")
df = pd.read_csv('../data/raw/world_happiness.csv', sep=';', decimal=',')

print(f"Dataset Loaded Successfully!")

print("\nDATASET OVERVIEW:")
print(f"Dataset Shape: {df.shape} ") # ( Row * Columns )
print(f"Memory Usage: {df.memory_usage(deep=True).sum() /1024**2:.2f} MB")
# df.memory_usage(deep=True) - Calculates memory usage for each column
# .sum() - Sums up the memory usage of all columns
# / 1024**2 - Converts bytes to megabytes (1024² = 1,048,576 bytes per MB)
# :.2f - Formats the result to 2 decimal places

print(f"Date Range: {df['Year'].min()} - {df['Year'].max()}")

print(f"Countries Covered: {df['Country'].nunique()}")
# .nunique() - Counts the number of unique/distinct values

# Displaying basic info
print(f"\nDATASET INFO:")
print(df.info())

print(f"\nFIRST 5 ROWS:")
print(df.head())

print(f"\nLAST 5 ROWS:")
print(df.tail())


LOADING DATASET...
Dataset Loaded Successfully!

DATASET OVERVIEW:
Dataset Shape: (1502, 11) 
Memory Usage: 0.29 MB
Date Range: 2015 - 2024
Countries Covered: 175

DATASET INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1502 entries, 0 to 1501
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ranking                       1502 non-null   int64  
 1   Country                       1502 non-null   object 
 2   Regional indicator            1499 non-null   object 
 3   Happiness score               1502 non-null   float64
 4   GDP per capita                1502 non-null   float64
 5   Social support                1502 non-null   float64
 6   Healthy life expectancy       1502 non-null   int64  
 7   Freedom to make life choices  1502 non-null   float64
 8   Generosity                    1502 non-null   float64
 9   Perceptions of corruption     1502 non-null   float64
 10  Y

In [27]:
# Cell 03: COLUMN ANALYSIS
print("\nColumn Details")

for i, col in enumerate (df.columns,1):
    print(f"{i:2d}. {col}")
    if df[col].dtype in ['object']:
        unique_count = df[col].nunique()
        print(f"Type: {df[col].dtype} | Unique values: {unique_count}")
        if unique_count <= 10:
            print(f"Values: {list(df[col].unique())}")
        else:
            print(f"Sample Values: {list(df[col].unique()[:5])}...")
    else:
        print(f"Type: {df[col].dtype} | Range: {df[col].min():.2f} to {df[col].max():.2f}")
    print()
        


Column Details
 1. Ranking
Type: int64 | Range: 1.00 to 158.00

 2. Country
Type: object | Unique values: 175
Sample Values: ['Switzerland', 'Iceland', 'Denmark', 'Norway', 'Canada']...

 3. Regional indicator
Type: object | Unique values: 10
Values: ['Western Europe', 'North America and ANZ', 'Middle East and North Africa', 'Latin America and Caribbean', 'Southeast Asia', 'Central and Eastern Europe', 'East Asia', 'Commonwealth of Independent States', 'Sub-Saharan Africa', 'South Asia', nan]

 4. Happiness score
Type: float64 | Range: 1.72 to 7.84

 5. GDP per capita
Type: float64 | Range: 0.00 to 10.00

 6. Social support
Type: float64 | Range: 0.00 to 1.00

 7. Healthy life expectancy
Type: int64 | Range: 39.00 to 85.00

 8. Freedom to make life choices
Type: float64 | Range: 0.00 to 1.00

 9. Generosity
Type: float64 | Range: 0.00 to 1.00

10. Perceptions of corruption
Type: float64 | Range: 0.00 to 1.00

11. Year
Type: int64 | Range: 2015.00 to 2024.00



In [28]:
# Cell 04: MISSING VALUES ANALYSIS
# Checking for null values and calculate missing data percentages

print("\nMISSING VALUES ANALYSIS")
missing_data = df.isnull().sum()  # A pandas Series showing the count of missing values for each column
missing_percent = (missing_data/ len(df)) * 100 # : A pandas Series showing the percentage of missing values for each column

missing_df = pd.DataFrame({
    'Column': missing_data.index,        # Column names
    'Missing Count': missing_data.values, # Count of missing values
    'Missing %': missing_percent.values   # Percentage of missing values
})

print(missing_df)

if missing_data.sum() > 0:
    print(f"\nTotal Missing Values: {missing_data.sum()}")
else:
    print(f"\nNo Missing Values")


MISSING VALUES ANALYSIS
                          Column  Missing Count  Missing %
0                        Ranking              0   0.000000
1                        Country              0   0.000000
2             Regional indicator              3   0.199734
3                Happiness score              0   0.000000
4                 GDP per capita              0   0.000000
5                 Social support              0   0.000000
6        Healthy life expectancy              0   0.000000
7   Freedom to make life choices              0   0.000000
8                     Generosity              0   0.000000
9      Perceptions of corruption              0   0.000000
10                          Year              0   0.000000

Total Missing Values: 3


In [29]:
# Cell 05: Statistical Summary
# Generating descriptive statistics for numerical variables

print("\nSTATISTICAL VARIABLE")

print(df.describe())

print("\nDETAILED STATISTICS FOR KEY VARIABLES")
key_vars = ['Happiness score', 'GDP per capita', 'Social support', 
           'Healthy life expectancy', 'Freedom to make life choices', 
           'Generosity', 'Perceptions of corruption']

for var in key_vars:
    #It checks if the current variable name stored in var actually exists as a column inside the DataFrame (df).
    if var in df.columns:
        print(f"\n{var}")
        print(f"Mean: {df[var].mean():.3f}")
        print(f"Median: {df[var].median():.3f}")
        print(f"Standard Deviation: {df[var].std():.3f}")
        print(f"Min: {df[var].min():.3f}")
        print(f"Max: {df[var].max():.3f}")
        print(f"Skewness: {df[var].skew():.3f}")


STATISTICAL VARIABLE
           Ranking  Happiness score  GDP per capita  Social support  \
count  1502.000000      1502.000000     1502.000000     1502.000000   
mean     76.035286         5.448857        6.107178        0.691842   
std      43.865013         1.125638        2.499571        0.212647   
min       1.000000         1.721000        0.000000        0.000000   
25%      38.000000         4.593425        4.375967        0.564507   
50%      76.000000         5.469650        6.305600        0.738190   
75%     114.000000         6.278450        8.047867        0.861528   
max     158.000000         7.842100       10.000000        1.000000   

       Healthy life expectancy  Freedom to make life choices   Generosity  \
count              1502.000000                   1502.000000  1502.000000   
mean                 66.670439                      0.658935     0.320369   
std                   7.671376                      0.216441     0.172669   
min                  39.000000

In [30]:
# Cell 06: TEMPORAL ANALYSIS
# EXAMINING DATA COVERAGE ACROSS YEARS AND IDENTIFYING PATTERNS OVER TIME
print(f"\nTEMPORAL ANALYSIS")
print(f"Years covered: {sorted(df['Year'].unique())}")

print(f"\nData Points Per Year")
year_counts = df['Year'].value_counts().sort_index()
for year, count in year_counts.items():
    print(f"{year}: {count} countries")

print(f"\nAverage countries per year: {year_counts.mean():.1f}")


TEMPORAL ANALYSIS
Years covered: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]

Data Points Per Year
2015: 158 countries
2016: 157 countries
2017: 155 countries
2018: 155 countries
2019: 155 countries
2020: 152 countries
2021: 148 countries
2022: 145 countries
2023: 137 countries
2024: 140 countries

Average countries per year: 150.2


In [31]:
# Cell 07: GEOGRAPHIC ANALYSIS
# ANALYZING COUNTRY AND REGIONAL DISTRIBUTION IN THE DATASET
print(f"\nGEOGRAPHIC ANALYSIS")
print(f"Total Unique Countries: {df['Country'].nunique()}")
print(f"Total Unique Regions: {df['Regional indicator'].nunique()}")

print('\nCountries per region:')
region_counts = df['Regional indicator'].value_counts()
for region, count in region_counts.items():
    unique_countries = df[df['Regional indicator'] == region]['Country'].nunique()
    print(f"{region}: {unique_countries} countries ({count} data points) ")

print(f"\nMost reporesented countries (across all years)")
country_counts = df['Country'].value_counts().head(10)
for country, count in country_counts.items():
    print(f"{country} = {count} years")
    


GEOGRAPHIC ANALYSIS
Total Unique Countries: 175
Total Unique Regions: 10

Countries per region:
Sub-Saharan Africa: 49 countries (370 data points) 
Western Europe: 38 countries (232 data points) 
Latin America and Caribbean: 25 countries (209 data points) 
Middle East and North Africa: 28 countries (189 data points) 
Central and Eastern Europe: 20 countries (141 data points) 
Commonwealth of Independent States: 12 countries (103 data points) 
Southeast Asia: 10 countries (82 data points) 
East Asia: 11 countries (68 data points) 
South Asia: 7 countries (65 data points) 
North America and ANZ: 4 countries (40 data points) 

Most reporesented countries (across all years)
Switzerland = 10 years
Iceland = 10 years
Denmark = 10 years
Norway = 10 years
Canada = 10 years
Finland = 10 years
Netherlands = 10 years
Sweden = 10 years
New Zealand = 10 years
Australia = 10 years


In [32]:
# Cell 08: HAPPINESS DEEP SCORE DEEP DIVE
# Detailed analysis of the target variable (happiness score)

print(f"\nHAPPINESS SCORE DEEP DIVE:")
happiness_col = 'Happiness score'

print(f"Happiness Score Distribution")
print(f"Range: {df[happiness_col].min():.3f} - {df[happiness_col].max():.3f}")
print(f"Global Average: {df[happiness_col].mean():.3f}")
print(f"Global Median: {df[happiness_col].median():.3f}")
print(f"Standard Deviation: {df[happiness_col].std():.3f}")

# Percentile
print(f"\nHappiness Score Percentile")
for p in [10,25,50,75,90]:
    value = df[happiness_col].quantile(p/100)
    print(f" {p}th percentile: {value:.3f}")


HAPPINESS SCORE DEEP DIVE:
Happiness Score Distribution
Range: 1.721 - 7.842
Global Average: 5.449
Global Median: 5.470
Standard Deviation: 1.126

Happiness Score Percentile
 10th percentile: 3.957
 25th percentile: 4.593
 50th percentile: 5.470
 75th percentile: 6.278
 90th percentile: 6.976
