In [6]:
# Cell 01: Setup and Imports

# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence only the "this API will change" noise that plotting/data libraries
# tend to emit. A blanket filterwarnings('ignore') would also hide warnings
# that point at real data problems (e.g. RuntimeWarning from invalid
# arithmetic or pandas parsing warnings), so those are left visible.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set up plotting style
plt.style.use('default')                   # start from matplotlib's default style sheet
sns.set_palette("husl")                    # seaborn colour palette used by later plots
plt.rcParams['figure.figsize'] = (10, 6)   # default figure size in inches (width, height)

In [7]:
# Cell 02: Data Loading and Basic Inspection
#
# Reads the raw CSV into `df` and prints a quick overview: shape, memory
# footprint, span of the 'Year' column, number of distinct countries,
# the per-column dtype/non-null report, and the first and last five rows.

print("\nLOADING DATASET...")
# The file is semicolon-delimited and uses a comma as the decimal mark.
df = pd.read_csv('../data/raw/world_happiness.csv', sep=';', decimal=',')

print("Dataset Loaded Successfully!")

print("\nDATASET OVERVIEW:")
print(f"Dataset Shape: {df.shape} ")  # (rows, columns)

# memory_usage(deep=True) reports bytes per column (deep=True also measures
# the contents of object columns); summing and dividing by 1024**2 gives MB.
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Smallest and largest value found in the 'Year' column.
print(f"Date Range: {df['Year'].min()} - {df['Year'].max()}")

# nunique() counts distinct values in the column.
print(f"Countries Covered: {df['Country'].nunique()}")

# Displaying basic info
print("\nDATASET INFO:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

print("\nFIRST 5 ROWS:")
print(df.head())

print("\nLAST 5 ROWS:")
print(df.tail())


LOADING DATASET...
Dataset Loaded Successfully!

DATASET OVERVIEW:
Dataset Shape: (1502, 11) 
Memory Usage: 0.29 MB
Date Range: 2015 - 2024
Countries Covered: 175

DATASET INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1502 entries, 0 to 1501
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ranking                       1502 non-null   int64  
 1   Country                       1502 non-null   object 
 2   Regional indicator            1499 non-null   object 
 3   Happiness score               1502 non-null   float64
 4   GDP per capita                1502 non-null   float64
 5   Social support                1502 non-null   float64
 6   Healthy life expectancy       1502 non-null   int64  
 7   Freedom to make life choices  1502 non-null   float64
 8   Generosity                    1502 non-null   float64
 9   Perceptions of corruption     1502 non-null   float64
 10  Y

In [8]:
# Cell 03: COLUMN ANALYSIS
#
# Prints a one-entry-per-column summary of `df`:
#   * numeric columns     -> dtype plus min/max range
#   * every other column  -> dtype, number of distinct non-missing values, and
#                            either all of them (<= 10) or the first five.
print("\nColumn Details")

for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")
    if pd.api.types.is_numeric_dtype(df[col]):
        # Only numeric columns can be formatted with :.2f; testing for
        # "numeric" rather than "not object" keeps categorical, string-dtype
        # and datetime columns out of this branch.
        print(f"Type: {df[col].dtype} | Range: {df[col].min():.2f} to {df[col].max():.2f}")
    else:
        # Drop missing entries before collecting distinct values so that the
        # reported count and the printed list agree (unique() would otherwise
        # include NaN while nunique() leaves it out).
        unique_values = df[col].dropna().unique()
        unique_count = len(unique_values)
        print(f"Type: {df[col].dtype} | Unique values: {unique_count}")
        if unique_count <= 10:
            print(f"Values: {list(unique_values)}")
        else:
            print(f"Sample Values: {list(unique_values[:5])}...")
    print()
        


Column Details
 1. Ranking
Type: int64 | Range: 1.00 to 158.00

 2. Country
Type: object | Unique values: 175
Sample Values: ['Switzerland', 'Iceland', 'Denmark', 'Norway', 'Canada']...

 3. Regional indicator
Type: object | Unique values: 10
Values: ['Western Europe', 'North America and ANZ', 'Middle East and North Africa', 'Latin America and Caribbean', 'Southeast Asia', 'Central and Eastern Europe', 'East Asia', 'Commonwealth of Independent States', 'Sub-Saharan Africa', 'South Asia', nan]

 4. Happiness score
Type: float64 | Range: 1.72 to 7.84

 5. GDP per capita
Type: float64 | Range: 0.00 to 10.00

 6. Social support
Type: float64 | Range: 0.00 to 1.00

 7. Healthy life expectancy
Type: int64 | Range: 39.00 to 85.00

 8. Freedom to make life choices
Type: float64 | Range: 0.00 to 1.00

 9. Generosity
Type: float64 | Range: 0.00 to 1.00

10. Perceptions of corruption
Type: float64 | Range: 0.00 to 1.00

11. Year
Type: int64 | Range: 2015.00 to 2024.00



In [9]:
# Cell 04: MISSING VALUES ANALYSIS
# Tabulates, for every column of `df`, how many entries are null and what
# share of all rows that represents, then prints an overall total.

print("\nMISSING VALUES ANALYSIS")

# Per-column null count (Series indexed by column name).
missing_data = df.isna().sum()
# Same counts expressed as a percentage of the number of rows.
missing_percent = (missing_data / len(df)) * 100

# Flatten both Series into a plain three-column table.
missing_df = pd.DataFrame(
    {
        'Column': missing_data.index,
        'Missing Count': missing_data.to_numpy(),
        'Missing %': missing_percent.to_numpy(),
    }
)

print(missing_df)

# Counts are never negative, so "any non-zero count" is the same condition
# as "the total is greater than zero".
if not missing_data.any():
    print("\nNo Missing Values")
else:
    print(f"\nTotal Missing Values: {missing_data.sum()}")


MISSING VALUES ANALYSIS
                          Column  Missing Count  Missing %
0                        Ranking              0   0.000000
1                        Country              0   0.000000
2             Regional indicator              3   0.199734
3                Happiness score              0   0.000000
4                 GDP per capita              0   0.000000
5                 Social support              0   0.000000
6        Healthy life expectancy              0   0.000000
7   Freedom to make life choices              0   0.000000
8                     Generosity              0   0.000000
9      Perceptions of corruption              0   0.000000
10                          Year              0   0.000000

Total Missing Values: 3
