<a href="https://colab.research.google.com/github/carogaltier/world-happiness-report/blob/main/World_Happiness_Report_2020_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd

In [None]:
# Input CSV for each report year, newest first (adjust the paths as needed).
# The years are generated in the order 2024, 2023, 2022, 2021, 2020.
files = {year: f"/content/WHR{year}.csv" for year in range(2024, 2019, -1)}

In [None]:
# Function to load and process each file
def process_file(year, path):
    """Read one yearly CSV and reduce it to the harmonised set of columns.

    Parameters
    ----------
    year : int
        Report year. 2023 and 2024 are read as-is; 2022 has its headers renamed
        to the common names first; every other value (2020 and 2021 in this
        notebook) is handled as a file that carries a 'Regional indicator'
        column, which is kept under the name 'Sub-Region'.
    path : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The selected columns followed by a constant 'Year' column.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from the file.
    """
    print(f"Processing file for year: {year} from {path}")  # Debug: Output which file is being processed
    frame = pd.read_csv(path)

    # Score columns common to every year once the headers are harmonised.
    score_columns = [
        'Ladder score', 'upperwhisker', 'lowerwhisker',
        'Explained by: Log GDP per capita', 'Explained by: Social support',
        'Explained by: Healthy life expectancy',
        'Explained by: Freedom to make life choices', 'Explained by: Generosity',
        'Explained by: Perceptions of corruption', 'Dystopia + residual',
    ]
    id_columns = ['Country name']

    if year in (2023, 2024):
        pass  # headers already use the common names
    elif year == 2022:
        frame = frame.rename(columns={
            'Country': 'Country name',
            'Happiness score': 'Ladder score',
            'Whisker-high': 'upperwhisker',
            'Whisker-low': 'lowerwhisker',
            'Explained by: GDP per capita': 'Explained by: Log GDP per capita',
            'Dystopia (1.83) + residual': 'Dystopia + residual',
        })
    else:  # For 2020 and 2021 where region data is available
        frame = frame.rename(columns={'Regional indicator': 'Sub-Region'})
        id_columns = ['Country name', 'Sub-Region']

    # Select in the fixed output order, then append the year as the last column.
    return frame[id_columns + score_columns].assign(Year=year)

In [None]:
# Process all files and concatenate into a single DataFrame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# yearly frame keeps the default 0-based labels it got from read_csv, so the same
# label would occur once per year and index-aligned operations become fragile.
WHR_df = pd.concat(
    [process_file(year, path) for year, path in files.items()],
    ignore_index=True,
)

Processing file for year: 2024 from /content/WHR2024.csv
Processing file for year: 2023 from /content/WHR2023.csv
Processing file for year: 2022 from /content/WHR2022.csv
Processing file for year: 2021 from /content/WHR2021.csv
Processing file for year: 2020 from /content/WHR2020.csv


In [None]:
# Create a country to region mapping from available years.
# Country names are compared with any trailing '*' removed: a later cell strips
# those markers from the names, so a starred and an unstarred spelling denote the
# same country. Matching on the raw names would leave starred rows without a
# Sub-Region (and therefore without a Region).
country_keys = WHR_df['Country name'].str.rstrip('*')
has_sub_region = WHR_df['Sub-Region'].notna()

# The first non-null Sub-Region encountered for a country wins
# (drop_duplicates keeps the first occurrence by default).
country_to_sub_region = (
    pd.DataFrame({
        'country': country_keys[has_sub_region].to_numpy(),
        'sub_region': WHR_df.loc[has_sub_region, 'Sub-Region'].to_numpy(),
    })
    .drop_duplicates(subset=['country'])
    .set_index('country')['sub_region']
    .to_dict()
)

# Fill missing region values based on the mapping; rows whose country is not in
# the mapping keep whatever Sub-Region they already had.
WHR_df['Sub-Region'] = country_keys.map(country_to_sub_region).fillna(WHR_df['Sub-Region'])

In [None]:
# Sub-Region -> Region lookup, written grouped by target Region and flattened
# into a plain {sub_region: region} dict.
region_mapping = {
    sub_region: region
    for region, sub_regions in (
        ('Europe', ('Western Europe', 'Central and Eastern Europe',
                    'Commonwealth of Independent States')),
        ('Middle East', ('Middle East and North Africa',)),
        ('South America', ('Latin America and Caribbean',)),
        ('Asia', ('East Asia', 'Southeast Asia', 'South Asia')),
        ('Africa', ('Sub-Saharan Africa',)),
        ('North America', ('North America and ANZ',)),  # General case
    )
    for sub_region in sub_regions
}

In [None]:
# Rows that get the special-case Region instead of the general mapping
is_australia_or_nz = WHR_df['Country name'].isin(['Australia', 'New Zealand'])

# General Sub-Region -> Region mapping, with Australia and New Zealand overridden
# to 'Oceania'. The mask is passed as a plain array so it is applied by position.
WHR_df['Region'] = (
    WHR_df['Sub-Region']
    .map(region_mapping)
    .mask(is_australia_or_nz.to_numpy(), 'Oceania')
)

In [None]:
# List of columns for which rankings are needed
columns_to_rank = [
    'Ladder score', 'Explained by: Log GDP per capita', 'Explained by: Social support',
    'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices',
    'Explained by: Generosity', 'Explained by: Perceptions of corruption',
    'Dystopia + residual'
]

# Function to convert columns to numeric and add ranking columns
def add_ranking_columns(df, columns=None):
    """Return a copy of ``df`` with numeric score columns and per-year ranks.

    For every column in ``columns`` the values are converted with
    ``pd.to_numeric(errors='coerce')`` (anything non-numeric becomes NaN) and a
    companion column ``'Rank <column>'`` is added. Ranks are computed inside
    each 'Year' group, highest value = rank 1, ties broken with
    ``method='first'``. NaN values receive a NaN rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Year' column and every column to be ranked.
        It is not modified; the work is done on a copy.
    columns : list of str, optional
        Columns to convert and rank. Defaults to the module-level
        ``columns_to_rank``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the converted columns and the added rank columns.
    """
    if columns is None:
        columns = columns_to_rank

    # Work on a copy so the caller's frame is left untouched and the call can be
    # repeated on the same input with the same result.
    ranked = df.copy()
    for col in columns:
        # Ensure the column is numeric, converting non-numeric to NaN
        ranked[col] = pd.to_numeric(ranked[col], errors='coerce')

        # Rank within each year; no sorting is involved, row order is preserved
        ranked[f'Rank {col}'] = ranked.groupby('Year')[col].rank(ascending=False, method='first')
    return ranked

# Add the numeric conversions and per-year rank columns to the combined frame
WHR_df = WHR_df.pipe(add_ranking_columns)

In [None]:
# The six "Explained by" factor columns, in display order
factor_columns = [
    'Explained by: Log GDP per capita', 'Explained by: Social support',
    'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices',
    'Explained by: Generosity', 'Explained by: Perceptions of corruption',
]

# Define the new order of the columns: identifiers and ladder score first, then
# the factor values, then the matching rank columns.
new_column_order = (
    ['Year', 'Region', 'Sub-Region', 'Country name', 'Ladder score', 'Rank Ladder score',
     'upperwhisker', 'lowerwhisker']
    + factor_columns
    + ['Dystopia + residual']
    + [f'Rank {factor}' for factor in factor_columns]
    + ['Rank Dystopia + residual']
)

# Reorder the DataFrame, then shorten 'Rank Explained by: X' headers to 'Rank X'
WHR_df = WHR_df[new_column_order].rename(
    columns=lambda name: name.replace('Rank Explained by: ', 'Rank ')
)

# Verify the changes by printing the DataFrame columns
print(WHR_df.columns)

Index(['Year', 'Region', 'Sub-Region', 'Country name', 'Ladder score',
       'Rank Ladder score', 'upperwhisker', 'lowerwhisker',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual', 'Rank Log GDP per capita', 'Rank Social support',
       'Rank Healthy life expectancy', 'Rank Freedom to make life choices',
       'Rank Generosity', 'Rank Perceptions of corruption',
       'Rank Dystopia + residual'],
      dtype='object')


In [None]:
# Use the capitalised header 'Country Name' for the country column
WHR_df = WHR_df.rename({'Country name': 'Country Name'}, axis='columns')

In [None]:
# Remove any trailing '*' characters from the country names
cleaned_names = WHR_df['Country Name'].str.rstrip('*')
WHR_df = WHR_df.assign(**{'Country Name': cleaned_names})

In [None]:
# Keep every row except those whose country name is exactly 'xx'
# NOTE(review): 'xx' looks like a placeholder row in one of the source files; confirm.
is_real_country = WHR_df['Country Name'].ne('xx')
WHR_df = WHR_df.loc[is_real_country]

In [None]:
# Write the combined table to disk without the index column
output_csv = "WHR_2020_2024.csv"
WHR_df.to_csv(output_csv, index=False)