# Happiness Index Data Cleaning (2015-2019)

This notebook cleans and prepares the Happiness Index data from 2015 to 2019 for analysis. The dataset provides country-level happiness scores and associated factors, which will be used to explore potential relationships between happiness, air quality, asthma prevalence, and other quality of life metrics. The cleaned data will later be integrated with additional datasets to enhance the analysis of global health and environmental factors.


In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [5]:
# File paths for Happiness Index data (2015-2019)
file_paths = {
    "2015": "Raw_Data/Happiness 2015.csv",
    "2016": "Raw_Data/Happiness 2016.csv",
    "2017": "Raw_Data/Happiness 2017.csv",
    "2018": "Raw_Data/Happiness 2018.csv",
    "2019": "Raw_Data/Happiness 2019.csv"
}

# Load the data for each year
happiness_data = {year: pd.read_csv(path) for year, path in file_paths.items()}

# Columns to drop per year to retain only Country, Year, Rank, and Score
columns_to_drop = {
    "2015": ['Region', 'Standard Error', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 
             'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual'],
    "2016": ['Region', 'Lower Confidence Interval', 'Upper Confidence Interval', 'Economy (GDP per Capita)', 
             'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual'],
    "2017": ['Whisker.high', 'Whisker.low', 'Economy..GDP.per.Capita.', 'Family', 'Health..Life.Expectancy.', 
             'Freedom', 'Generosity', 'Trust..Government.Corruption.', 'Dystopia.Residual'],
    "2018": ['GDP per capita', 'Social support', 'Healthy life expectancy', 
             'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'],
    "2019": ['GDP per capita', 'Social support', 'Healthy life expectancy', 
             'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']
}

# Clean each dataset: drop irrelevant columns and standardize the 'Country' column
for year, df in happiness_data.items():
    df.drop(columns=columns_to_drop[year], inplace=True)
    if 'Country or region' in df.columns:
        df.rename(columns={'Country or region': 'Country'}, inplace=True)

# Rename columns and merge datasets sequentially by year
happiness_data["2015"].rename(columns={'Happiness Rank': 'Rank 2015', 'Happiness Score': 'Score 2015'}, inplace=True)
merged_df = happiness_data["2015"]

# Sequentially merge each year's data and rename columns
for year in ["2016", "2017", "2018", "2019"]:
    rank_col, score_col = f'Rank {year}', f'Score {year}'
    if year in ["2017", "2018", "2019"]:
        happiness_data[year].rename(columns={'Happiness.Rank': rank_col, 'Happiness.Score': score_col, 'Overall rank': rank_col, 'Score': score_col}, inplace=True)
    merged_df = pd.merge(merged_df, happiness_data[year], how='left', on='Country')

# Display the first few rows of the final merged dataset
happiness_index = merged_df
happiness_index.head()


Unnamed: 0,Country,Rank 2015,Score 2015,Happiness Rank,Happiness Score,Rank 2017,Score 2017,Rank 2018,Score 2018,Rank 2019,Score 2019
0,Switzerland,1,7.587,2.0,7.509,4.0,7.494,5.0,7.487,6.0,7.48
1,Iceland,2,7.561,3.0,7.501,3.0,7.504,4.0,7.495,4.0,7.494
2,Denmark,3,7.527,1.0,7.526,2.0,7.522,3.0,7.555,2.0,7.6
3,Norway,4,7.522,4.0,7.498,1.0,7.537,2.0,7.594,3.0,7.554
4,Canada,5,7.427,6.0,7.404,7.0,7.316,7.0,7.328,9.0,7.278


In [6]:
happiness_index.rename(columns={ 'Happiness Rank': 'Rank 2016', 'Happiness Score': 'Score 2016'}, inplace=True)
happiness_index.head()

Unnamed: 0,Country,Rank 2015,Score 2015,Rank 2016,Score 2016,Rank 2017,Score 2017,Rank 2018,Score 2018,Rank 2019,Score 2019
0,Switzerland,1,7.587,2.0,7.509,4.0,7.494,5.0,7.487,6.0,7.48
1,Iceland,2,7.561,3.0,7.501,3.0,7.504,4.0,7.495,4.0,7.494
2,Denmark,3,7.527,1.0,7.526,2.0,7.522,3.0,7.555,2.0,7.6
3,Norway,4,7.522,4.0,7.498,1.0,7.537,2.0,7.594,3.0,7.554
4,Canada,5,7.427,6.0,7.404,7.0,7.316,7.0,7.328,9.0,7.278


In [8]:
# Replace any NaN values with 0 for consistency
happiness_index.fillna(0, inplace=True)

In [10]:
# Convert rank columns to integer type for each year
for year in ['2016', '2017', '2018', '2019']:
    happiness_index[f'Rank {year}'] = happiness_index[f'Rank {year}'].astype(int)

# Preview the updated DataFrame
happiness_index.head()

Unnamed: 0,Country,Rank 2015,Score 2015,Rank 2016,Score 2016,Rank 2017,Score 2017,Rank 2018,Score 2018,Rank 2019,Score 2019
0,Switzerland,1,7.587,2,7.509,4,7.494,5,7.487,6,7.48
1,Iceland,2,7.561,3,7.501,3,7.504,4,7.495,4,7.494
2,Denmark,3,7.527,1,7.526,2,7.522,3,7.555,2,7.6
3,Norway,4,7.522,4,7.498,1,7.537,2,7.594,3,7.554
4,Canada,5,7.427,6,7.404,7,7.316,7,7.328,9,7.278


In [11]:
# Check unique values in 'Score 2017' to identify any anomalies
unique_values = happiness_index['Score 2017'].unique()
print(unique_values)

[7.49399996 7.50400019 7.52199984 7.53700018 7.31599998 7.46899986
 7.37699986 7.28399992 7.31400013 7.21299982 7.079      7.00600004
 6.57800007 6.99300003 6.63500023 6.86299992 6.97700024 6.89099979
 6.64799976 6.71400023 0.         5.25       6.57200003 6.45200014
 6.95100021 6.65199995 6.375      6.44199991 6.59899998 6.60900021
 6.454      6.35699987 6.42399979 6.34399986 6.40299988 6.52699995
 6.10500002 6.16800022 6.00299978 5.97100019 6.09800005 5.92000008
 5.83799982 6.0079999  6.08699989 5.96400023 5.82299995 5.49300003
 5.81899977 5.7579999  5.90199995 6.0710001  5.71500015 5.56899977
 5.97300005 6.08400011 5.29300022 5.5250001  5.96299982 5.31099987
 5.80999994 5.62099981 5.87200022 5.27899981 5.82200003 5.62900019
 5.61100006 5.26200008 5.07399988 5.5        5.00400019 5.01100016
 5.23400021 5.26900005 5.33599997 5.23699999 5.27299976 4.51399994
 5.82499981 5.39499998 5.19500017 5.8499999  5.42999983 5.23500013
 5.17500019 4.55000019 4.64400005 5.18200016 3.80800009 5.2300

In [12]:
# Round 'Score 2017' to three decimal places for uniformity
happiness_index['Score 2017'] = happiness_index['Score 2017'].round(3)

In [13]:
# Export the cleaned dataset to CSV
output_file_path = 'Cleaned_Data/cleaned_happiness_index.csv'
happiness_index.to_csv(output_file_path, index=False)