In [1]:
import pandas as pd
import numpy as np
import pycountry

**Generation of a Merged World Happiness Dataset (2015-2019)**

In [61]:
# Build one tidy table from the yearly World Happiness Report CSV files
# (2015-2019). The report's column names changed twice over that period, so
# each year is paired with the mapping that translates its source column names
# into one shared set of names.

# 2015 and 2016 files: source column name -> common column name
column_names_2015_2016 = {
    'Country': 'Country',
    'Year': 'Year',
    'Happiness Rank': 'Happiness rank',
    'Happiness Score': 'Happiness score',
    'Economy (GDP per Capita)': 'GDP per capita',
    'Family': 'Social support',
    'Health (Life Expectancy)': 'Life expectancy',
    'Freedom': 'Freedom',
    'Trust (Government Corruption)': 'Perceptions of corruption',
    'Generosity': 'Generosity'
}

# 2017 file: same columns as above, but spelled with dots instead of spaces
column_names_2017 = {
    'Country': 'Country',
    'Year': 'Year',
    'Happiness.Rank': 'Happiness rank',
    'Happiness.Score': 'Happiness score',
    'Economy..GDP.per.Capita.': 'GDP per capita',
    'Family': 'Social support',
    'Health..Life.Expectancy.': 'Life expectancy',
    'Freedom': 'Freedom',
    'Trust..Government.Corruption.': 'Perceptions of corruption',
    'Generosity': 'Generosity'
}

# 2018 and 2019 files
column_names_2018_2019 = {
    'Country or region': 'Country',
    'Year': 'Year',
    'Overall rank': 'Happiness rank',
    'Score': 'Happiness score',
    'GDP per capita': 'GDP per capita',
    'Social support': 'Social support',
    'Healthy life expectancy': 'Life expectancy',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Perceptions of corruption',
    'Generosity': 'Generosity'
}

# Which mapping applies to which report year. Iteration order is the load
# order, so the list below ends up ordered 2015 .. 2019.
column_names_by_year = {
    2015: column_names_2015_2016,
    2016: column_names_2015_2016,
    2017: column_names_2017,
    2018: column_names_2018_2019,
    2019: column_names_2018_2019,
}

world_happiness_datasets = list()

for year, column_names in column_names_by_year.items():
    df = pd.read_csv(f'./dataset/world_happiness/overall/{year}.csv')

    # The files carry no year of their own; tag every row with the report year.
    # Where the column lands does not matter: the selection below puts the
    # columns into the mapping's order.
    df['Year'] = year

    # Keep only the columns shared by all years, then give them common names
    df = df[list(column_names.keys())].rename(columns=column_names)

    world_happiness_datasets.append(df)

# Combine the happiness data for all years into a single dataset
merged_world_happiness_dataset = pd.concat(world_happiness_datasets, ignore_index=True)

# Sort entries by country names and years in ascending order, and drop fully duplicated rows
merged_world_happiness_dataset = (
    merged_world_happiness_dataset
    .sort_values(['Country', 'Year'])
    .drop_duplicates()
    .reset_index(drop=True)
)

merged_world_happiness_dataset

Unnamed: 0,Country,Year,Happiness rank,Happiness score,GDP per capita,Social support,Life expectancy,Freedom,Perceptions of corruption,Generosity
0,Afghanistan,2015,153,3.575,0.319820,0.302850,0.303350,0.234140,0.097190,0.365100
1,Afghanistan,2016,154,3.360,0.382270,0.110370,0.173440,0.164300,0.071120,0.312680
2,Afghanistan,2017,141,3.794,0.401477,0.581543,0.180747,0.106180,0.061158,0.311871
3,Afghanistan,2018,145,3.632,0.332000,0.537000,0.255000,0.085000,0.036000,0.191000
4,Afghanistan,2019,154,3.203,0.350000,0.517000,0.361000,0.000000,0.025000,0.158000
...,...,...,...,...,...,...,...,...,...,...
777,Zimbabwe,2015,115,4.610,0.271000,1.032760,0.334750,0.258610,0.080790,0.189870
778,Zimbabwe,2016,131,4.193,0.350410,0.714780,0.159500,0.254290,0.085820,0.185030
779,Zimbabwe,2017,138,3.875,0.375847,1.083096,0.196764,0.336384,0.095375,0.189143
780,Zimbabwe,2018,144,3.692,0.357000,1.094000,0.248000,0.406000,0.099000,0.132000


In [49]:
def get3countryCode(names):
    """Resolve country names to three-letter (alpha-3) country codes.

    Each distinct name is resolved once with
    ``pycountry.countries.search_fuzzy`` and the ``alpha_3`` attribute of the
    first match is used. The outcome, including a failed lookup, is cached for
    the rest of the call, so a repeated name costs a single search.

    Parameters
    ----------
    names : iterable of str
        Country names, e.g. a DataFrame column or a plain list.

    Returns
    -------
    pandas.Series
        One entry per input name, in input order: the resolved code, or
        ``None`` when the entry is not a string or the search raises
        ``LookupError``. If ``names`` is a ``pandas.Series`` the result carries
        the same index, so it lines up row-for-row when assigned back as a
        column; otherwise it has the default positional index.
    """
    code_map = dict()

    res = []
    for name in names:
        # Entries that are not strings (e.g. NaN for a missing name) are not
        # country names; report them as unresolved instead of searching.
        if not isinstance(name, str):
            res.append(None)
            continue

        if name not in code_map:
            try:
                code_map[name] = pycountry.countries.search_fuzzy(name)[0].alpha_3
            except LookupError:
                # Remember failures too, so an unresolvable name that occurs
                # many times is only searched for once.
                code_map[name] = None

        res.append(code_map[name])

    # Reuse the caller's index when there is one: column assignment in pandas
    # aligns on index labels, not on position.
    index = names.index if isinstance(names, pd.Series) else None
    return pd.Series(res, index=index)

In [64]:
# Attach a three-letter country code to every row, then discard incomplete rows.
# Note: dropna() without `subset` removes a row when ANY of its columns is
# missing, so this drops rows whose code lookup returned None as well as rows
# with a gap in any other column.
country_codes = get3countryCode(merged_world_happiness_dataset['Country'])

merged_world_happiness_dataset = (
    merged_world_happiness_dataset
    .assign(**{'Country code': country_codes})
    .dropna()
)
merged_world_happiness_dataset

Unnamed: 0,Country,Year,Happiness rank,Happiness score,GDP per capita,Social support,Life expectancy,Freedom,Perceptions of corruption,Generosity,Country code
0,Afghanistan,2015,153,3.575,0.319820,0.302850,0.303350,0.234140,0.097190,0.365100,AFG
1,Afghanistan,2016,154,3.360,0.382270,0.110370,0.173440,0.164300,0.071120,0.312680,AFG
2,Afghanistan,2017,141,3.794,0.401477,0.581543,0.180747,0.106180,0.061158,0.311871,AFG
3,Afghanistan,2018,145,3.632,0.332000,0.537000,0.255000,0.085000,0.036000,0.191000,AFG
4,Afghanistan,2019,154,3.203,0.350000,0.517000,0.361000,0.000000,0.025000,0.158000,AFG
...,...,...,...,...,...,...,...,...,...,...,...
777,Zimbabwe,2015,115,4.610,0.271000,1.032760,0.334750,0.258610,0.080790,0.189870,ZWE
778,Zimbabwe,2016,131,4.193,0.350410,0.714780,0.159500,0.254290,0.085820,0.185030,ZWE
779,Zimbabwe,2017,138,3.875,0.375847,1.083096,0.196764,0.336384,0.095375,0.189143,ZWE
780,Zimbabwe,2018,144,3.692,0.357000,1.094000,0.248000,0.406000,0.099000,0.132000,ZWE


**Removal of Developed Countries from Dataset**

In [71]:
# List of developed countries (as of 2018, according to the IMF); source:
# https://www.imf.org/~/media/Files/Publications/WEO/2018/October/English/main-report/Text.ashx
# Each name is given a three-letter code so the list can be matched against
# the 'Country code' column of the happiness data.
developed_countries = pd.read_csv('./dataset/developed_countries.csv').assign(
    **{'Country code': lambda frame: get3countryCode(frame['Country'])}
)

developed_countries

Unnamed: 0,Country,Country code
0,Austria,AUT
1,Greece,GRC
2,Netherlands,NLD
3,Belgium,BEL
4,Ireland,IRL
5,Portugal,PRT
6,Cyprus,CYP
7,Italy,ITA
8,Slovak Republic,SVK
9,Estonia,EST


In [75]:
# Keep only the rows whose country code does not appear in the developed list.
# NOTE(review): a developed country whose name could not be resolved has a
# missing code and therefore is not filtered out here - confirm that every
# entry in developed_countries received a code.
developed_codes = developed_countries['Country code'].tolist()
is_developed = merged_world_happiness_dataset['Country code'].isin(developed_codes)

developing_countries = merged_world_happiness_dataset.loc[~is_developed]

developing_countries

Unnamed: 0,Country,Year,Happiness rank,Happiness score,GDP per capita,Social support,Life expectancy,Freedom,Perceptions of corruption,Generosity,Country code
0,Afghanistan,2015,153,3.575,0.319820,0.302850,0.303350,0.234140,0.097190,0.365100,AFG
1,Afghanistan,2016,154,3.360,0.382270,0.110370,0.173440,0.164300,0.071120,0.312680,AFG
2,Afghanistan,2017,141,3.794,0.401477,0.581543,0.180747,0.106180,0.061158,0.311871,AFG
3,Afghanistan,2018,145,3.632,0.332000,0.537000,0.255000,0.085000,0.036000,0.191000,AFG
4,Afghanistan,2019,154,3.203,0.350000,0.517000,0.361000,0.000000,0.025000,0.158000,AFG
...,...,...,...,...,...,...,...,...,...,...,...
777,Zimbabwe,2015,115,4.610,0.271000,1.032760,0.334750,0.258610,0.080790,0.189870,ZWE
778,Zimbabwe,2016,131,4.193,0.350410,0.714780,0.159500,0.254290,0.085820,0.185030,ZWE
779,Zimbabwe,2017,138,3.875,0.375847,1.083096,0.196764,0.336384,0.095375,0.189143,ZWE
780,Zimbabwe,2018,144,3.692,0.357000,1.094000,0.248000,0.406000,0.099000,0.132000,ZWE
