In [1]:
import pandas as pd
import numpy as np
import pycountry
import os

In [2]:
# Import dataframe column names and helper functions
# `%run -i` executes each script inside this notebook's own namespace, so the
# names the scripts define become globals for the cells below.
# NOTE(review): the column-name constants (COUNTRY, YEAR, ...), the indicator
# lists and get3countryCode() used later are not defined in this notebook --
# presumably they come from these two scripts; confirm.
%run -i columns.py
%run -i helper_functions.py

**Generation of a Merged World Happiness Dataset (2015-2019)**

In [3]:
# The yearly report files label the same measures with different headers.
# Each mapping below translates one family of raw headers into the shared
# column-name constants. Key order matters: the keys are also used to select
# the columns from the raw files, so they fix the resulting column order.

# Headers used by the 2015 and 2016 files
column_names_2015_2016 = {
    'Country': COUNTRY,
    'Year': YEAR,
    'Happiness Rank': HAPPINESS_RANK,
    'Happiness Score': HAPPINESS_SCORE,
    'Economy (GDP per Capita)': GDP_PER_CAPITA,
    'Family': SOCIAL_SUPPORT,
    'Health (Life Expectancy)': HEALTH,
    'Freedom': FREEDOM,
    'Trust (Government Corruption)': CORRUPTION,
    'Generosity': GENEROSITY,
    'Dystopia Residual': DYSTOPIA_RESIDUAL,
}

# Headers used by the 2017 file (dots where the earlier files have spaces/brackets)
column_names_2017 = {
    'Country': COUNTRY,
    'Year': YEAR,
    'Happiness.Rank': HAPPINESS_RANK,
    'Happiness.Score': HAPPINESS_SCORE,
    'Economy..GDP.per.Capita.': GDP_PER_CAPITA,
    'Family': SOCIAL_SUPPORT,
    'Health..Life.Expectancy.': HEALTH,
    'Freedom': FREEDOM,
    'Trust..Government.Corruption.': CORRUPTION,
    'Generosity': GENEROSITY,
    'Dystopia.Residual': DYSTOPIA_RESIDUAL,
}

# Headers used by the 2018 and 2019 files; this mapping has no dystopia
# residual entry
column_names_2018_2019 = {
    'Country or region': COUNTRY,
    'Year': YEAR,
    'Overall rank': HAPPINESS_RANK,
    'Score': HAPPINESS_SCORE,
    'GDP per capita': GDP_PER_CAPITA,
    'Social support': SOCIAL_SUPPORT,
    'Healthy life expectancy': HEALTH,
    'Freedom to make life choices': FREEDOM,
    'Perceptions of corruption': CORRUPTION,
    'Generosity': GENEROSITY,
}

In [4]:
# Header mapping that matches each report year's CSV schema
column_names_by_year = {
    2015: column_names_2015_2016,
    2016: column_names_2015_2016,
    2017: column_names_2017,
    2018: column_names_2018_2019,
    2019: column_names_2018_2019,
}

# Columns subtracted (in this order) from the happiness score to derive the
# dystopia residual for the years whose files do not ship that column
score_components = [GDP_PER_CAPITA, SOCIAL_SUPPORT, HEALTH, FREEDOM, CORRUPTION, GENEROSITY]

# Load the World Happiness Reports for 2015-2019 and bring each one to the
# shared column layout
world_happiness_datasets = []

for year, renames in column_names_by_year.items():
    raw = pd.read_csv(f'./dataset/world_happiness/overall/{year}.csv')

    # The files carry no year column; its position is irrelevant here because
    # the selection below fixes the column order
    raw['Year'] = year

    # Keep only the mapped columns (in mapping order) and give them the shared names
    report = raw[list(renames)].rename(columns=renames)

    # Mappings without a dystopia residual entry (2018 and 2019) get the
    # column computed as score minus every component
    if DYSTOPIA_RESIDUAL not in renames.values():
        residual = report[HAPPINESS_SCORE]
        for component in score_components:
            residual = residual - report[component]
        report[DYSTOPIA_RESIDUAL] = residual

    world_happiness_datasets.append(report)

# Combine all years, order the rows by country then year, and drop exact duplicate rows
merged_world_happiness_dataset = (
    pd.concat(world_happiness_datasets, ignore_index=True)
    .sort_values([COUNTRY, YEAR])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [5]:
# Add three-letter country codes derived from the country names
merged_world_happiness_dataset['Country code'] = get3countryCode(merged_world_happiness_dataset['Country'])

# Drop every row that has a missing value in any column
merged_world_happiness_dataset = merged_world_happiness_dataset.dropna()

# Save the dataset in a CSV file. The output directory is created here because
# this is the first write into it; without this the cell fails on a fresh
# checkout where 'tmp' does not exist yet.
os.makedirs('tmp', exist_ok=True)
merged_world_happiness_dataset.to_csv('tmp/wh_all.csv', index=False)

**Removal of Developed Countries from Dataset**

In [6]:
# List of developed countries (IMF classification, as of 2018)
# https://www.imf.org/~/media/Files/Publications/WEO/2018/October/English/main-report/Text.ashx
developed_countries_csv = './dataset/developed_countries.csv'
developed_countries = pd.read_csv(developed_countries_csv)

# Derive a country code from each country name so the list can be matched by code
developed_country_codes = get3countryCode(developed_countries['Country'])
developed_countries['Country code'] = developed_country_codes

In [7]:
# Keep only the rows whose country code does not appear in the developed-country list
developed_codes = list(developed_countries['Country code'])
is_developed = merged_world_happiness_dataset['Country code'].isin(developed_codes)
developing_countries_df = merged_world_happiness_dataset[~is_developed]

**Generation of a Simplified SDG Dataset with Positive and Negative Indicators of Urbanization Only**

In [8]:
# Columns kept from the SDG file: the two identifier columns followed by one
# column per year from 2014 through 2018 (year headers are strings)

SDG_COLUMNS = ['Country Code', 'Indicator Name'] + [str(year) for year in range(2014, 2019)]

In [9]:
# Read the SDG file and keep only the identifier and year columns listed in SDG_COLUMNS
sdg_csv_path = './dataset/world_happiness/sustainable_develop_goals/sdg-csv-zip-7-mb-/SDGData.csv'
df = pd.read_csv(sdg_csv_path)[SDG_COLUMNS]

In [10]:
# Keep only the rows for the chosen urbanization and anti-urbanization indicators
selected_indicators = URBANIZATION_INDICATORS + ANTI_URBANIZATION_INDICATORS
is_selected_indicator = df['Indicator Name'].isin(selected_indicators)
sdg_dataset = df[is_selected_indicator]

In [11]:
# Reshape from one column per year to one row per (country, indicator, year).
# The year headers have no textual prefix, so the stub name and the separator
# are both empty; the resulting value column is therefore named '' until it is
# renamed below.
sdg_long = pd.wide_to_long(
    sdg_dataset,
    stubnames=[''],
    i=['Country Code', 'Indicator Name'],
    j='Year',
    sep='',
)

# Discard missing observations, turn the index levels back into columns and
# give the value column a readable name
sdg_dataset = (
    sdg_long
    .dropna()
    .reset_index()
    .rename(columns={'': 'Indicator Value'})
)

In [12]:
# Spread the indicators into columns: one row per (country code, year), one
# column per indicator name
indicator_table = sdg_dataset.pivot_table(
    index=['Country Code', 'Year'],
    columns=['Indicator Name'],
    values='Indicator Value',
)

# Restore the keys as ordinary columns and align the code column's name with
# the 'Country code' spelling used by the happiness dataset
tidied_sdg_df = indicator_table.reset_index().rename(columns={'Country Code': 'Country code'})

**Merging the World Happiness Report and Sustainable Development Goals Datasets**

In [13]:
# Attach the SDG indicators to the developing-country happiness rows, matching
# on country code and year
happiness_with_sdg = developing_countries_df.merge(tidied_sdg_df, on=['Country code', 'Year'])

# Move the country code to the second column position
happiness_with_sdg.insert(1, 'Country code', happiness_with_sdg.pop('Country code'))
developing_countries_df = happiness_with_sdg

# Display the final dataset
developing_countries_df

Unnamed: 0,Country,Country code,Year,Happiness rank,Happiness score,GDP per capita,Social support,Life expectancy,Freedom,Perceptions of corruption,...,Employment in industry (% of total employment) (modeled ILO estimate),Employment in services (% of total employment) (modeled ILO estimate),Forest area (% of land area),Individuals using the Internet (% of population),"Industry (including construction), value added per worker (constant 2010 US$)","Manufacturing, value added (% of GDP)",Medium and high-tech industry (% manufacturing value added),"Railways, goods transported (million ton-km)","Railways, passengers carried (million passenger-km)",Urban population (% of total)
0,Afghanistan,AFG,2015,153,3.575,0.319820,0.302850,0.303350,0.234140,0.097190,...,17.646999,43.674999,2.067825,8.260000,2079.555916,11.420006,9.507434,,,24.803
1,Afghanistan,AFG,2016,154,3.360,0.382270,0.110370,0.173440,0.164300,0.071120,...,17.424999,43.756001,2.067825,10.595726,1999.566153,11.370465,,,,25.020
2,Afghanistan,AFG,2017,141,3.794,0.401477,0.581543,0.180747,0.106180,0.061158,...,17.457001,43.768002,,11.447688,1923.393447,11.102526,,,,25.250
3,Afghanistan,AFG,2018,145,3.632,0.332000,0.537000,0.255000,0.085000,0.036000,...,17.596001,43.816002,,,,,,,,
4,Albania,ALB,2015,95,4.959,0.878670,0.804340,0.813250,0.357330,0.064130,...,18.617001,40.020000,28.156934,63.252933,13648.254549,5.671519,6.690696,,,57.434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,Zambia,ZMB,2018,125,4.377,0.562000,1.047000,0.295000,0.503000,0.082000,...,10.683000,35.401001,,,,,,,,
453,Zimbabwe,ZWE,2015,115,4.610,0.271000,1.032760,0.334750,0.258610,0.080790,...,7.290000,25.589001,36.350006,22.742818,5962.002633,11.888599,,,,32.385
454,Zimbabwe,ZWE,2016,131,4.193,0.350410,0.714780,0.159500,0.254290,0.085820,...,7.218000,25.590000,35.542457,23.119989,5930.642813,11.596020,,,,32.296
455,Zimbabwe,ZWE,2017,138,3.875,0.375847,1.083096,0.196764,0.336384,0.095375,...,7.254000,25.673000,,27.055488,5866.001706,11.017009,,,,32.237


In [14]:
# Save the final dataset as a CSV file.
# exist_ok=True makes the directory creation a no-op when 'tmp' is already
# there, replacing the separate existence check.
os.makedirs('tmp', exist_ok=True)

developing_countries_df.to_csv('tmp/developing_countries.csv', index=False)