In [2]:
import pandas as pd

## Import datasets from 2015 to 2019

In [4]:
# Load the raw 2015 table from the raw-data folder.
df15 = pd.read_csv(filepath_or_buffer='../data/raw/2015.csv')

In [5]:
# Load the raw 2016 table from the raw-data folder.
df16 = pd.read_csv(filepath_or_buffer='../data/raw/2016.csv')

In [6]:
# Load the raw 2017 table from the raw-data folder.
df17 = pd.read_csv(filepath_or_buffer='../data/raw/2017.csv')

In [7]:
# Load the raw 2018 table from the raw-data folder.
df18 = pd.read_csv(filepath_or_buffer='../data/raw/2018.csv')

In [8]:
# Load the raw 2019 table from the raw-data folder.
df19 = pd.read_csv(filepath_or_buffer='../data/raw/2019.csv')

In [9]:
# Standardize column names
def standardize_col_names(df):
    """Normalize the column labels of ``df`` in place and return ``df``.

    Each label is stripped of surrounding whitespace, lower-cased, and has
    every space and every literal dot replaced by an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten with the cleaned
        labels.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be chained.
    """
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        # regex=False is spelled out on purpose: as a regular expression '.'
        # matches every character, and the default of `regex` differs between
        # pandas versions. The dot must always be treated as a literal.
        .str.replace(' ', '_', regex=False)
        .str.replace('.', '_', regex=False)
    )
    return df

# Check unique values in each column
def check_unique_vals(df):
    """Print, for every column of ``df``, a header line followed by the
    array of distinct values in that column and one blank line.

    Nothing is returned; the function only writes to standard output.
    """
    for name in df.columns:
        print(f"Unique values in '{name}':")
        # end="\n\n" emits the values line plus the blank separator line.
        print(df[name].unique(), end="\n\n")

## 2015 dataset

In [11]:
# Clean the 2015 frame: normalize the labels, drop the columns that are not
# used, and map the remaining labels onto the shared short names.
# NOTE(review): 'family' is dropped and this year ends up with no 'support'
# column. If 'family' is the same measure that 2018/2019 publish as
# 'social_support', rename it to 'support' instead of dropping it — confirm
# against the data source.
df15 = (
    standardize_col_names(df15)
    # errors='ignore' keeps the cell re-runnable: on a second run the columns
    # are already gone and a plain drop would raise KeyError. Trade-off: a
    # column missing from the raw file is skipped silently as well.
    .drop(columns=['standard_error', 'family', 'dystopia_residual'], errors='ignore')
    .rename(columns={
        'happiness_rank': 'rank',
        'happiness_score': 'score',
        'economy_(gdp_per_capita)': 'gdp',
        'social_support': 'support',
        'health_(life_expectancy)': 'life_expectancy',
        'freedom_to_make_life_choices': 'freedom',
        'trust_(government_corruption)': 'corruption',
    })
)
# check datatypes
print('\ndata types\n')
print(df15.dtypes)


data types

country             object
region              object
rank                 int64
score              float64
gdp                float64
life_expectancy    float64
freedom            float64
corruption         float64
generosity         float64
dtype: object


## 2016 dataset

In [13]:
# Clean the 2016 frame: normalize the labels, drop the columns that are not
# used, and map the remaining labels onto the shared short names.
# NOTE(review): 'family' is dropped and this year ends up with no 'support'
# column. If 'family' is the same measure that 2018/2019 publish as
# 'social_support', rename it to 'support' instead of dropping it — confirm
# against the data source.
df16 = (
    standardize_col_names(df16)
    # errors='ignore' keeps the cell re-runnable: on a second run the columns
    # are already gone and a plain drop would raise KeyError. Trade-off: a
    # column missing from the raw file is skipped silently as well.
    .drop(
        columns=['lower_confidence_interval', 'upper_confidence_interval', 'family', 'dystopia_residual'],
        errors='ignore',
    )
    .rename(columns={
        'happiness_rank': 'rank',
        'happiness_score': 'score',
        'economy_(gdp_per_capita)': 'gdp',
        'social_support': 'support',
        'health_(life_expectancy)': 'life_expectancy',
        'freedom_to_make_life_choices': 'freedom',
        'trust_(government_corruption)': 'corruption',
    })
)
# check datatypes
print('\ndata types\n')
print(df16.dtypes)


data types

country             object
region              object
rank                 int64
score              float64
gdp                float64
life_expectancy    float64
freedom            float64
corruption         float64
generosity         float64
dtype: object


## 2017 dataset

In [15]:
# Clean the 2017 frame: normalize the labels, drop the columns that are not
# used, and map the remaining labels onto the shared short names.
# NOTE(review): 'family' is dropped and this year ends up with no 'support'
# column. If 'family' is the same measure that 2018/2019 publish as
# 'social_support', rename it to 'support' instead of dropping it — confirm
# against the data source.
df17 = (
    standardize_col_names(df17)
    # errors='ignore' keeps the cell re-runnable: on a second run the columns
    # are already gone and a plain drop would raise KeyError. Trade-off: a
    # column missing from the raw file is skipped silently as well.
    .drop(columns=['whisker_high', 'whisker_low', 'family', 'dystopia_residual'], errors='ignore')
    .rename(columns={
        'happiness_rank': 'rank',
        'happiness_score': 'score',
        'economy__gdp_per_capita_': 'gdp',
        'social_support': 'support',
        'health__life_expectancy_': 'life_expectancy',
        'freedom_to_make_life_choices': 'freedom',
        'trust__government_corruption_': 'corruption',
    })
)
# check datatypes
print('\ndata types\n')
print(df17.dtypes)


data types

country             object
rank                 int64
score              float64
gdp                float64
life_expectancy    float64
freedom            float64
generosity         float64
corruption         float64
dtype: object


## 2018 dataset

In [17]:
# Normalize the 2018 labels, then map them onto the shared short names.
df18 = standardize_col_names(df18).rename(
    columns={
        'overall_rank': 'rank',
        'country_or_region': 'country',
        'gdp_per_capita': 'gdp',
        'social_support': 'support',
        'healthy_life_expectancy': 'life_expectancy',
        'freedom_to_make_life_choices': 'freedom',
        'perceptions_of_corruption': 'corruption',
    }
)
# check datatypes
print('\ndata types\n')
print(df18.dtypes)


data types

rank                 int64
country             object
score              float64
gdp                float64
support            float64
life_expectancy    float64
freedom            float64
generosity         float64
corruption         float64
dtype: object


## 2019 dataset

In [19]:
# Normalize the 2019 labels, then map them onto the shared short names.
df19 = standardize_col_names(df19).rename(
    columns={
        'overall_rank': 'rank',
        'country_or_region': 'country',
        'gdp_per_capita': 'gdp',
        'social_support': 'support',
        'healthy_life_expectancy': 'life_expectancy',
        'freedom_to_make_life_choices': 'freedom',
        'perceptions_of_corruption': 'corruption',
    }
)

# check datatypes
print('\ndata types\n')
print(df19.dtypes)


data types

rank                 int64
country             object
score              float64
gdp                float64
support            float64
life_expectancy    float64
freedom            float64
generosity         float64
corruption         float64
dtype: object


### Some conclusions
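- Only the 2018 and 2019 datasets have a social support column (`support`); the cleaned 2015–2017 datasets do not, so `support` is empty for those rows after concatenation.
- Not all datasets include the `region` column (only 2015 and 2016 do). It could be filled in for the other years, but we first need to decide whether it is important.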

## Concatenate datasets vertically

In [39]:
# Stack the five yearly frames on top of each other with a fresh running
# index, then round the numeric values to three decimals.
# NOTE(review): none of the frames carries a year column, so rows of the same
# country from different years cannot be told apart after this step — confirm
# whether a 'year' column should be added before concatenating.
df_concat = pd.concat(
    [df15, df16, df17, df18, df19],
    ignore_index=True,
).round(3)
df_concat

Unnamed: 0,country,region,rank,score,gdp,life_expectancy,freedom,corruption,generosity,support
0,Switzerland,Western Europe,1,7.587,1.397,0.941,0.666,0.420,0.297,
1,Iceland,Western Europe,2,7.561,1.302,0.948,0.629,0.141,0.436,
2,Denmark,Western Europe,3,7.527,1.325,0.875,0.649,0.484,0.341,
3,Norway,Western Europe,4,7.522,1.459,0.885,0.670,0.365,0.347,
4,Canada,North America,5,7.427,1.326,0.906,0.633,0.330,0.458,
...,...,...,...,...,...,...,...,...,...,...
777,Rwanda,,152,3.334,0.359,0.614,0.555,0.411,0.217,0.711
778,Tanzania,,153,3.231,0.476,0.499,0.417,0.147,0.276,0.885
779,Afghanistan,,154,3.203,0.350,0.361,0.000,0.025,0.158,0.517
780,Central African Republic,,155,3.083,0.026,0.105,0.225,0.035,0.235,0.000


In [47]:
# List every distinct country label in the combined frame.
list(df_concat['country'].unique())

['Switzerland',
 'Iceland',
 'Denmark',
 'Norway',
 'Canada',
 'Finland',
 'Netherlands',
 'Sweden',
 'New Zealand',
 'Australia',
 'Israel',
 'Costa Rica',
 'Austria',
 'Mexico',
 'United States',
 'Brazil',
 'Luxembourg',
 'Ireland',
 'Belgium',
 'United Arab Emirates',
 'United Kingdom',
 'Oman',
 'Venezuela',
 'Singapore',
 'Panama',
 'Germany',
 'Chile',
 'Qatar',
 'France',
 'Argentina',
 'Czech Republic',
 'Uruguay',
 'Colombia',
 'Thailand',
 'Saudi Arabia',
 'Spain',
 'Malta',
 'Taiwan',
 'Kuwait',
 'Suriname',
 'Trinidad and Tobago',
 'El Salvador',
 'Guatemala',
 'Uzbekistan',
 'Slovakia',
 'Japan',
 'South Korea',
 'Ecuador',
 'Bahrain',
 'Italy',
 'Bolivia',
 'Moldova',
 'Paraguay',
 'Kazakhstan',
 'Slovenia',
 'Lithuania',
 'Nicaragua',
 'Peru',
 'Belarus',
 'Poland',
 'Malaysia',
 'Croatia',
 'Libya',
 'Russia',
 'Jamaica',
 'North Cyprus',
 'Cyprus',
 'Algeria',
 'Kosovo',
 'Turkmenistan',
 'Mauritius',
 'Hong Kong',
 'Estonia',
 'Indonesia',
 'Vietnam',
 'Turkey',
 'Ky

In [51]:
# Alternate country labels (first item of each pair) mapped to the single
# name they are replaced with (second item).
# NOTE(review): only exact matches are replaced — re-check the unique-country
# listing for spelling or casing variants that are not covered here.
replacement_dict = dict([
    ('Palestinian Territories', 'Palestine'),
    ('Congo (Brazzaville)', 'Republic of the Congo'),
    ('Congo (Kinshasa)', 'Democratic Republic of the Congo'),
    ('Taiwan Province of China', 'Taiwan'),
    ('Hong Kong S.A.R., China', 'Hong Kong'),
    ('Trinidad & Tobago', 'Trinidad and Tobago'),
    ('Swaziland', 'Eswatini'),
    ('Somaliland region', 'Somaliland'),
])

In [53]:
# Swap every country label listed in replacement_dict for its mapped name;
# labels that are not listed stay as they are.
df_concat = df_concat.assign(country=df_concat['country'].replace(replacement_dict))

In [59]:
# Display the combined frame again (pandas truncates the long output).
df_concat

Unnamed: 0,country,region,rank,score,gdp,life_expectancy,freedom,corruption,generosity,support
0,Switzerland,Western Europe,1,7.587,1.397,0.941,0.666,0.420,0.297,
1,Iceland,Western Europe,2,7.561,1.302,0.948,0.629,0.141,0.436,
2,Denmark,Western Europe,3,7.527,1.325,0.875,0.649,0.484,0.341,
3,Norway,Western Europe,4,7.522,1.459,0.885,0.670,0.365,0.347,
4,Canada,North America,5,7.427,1.326,0.906,0.633,0.330,0.458,
...,...,...,...,...,...,...,...,...,...,...
777,Rwanda,,152,3.334,0.359,0.614,0.555,0.411,0.217,0.711
778,Tanzania,,153,3.231,0.476,0.499,0.417,0.147,0.276,0.885
779,Afghanistan,,154,3.203,0.350,0.361,0.000,0.025,0.158,0.517
780,Central African Republic,,155,3.083,0.026,0.105,0.225,0.035,0.235,0.000


## Potential hypotheses
- The happiness score (`score`) is correlated with GDP (`gdp`).
- Life expectancy (`life_expectancy`) is correlated with another factor (which one is still to be decided).