In [1]:
# Install a pip package in the current Jupyter kernel
#import sys
#!{sys.executable} -m pip install pandas

In [2]:
import glob
import pandas as pd
import os

In [3]:
# Folder holding the yearly happiness CSV exports (file names begin with the year).
# NOTE(review): absolute, machine-specific Windows path; update it before running elsewhere.
path = r'C:\Users\pbrown\Documents\Python\Raw Data\Happiness Data'

In [4]:
# Collect every CSV in the data folder.
# - os.path.join avoids the invalid "\*" escape sequence of the old string concat.
# - sorted() makes the order deterministic (2015, 2016, ... by file name); glob
#   returns files in arbitrary order and later cells slice this list by position.
all_csv = sorted(glob.glob(os.path.join(path, '*.csv')))

In [69]:
# Read each CSV into its own frame and tag every row with the year, which is
# the first four characters of the file's base name.
df_list = [
    pd.read_csv(csv_path, index_col=None, header=0)
      .assign(Year=os.path.basename(csv_path)[:4])
    for csv_path in all_csv
]

### Grab the list of dataframes and sort through the mismatching column headers

In [70]:
# The first two frames (2015 and 2016) share a layout; stack them and discard
# the columns that are not carried forward.
df_1516 = pd.concat(df_list[:2], axis=0, sort=False).drop(
    columns=[
        'Standard Error',
        'Lower Confidence Interval',
        'Upper Confidence Interval',
        'Happiness Rank',
    ]
)

In [71]:
# Frames four and five (2018 and 2019) share a layout. Stack them, rename their
# headers to the names used by the other years, and drop 'Overall rank'.
renames_1819 = {
    'Country or region': 'Country',
    'Score': 'Happiness Score',
    'GDP per capita': 'Economy (GDP per Capita)',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Trust (Government Corruption)',
    'Healthy life expectancy': 'Health (Life Expectancy)',
}
df_1819 = pd.concat(df_list[3:5], axis=0, sort=False)
df_1819 = df_1819.rename(columns=renames_1819).drop(columns=['Overall rank'])

In [72]:
# The third frame (2017) uses dotted column headers: remove the rank and
# whisker columns, then rename the remaining headers to the shared names.
column_map = {
    'Happiness.Score': 'Happiness Score',
    'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
    'Health..Life.Expectancy.': 'Health (Life Expectancy)',
    'Trust..Government.Corruption.': 'Trust (Government Corruption)',
    'Dystopia.Residual': 'Dystopia Residual',
}
df_17 = (
    df_list[2]
    .drop(columns=['Happiness.Rank', 'Whisker.high', 'Whisker.low'])
    .rename(columns=column_map)
)

In [73]:
# Stack 2015-2016 with 2017 and rename 'Family' to 'Social support' so the
# column matches the 2018-2019 frame.
df_151617 = (
    pd.concat([df_1516, df_17], sort=False)
    .rename(columns={'Family': 'Social support'})
)

In [82]:
# Combine all years into one frame with a fresh 0..n-1 index, then remove the
# two columns that are dropped from the combined data.
master_df = (
    pd.concat([df_1819, df_151617], ignore_index=True, sort=False)
    .drop(columns=['Region', 'Dystopia Residual'])
)

In [83]:
# Overall rank: position of each row's score across every country-year in the
# frame (1 = highest score). With method='max', tied scores all receive the
# largest rank of their tie group.
master_df['Overall Rank'] = master_df['Happiness Score'].rank(ascending=False, method='max').astype(int)

# Rank within year: 1 = highest score of that year.
# The previous version assigned `sort_values(...).index + 1`, i.e. the original
# row labels in sorted order, pasted positionally. That is not a rank and never
# restarts per year. Ranking inside each Year group fixes this; method='first'
# keeps ranks unique (ties are broken by row order).
master_df['Yearly Rank'] = (
    master_df.groupby('Year')['Happiness Score']
    .rank(ascending=False, method='first')
    .astype(int)
)

### Now you have a dataframe of happiness data, Hurray!
What are you going to do with it? Maybe we could add:
* Region Classification (https://meta.wikimedia.org/wiki/List_of_countries_by_regional_classification)

* Income Inequality (https://en.wikipedia.org/wiki/List_of_countries_by_income_equality)

* Household Debt (https://en.wikipedia.org/wiki/List_of_countries_by_household_debt)

* Economic Freedom (https://en.wikipedia.org/wiki/List_of_countries_by_economic_freedom)

* Unemployment Rate (https://en.wikipedia.org/wiki/List_of_countries_by_unemployment_rate)

* Development Aid Donations (https://en.wikipedia.org/wiki/List_of_development_aid_country_donors)

* Suicide Rate (https://en.wikipedia.org/wiki/List_of_countries_by_suicide_rate)

* Life Expectancy (https://en.wikipedia.org/wiki/List_of_countries_by_life_expectancy)

* Intentional Homicide Rate (https://en.wikipedia.org/wiki/List_of_countries_by_intentional_homicide_rate)

* Dependency Ratio (https://en.wikipedia.org/wiki/List_of_countries_by_dependency_ratio)

* Education Index (https://en.wikipedia.org/wiki/Education_Index)

* Number of Internet Users (https://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users)

In [84]:
# Preview the first ten rows after ordering by Year, then Happiness Score, both descending.
master_df.sort_values(['Year', 'Happiness Score'], ascending=False).iloc[:10]

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Social support,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption),Year,Overall Rank,Yearly Rank
156,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,2019,1,1
157,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,2019,3,2
158,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,2019,8,3
159,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,2019,19,4
160,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,2019,21,5
161,Switzerland,7.48,1.452,1.526,1.052,0.572,0.263,0.343,2019,23,6
162,Sweden,7.343,1.387,1.487,1.009,0.574,0.267,0.373,2019,33,7
163,New Zealand,7.307,1.303,1.557,1.026,0.585,0.33,0.38,2019,42,8
164,Canada,7.278,1.365,1.505,1.039,0.584,0.285,0.308,2019,49,9
165,Austria,7.246,1.376,1.475,1.016,0.532,0.244,0.226,2019,52,10


## Regional Labels

In [85]:
# Take the first table on the regional-classification page and discard its
# 'Global South' column.
regional = (
    pd.read_html('https://meta.wikimedia.org/wiki/List_of_countries_by_regional_classification')[0]
    .drop(columns=['Global South'])
)

In [86]:
# Sanity check before merging: how many rows there are and how many lack a country name.
country_col = master_df['Country']
print('There are {} records for countries and there are {} nulls'.format(
    country_col.size, country_col.isna().sum()))

There are 782 records for countries and there are 0 nulls


In [87]:
# Left-join the regional table on Country so every happiness row is kept,
# then report how many rows ended up without a Region.
master_df = pd.merge(master_df, regional, how='left', on='Country')
unjoined_regions = master_df['Region'].isna().sum()
print('There are {} values that we were unable to join'.format(unjoined_regions))

There are 71 values that we were unable to join


In [101]:
# Sorted list with one entry per country whose rows have no Region after the merge.
miss_regions = (
    master_df.loc[master_df['Region'].isna(), 'Country']
    .dropna()
    .drop_duplicates()
    .sort_values()
    .tolist()
)

## Income Equality

In [138]:
# Parse every table on the income-equality page and keep the third one.
income_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_income_equality')
income_equality = income_tables[2]

In [139]:
# Strip the two outermost header levels from the column index.
income_equality.columns = income_equality.columns.droplevel([0, 1])
# Keep only the first and fourth columns and label them as country and Gini score.
# NOTE(review): the column positions depend on the page's table layout; confirm them if the page changes.
income_equality = income_equality.iloc[:, [0, 3]]
income_equality.columns = ['Country', 'Gini Score']

In [140]:
# Left-join the Gini scores on Country, then report how many rows have no score.
master_df = pd.merge(master_df, income_equality, how='left', on='Country')
unjoined_gini = master_df['Gini Score'].isna().sum()
print('There are {} values that we were unable to join'.format(unjoined_gini))

There are 92 values that we were unable to join


## Suicide Rate

In [None]:
# Load the suicide-rate tables. The previous URL pointed at the income-equality
# page (copy-paste slip); this is the suicide-rate page listed in the ideas above.
# NOTE(review): the table position [2] was carried over from the income-equality
# cell and has not been checked against this page; confirm which table is wanted.
suicide_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_suicide_rate')[2]