# Import Libraries

In [35]:
import pandas as pd
from sklearn.impute import KNNImputer

# Obtain Data

Read the CSV file of happiness data into a pandas DataFrame

In [36]:
# Load the happiness report CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Data/world-happiness-report.csv')
# Preview the first five rows to check that the columns parsed as expected.
df.head(n=5)

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
3,Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
4,Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268


# Scrub Data

Count the number of data points available for each year in order to determine which year to use for modeling

In [37]:
# Count the non-null 'Country name' entries for each year, i.e. how many rows
# each survey year contributes.
df.groupby('year')['Country name'].count()

year
2005     27
2006     89
2007    102
2008    110
2009    114
2010    124
2011    146
2012    142
2013    137
2014    145
2015    143
2016    142
2017    147
2018    142
2019    144
2020     95
Name: Country name, dtype: int64

Use the 2017 data for modeling, since 2017 has the most data points (147), and drop the data for all other years from the DataFrame

In [38]:
# Year whose rows are kept for modeling; the per-year counts above show it has
# the most rows (147).
MODEL_YEAR = 2017

# Build the filtered frame in a single non-mutating chain. Filtering with a
# boolean mask and then calling inplace=True methods on the result can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = (
    df.loc[df['year'] == MODEL_YEAR]
    .drop(columns='year')      # constant after filtering, so it carries no information
    .reset_index(drop=True)    # renumber the remaining rows 0..n-1
)
df.head()

Unnamed: 0,Country name,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2.662,7.697,0.491,52.8,0.427,-0.121,0.954,0.496,0.371
1,Albania,4.64,9.476,0.638,68.4,0.75,-0.029,0.876,0.669,0.334
2,Algeria,5.249,9.354,0.807,65.7,0.437,-0.167,0.7,0.642,0.289
3,Argentina,6.039,10.067,0.907,68.6,0.832,-0.186,0.841,0.809,0.292
4,Armenia,4.288,9.402,0.698,66.6,0.614,-0.147,0.865,0.625,0.437


Use `df.info()` to determine what scrubbing is needed

In [39]:
# Summarize each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      147 non-null    object 
 1   Life Ladder                       147 non-null    float64
 2   Log GDP per capita                146 non-null    float64
 3   Social support                    146 non-null    float64
 4   Healthy life expectancy at birth  143 non-null    float64
 5   Freedom to make life choices      146 non-null    float64
 6   Generosity                        145 non-null    float64
 7   Perceptions of corruption         137 non-null    float64
 8   Positive affect                   146 non-null    float64
 9   Negative affect                   146 non-null    float64
dtypes: float64(9), object(1)
memory usage: 11.6+ KB


Drop object variables and other variables not useful for modeling

In [41]:
# Columns excluded from modeling: the country label (object dtype) and the two
# affect measures.
columns_to_drop = ['Country name', 'Positive affect', 'Negative affect']

# Rebind to the frame returned by drop() rather than using inplace=True, which
# pandas discourages and which mutates the existing object.
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption
0,2.662,7.697,0.491,52.8,0.427,-0.121,0.954
1,4.64,9.476,0.638,68.4,0.75,-0.029,0.876
2,5.249,9.354,0.807,65.7,0.437,-0.167,0.7
3,6.039,10.067,0.907,68.6,0.832,-0.186,0.841
4,4.288,9.402,0.698,66.6,0.614,-0.147,0.865


Replace missing values using a KNN imputer (`KNNImputer`)

In [57]:
# Feature columns only: the target ('Life Ladder') is excluded from imputation.
x = df.drop(columns='Life Ladder')

# KNNImputer with its default settings.
# NOTE(review): the features are on very different scales (e.g. life expectancy
# in the 50s-60s vs. ratios below 1), so the neighbour distances are presumably
# dominated by the large-valued columns; consider scaling before imputing -- TODO confirm.
knn_imputer = KNNImputer()

# The imputed values are wrapped back into a DataFrame with the original column
# labels AND the original row index. Without index=x.index the new frame gets a
# fresh 0..n-1 index, and the label-aligned concat below would only line up
# when df happens to carry that same index.
imputed_x = pd.DataFrame(
    knn_imputer.fit_transform(x),
    columns=x.columns,
    index=x.index,
)

# Re-attach the target column next to the imputed features.
scrubbed_df = pd.concat([df[['Life Ladder']], imputed_x], axis=1)
scrubbed_df.head()

Unnamed: 0,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption
0,2.662,7.697,0.491,52.8,0.427,-0.121,0.954
1,4.64,9.476,0.638,68.4,0.75,-0.029,0.876
2,5.249,9.354,0.807,65.7,0.437,-0.167,0.7
3,6.039,10.067,0.907,68.6,0.832,-0.186,0.841
4,4.288,9.402,0.698,66.6,0.614,-0.147,0.865


Confirm with `scrubbed_df.info()` that the data is now scrubbed (no missing values remain in any column)

In [58]:
# Check the non-null counts of the imputed frame to verify that no missing values remain.
scrubbed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Life Ladder                       147 non-null    float64
 1   Log GDP per capita                147 non-null    float64
 2   Social support                    147 non-null    float64
 3   Healthy life expectancy at birth  147 non-null    float64
 4   Freedom to make life choices      147 non-null    float64
 5   Generosity                        147 non-null    float64
 6   Perceptions of corruption         147 non-null    float64
dtypes: float64(7)
memory usage: 8.2 KB


# Explore Data

Use isolation forest to identify outliers

In [None]:
# Feature matrix for outlier detection, taken from the imputed frame because
# `df` still has missing values in several columns.
# NOTE(review): the original line was `x = df.drop(columns=)`, which is a
# syntax error. The column to drop is assumed to be the target, mirroring the
# imputation cell -- TODO confirm the intended columns.
x = scrubbed_df.drop(columns='Life Ladder')