### IMPORTS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_iris
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
# NOTE(review): with no category/module filter this silences EVERY warning for
# the rest of the kernel session, including deprecation warnings from the
# libraries imported above. Consider narrowing the filter once the analysis is
# stable so real problems are not hidden.
warnings.filterwarnings('ignore')

### World Happiness Dataset 
The following is the raw World Happiness Report (WHR) dataset, exactly as downloaded. It contains multiple rows per country, typically one per year over roughly 5-7 years (some countries have more). We are only interested in the year 2015, so we will first trim and clean the dataset and keep only what we need.

In [2]:
# Load the raw World Happiness Report panel from the local Excel workbook.
world_happiness_df = pd.read_excel(
    io='./Datasets/World_Happiness/DataPanelWHR2021C2_1.xls',
    header=0,  # the first spreadsheet row holds the column names
)

In [17]:
# Preview the first 15 rows of the raw frame to check the columns and layout.
display(world_happiness_df.iloc[:15])

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.3701,0.450662,50.799999,0.718114,0.16764,0.881686,0.517637,0.258195
1,Afghanistan,2009,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092
2,Afghanistan,2010,4.758381,7.646709,0.539075,51.599998,0.600127,0.12059,0.706766,0.618265,0.275324
3,Afghanistan,2011,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175
4,Afghanistan,2012,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.77562,0.710385,0.267919
5,Afghanistan,2013,3.5721,7.725029,0.483552,52.560001,0.577955,0.061148,0.823204,0.620585,0.273328
6,Afghanistan,2014,3.130896,7.718354,0.525568,52.880001,0.508514,0.104013,0.871242,0.531691,0.374861
7,Afghanistan,2015,3.982855,7.701992,0.528597,53.200001,0.388928,0.079864,0.880638,0.553553,0.339276
8,Afghanistan,2016,4.220169,7.69656,0.559072,53.0,0.522566,0.042265,0.793246,0.564953,0.348332
9,Afghanistan,2017,2.661718,7.697381,0.49088,52.799999,0.427011,-0.121303,0.954393,0.496349,0.371326


### Trim the dataset to use what we need
We are only interested in the data for year 2015

In [18]:
# Keep only the rows recorded for 2015. The explicit .copy() makes the subset an
# independent frame, so later cleaning steps never act on (or warn about) a
# slice of the raw data.
world_happiness_df_2015 = world_happiness_df[world_happiness_df["year"] == 2015].copy()

# Summary statistics for the numeric columns of the 2015 subset.
display(world_happiness_df_2015.describe())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() also rendered a stray "None" below the report.
world_happiness_df_2015.info()

Unnamed: 0,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
count,143.0,143.0,140.0,142.0,139.0,140.0,139.0,133.0,142.0,142.0
mean,2015.0,5.404037,9.396412,0.798211,63.665073,0.748704,0.0197,0.736524,0.709288,0.278709
std,0.0,1.116106,1.171639,0.125879,7.16715,0.136429,0.166145,0.195702,0.106159,0.085422
min,2015.0,2.701591,6.934621,0.434389,46.0,0.388928,-0.271978,0.094604,0.36944,0.103494
25%,2015.0,4.614304,8.470333,0.728894,58.25,0.655594,-0.094748,0.673476,0.62542,0.214518
50%,2015.0,5.344383,9.452952,0.825523,65.5,0.775885,-0.02866,0.809943,0.713898,0.27449
75%,2015.0,6.279204,10.34723,0.900397,68.400002,0.850633,0.111643,0.862374,0.800305,0.331707
max,2015.0,7.603434,11.616853,0.987343,75.900002,0.979937,0.68756,0.961651,0.910497,0.642589


<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 7 to 1943
Data columns (total 11 columns):
Country name                        143 non-null object
year                                143 non-null int64
Life Ladder                         143 non-null float64
Log GDP per capita                  140 non-null float64
Social support                      142 non-null float64
Healthy life expectancy at birth    139 non-null float64
Freedom to make life choices        140 non-null float64
Generosity                          139 non-null float64
Perceptions of corruption           133 non-null float64
Positive affect                     142 non-null float64
Negative affect                     142 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 13.4+ KB


None

We see that several columns have null/empty values, so it seems we have some data cleaning to do. A plus point is that all our columns (except `Country name` and `year`) are float types. That will make it easy to fill in the null values.

### Let's observe the null values for each column in the dataset focused on year 2015

In [16]:
# Count the missing values in each column of the 2015 subset.
world_happiness_df_2015.isna().sum(axis="index")

Country name                         0
year                                 0
Life Ladder                          0
Log GDP per capita                   3
Social support                       1
Healthy life expectancy at birth     4
Freedom to make life choices         3
Generosity                           4
Perceptions of corruption           10
Positive affect                      1
Negative affect                      1
dtype: int64

### Data Clean

In [12]:
# Discard every row that is missing at least one value, then re-count the
# nulls per column to confirm that none remain.
world_happiness_df_2015 = world_happiness_df_2015.dropna(axis="index", how="any")
world_happiness_df_2015.isna().sum(axis="index")

Country name                        0
year                                0
Life Ladder                         0
Log GDP per capita                  0
Social support                      0
Healthy life expectancy at birth    0
Freedom to make life choices        0
Generosity                          0
Perceptions of corruption           0
Positive affect                     0
Negative affect                     0
dtype: int64

In [4]:
# Execute the external script ./Scripts/data_cleaning.py via IPython's %run magic.
# NOTE(review): the script's contents are not visible from this notebook; the
# saved output only shows it printing "Hello" -- confirm what it actually cleans.
%run ./Scripts/data_cleaning.py

Hello
