In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
import os
import statsmodels.formula.api as stats
from statsmodels.formula.api import ols
import sklearn
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error
from plotly import __version__
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from sklearn.cluster import KMeans

# Objective: Determine whether natural hazards are the cause of Haiti's low happiness score

# Data Cleaning

In [2]:
# World Happiness Report "Chapter 2 online data" workbooks, one per report year.

# 2019 Happiness report: both sheets come from the same workbook, so the file is
# parsed once. Passing a list to `sheet_name` makes read_excel return a dict of
# DataFrames keyed by sheet name.
sheets_2019 = pd.read_excel("WHR2019Chapter2OnlineData.xls",
                            sheet_name=['Table2.1', 'Figure2.6'])
df2019_1 = sheets_2019['Table2.1']
df2019_6 = sheets_2019['Figure2.6']

# 2018 Happiness report
df2018_2 = pd.read_excel("WHR2018Chapter2OnlineData.xls", sheet_name='Figure2.2')
# 2017 Happiness report
df2017_2 = pd.read_excel("WHR2017Chapter2OnlineData.xlsx", sheet_name='Figure2.2 WHR 2017')
# 2016 Happiness report
df2016_2 = pd.read_excel("WHR2016Chapter2OnlineData.xlsx", sheet_name='Figure2.2')

In [3]:
# Replace every missing value in the 2019 table with 0 (done in place).
df2019_1.fillna(0, inplace=True)

# Short aliases for the verbose report column headers. rename() ignores keys
# that are not present in the frame, so a mismatched key is silently skipped.
# NOTE(review): the Haiti plotting cell still selects 'Social support' and
# 'Generosity' from this frame, which suggests the 'Social Support' and
# 'Explained by: Generosity' keys below do not match any column -- confirm.
# NOTE(review): 'Ecomomy' looks like a typo for 'Economy', and the
# 'Confidence in national government ' key has a trailing space -- confirm
# against the sheet's real headers before relying on these aliases.
column_aliases_2019 = {
    'GINI index': 'Ecomomy',
    'Healthy life expectancy at birth': 'Health',
    'Confidence in national government ': 'Trust',
    'Explained by: Generosity': 'Generosity',
    'Freedom to make life choices': 'Freedom',
    'Country name': 'Country',
    'Social Support': 'Family',
    'Perceptions of corruption': 'Corruption',
}
df2019_1.rename(columns=column_aliases_2019, inplace=True)


In [4]:
# Print the column names, non-null counts and dtypes of df2019_6
df2019_6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 26 columns):
Country name                                                1704 non-null object
Year                                                        1704 non-null int64
Life Ladder                                                 1704 non-null float64
Log GDP per capita                                          1676 non-null float64
Social support                                              1691 non-null float64
Healthy life expectancy at birth                            1676 non-null float64
Freedom to make life choices                                1675 non-null float64
Generosity                                                  1622 non-null float64
Perceptions of corruption                                   1608 non-null float64
Positive affect                                             1685 non-null float64
Negative affect                                             1691 non-null fl

In [12]:
# Preview the first rows of df2019_6
df2019_6.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,...,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
0,Afghanistan,2008,3.72359,7.16869,0.450662,50.799999,0.718114,0.177889,0.881686,0.517637,...,,,,,,,,,,
1,Afghanistan,2009,4.401778,7.33379,0.552308,51.200001,0.678896,0.200178,0.850035,0.583926,...,,,0.441906,0.286315,,,,,,
2,Afghanistan,2010,4.758381,7.386629,0.539075,51.599998,0.600127,0.134353,0.706766,0.618265,...,,,0.327318,0.275833,,,,,,
3,Afghanistan,2011,3.831719,7.415019,0.521104,51.919998,0.495901,0.172137,0.731109,0.611387,...,,,0.336764,,,,,,,
4,Afghanistan,2012,3.782938,7.517126,0.520637,52.240002,0.530935,0.244273,0.77562,0.710385,...,,,0.34454,,,,,,,


In [15]:
# Reduce the 2019 frame to the country, the happiness score with its whiskers,
# the dystopia residual and the per-factor "Explained by" contributions.
# NOTE(review): the recorded run of this cell raised a KeyError for every column
# except 'Country name', and the recorded df2019_6.info() output lists panel
# columns (Year, Life Ladder, ...) instead -- the frame may not have been loaded
# from the intended sheet; confirm before relying on this selection.
happiness_columns_2019 = [
    'Country name',
    'Happiness score',
    'Whisker-high',
    'Whisker-low',
    'Dystopia (1.88) + residual',
    'Explained by: GDP per capita',
    'Explained by: Social support',
    'Explained by: Freedom to make life choices',
    'Explained by: Healthy life expectancy',
    'Explained by: Generosity',
    'Explained by: Perceptions of corruption',
]
df2019_6 = df2019_6[happiness_columns_2019]
# Per column: does it contain any missing value?
df2019_6.isnull().any()

KeyError: "['Happiness score' 'Whisker-high' 'Whisker-low'\n 'Dystopia (1.88) + residual' 'Explained by: GDP per capita'\n 'Explained by: Social support'\n 'Explained by: Freedom to make life choices'\n 'Explained by: Healthy life expectancy' 'Explained by: Generosity'\n 'Explained by: Perceptions of corruption'] not in index"

In [6]:
# Haiti's rows of the 2019 table, indexed by year
is_haiti = df2019_1['Country'] == 'Haiti'
Haiti = pd.DataFrame(df2019_1[is_haiti]).set_index('Year')

# One line chart per indicator, stacked as subplots of a single figure
indicator_columns = ['Log GDP per capita', 'Social support', 'Health',
                     'Freedom', 'Generosity', 'Corruption']
y = Haiti[indicator_columns]
axes = y.plot.line(subplots=True, figsize=(12, 10))

In [7]:
# Choropleth of the INFORM 2019 natural hazard index, scoped to North America.
# NOTE(review): the recorded run of this cell raised KeyError: 'COUNTRY'; the
# sheet's column headers may not be on the first row (see read_excel `header=`)
# -- confirm against the workbook.
Hazard = pd.read_excel('INFORM_2019.xlsx', sheet_name='Hazard & Exposure')
data = dict(
    type='choropleth',
    locations=Hazard['COUNTRY'],
    locationmode='country names',
    colorscale='Viridis',
    z=Hazard['INFORM Natural Hazard'],
)
lyt = dict(geo=dict(scope='north america'), title='2019 Hazard Index')
# Named `hazard_map` rather than `map` so the builtin map() is not shadowed
# for the rest of the notebook.
hazard_map = go.Figure(data=[data], layout=lyt)
# Render with the offline `iplot` imported from plotly.offline (the notebook
# calls init_notebook_mode); `py.iplot` is the online variant and needs
# Plotly cloud credentials.
iplot(hazard_map)

KeyError: 'COUNTRY'

Comment: Most countries around Haiti have a relatively high hazard index

# Natural Disaster Data 

In [8]:
def _haiti_rows(indicator_frame):
    """Return the rows of `indicator_frame` whose 'COUNTRY' value is 'Haiti'."""
    return pd.DataFrame(indicator_frame[indicator_frame['COUNTRY'] == 'Haiti'])


# "Indicator Data" sheets of the INFORM 2016, 2017 and 2019 workbooks
# NOTE(review): the recorded run of this cell raised KeyError: 'COUNTRY' --
# confirm that the sheets expose a 'COUNTRY' column header.
df2 = pd.read_excel('INFORM_2016.xlsx', sheet_name='Indicator Data')
df1 = pd.read_excel('INFORM_2017.xlsx', sheet_name='Indicator Data')
df = pd.read_excel('INFORM_2019.xlsx', sheet_name='Indicator Data')

# Extract the haiti data
haiti_2016 = _haiti_rows(df2)
haiti_2017 = _haiti_rows(df1)
haiti_2019 = _haiti_rows(df)

KeyError: 'COUNTRY'

In [None]:
AFFECTED_COLUMN = 'Number of people affected by Natural Disasters in Haiti'


def _affected_by_year(haiti_rows, column_to_year):
    """Return a one-column frame of people affected by natural disasters, per year.

    Parameters
    ----------
    haiti_rows : DataFrame
        The Haiti row of one INFORM "Indicator Data" sheet.
    column_to_year : dict
        Maps each source column to select onto the year label (a string)
        it should be reported under.

    Returns
    -------
    DataFrame
        Indexed by the year labels, with the single column AFFECTED_COLUMN.

    Raises
    ------
    ValueError
        If `haiti_rows` does not hold exactly one row.
    """
    counts = pd.DataFrame(haiti_rows[list(column_to_year)]).rename(columns=column_to_year)
    if len(counts) != 1:
        raise ValueError(
            'expected exactly one Haiti row, found {}'.format(len(counts)))
    # Label the single row directly instead of renaming a hard-coded row label:
    # a rename silently does nothing when Haiti sits at a different row of the
    # sheet, and the column lookup below would then fail.
    counts.index = [AFFECTED_COLUMN]
    return counts.transpose()


# Extract the data on the number of people affected by natural disaster in Haiti in 2014
ND2 = _affected_by_year(haiti_2016, {'People affected by Natural Disasters.1': '2014'})

# Extract the data on the number of people affected by natural disaster in Haiti in 2015
ND1 = _affected_by_year(haiti_2017, {'People affected by Natural Disasters.2': '2015'})

# Extract the data on the number of people affected by natural disaster in Haiti in 2016, 2017, 2018
ND = _affected_by_year(haiti_2019, {'People affected by Natural Disasters': '2016',
                                    'People affected by Natural Disasters.1': '2017',
                                    'People affected by Natural Disasters.2': '2018'})

# Create a DataFrame containing the years and the number of people affected by natural disaster for each year
NaturalHazard = (
    pd.concat([ND2, ND1, ND])
    .reset_index()
    .rename(columns={'index': 'Year'})
    .astype({AFFECTED_COLUMN: 'float64'})
)
NaturalHazard.info()

# Number of People affected by Natural Disaster per year

In [None]:
# Line chart of the people affected by natural disasters in Haiti, per year
fig, ax = plt.subplots(figsize=(10, 8))
years = NaturalHazard['Year']
people_affected = NaturalHazard['Number of people affected by Natural Disasters in Haiti']
ax.plot(years, people_affected)
# Label both axes
ax.set_xlabel('Year')
ax.set_ylabel('People affected by Natural Disasters in Haiti')


# How did the natural hazard in Haiti affect the happiness?

In [None]:
# Column names of the 2019 Figure 2.6 data: long report headers -> short aliases
long_to_short = {
    'Explained by: GDP per capita': 'GDP',
    'Dystopia (1.88) + residual': 'Dystopia.residual',
    'Explained by: Social support': 'Family',
    'Explained by: Healthy life expectancy': 'Health',
    'Explained by: Freedom to make life choices': 'Freedom',
    'Explained by: Generosity': 'Generosity',
    'Explained by: Perceptions of corruption': 'Corruption',
}
short_to_long = {short: long for long, short in long_to_short.items()}


def _haiti_score_row(report, year):
    """Return Haiti's row(s) of `report` with a leading 'Year' column set to `year`."""
    row = pd.DataFrame(report[report['Country'] == 'Haiti'])
    row.insert(0, column='Year', value=year)
    return row


# Temporary copy of the 2019 data carrying the long column names, so that its
# columns line up with the other report years when concatenated; df2019_6 itself
# is not touched by this step. The country column is also normalised to
# 'Country', because an earlier cell selects it as 'Country name' while the
# filter in _haiti_score_row expects 'Country'.
figure26_long = df2019_6.rename(columns={**short_to_long, 'Country name': 'Country'})

# get happiness score of haiti from 2014 to 2018
# extracted from world happiness report 2015, pg 28
score_2014 = pd.DataFrame({'Year': ['2014'], 'Happiness score': [4.518]})
# Each report's figure is used for the year before the report (2016 report -> 2015, ...)
score_2015 = _haiti_score_row(df2016_2, '2015')
score_2016 = _haiti_score_row(df2017_2, '2016')
score_2017 = _haiti_score_row(df2018_2, '2017')
score_2018 = _haiti_score_row(figure26_long, '2018')

# Change the names of the columns: df2019_6 ends up with the short aliases
df2019_6.rename(columns=long_to_short, inplace=True)

# Create dataframe of happiness score and score in of haiti from 2014 to 2018
Hpscore = pd.concat(
    [score_2014, score_2015, score_2016, score_2017, score_2018], sort=False
).reset_index(drop=True)
Hpscore.head()

In [9]:
# Add happiness score data to natural hazard data.
# Scores are matched on 'Year' (both frames store the year as a string) rather
# than on the positional row index, so a different row order cannot pair a score
# with the wrong year.
score_by_year = Hpscore.set_index('Year')['Happiness score']
happiness_score = NaturalHazard['Year'].map(score_by_year)
if 'Happiness score' in NaturalHazard.columns:
    # Re-running the cell: DataFrame.insert would raise because the column
    # already exists, so refresh its values in place instead.
    NaturalHazard['Happiness score'] = happiness_score
else:
    NaturalHazard.insert(1, column='Happiness score', value=happiness_score)
NaturalHazard.info()

NameError: name 'NaturalHazard' is not defined

In [10]:
panel_size = (6, 4)
affected_column = "Number of people affected by Natural Disasters in Haiti"

# Happiness score per year
f, axes = plt.subplots(figsize=panel_size)
sb.lineplot(x="Year", y="Happiness score", data=NaturalHazard, ax=axes)

# People affected by natural disasters per year, on a figure of its own
f, axes = plt.subplots(figsize=panel_size)
sb.lineplot(x="Year", y=affected_column, data=NaturalHazard, ax=axes)

NameError: name 'NaturalHazard' is not defined

In [11]:
# People affected by natural disasters, scaled to millions for the second trace
# (vectorised division instead of a manual insert/counter loop).
Value = (NaturalHazard["Number of people affected by Natural Disasters in Haiti"] / 1000000).tolist()

# Use of Plotly
# Create and style traces
trace0 = go.Scatter(
    x=NaturalHazard['Year'],
    y=NaturalHazard['Happiness score'],
    name='Happiness score',
    line=dict(color='rgb(205, 12, 24)', width=4),
)
trace1 = go.Scatter(
    x=NaturalHazard['Year'],
    y=Value,
    name='People affected by Natural Disasters in Haiti(in Millions)',
    line=dict(color='rgb(22, 96, 167)', width=4),
)

data = [trace0, trace1]

# Edit the layout
layout = dict(title='Happiness score and Number of victims of Natural Hazard in Haiti',
              xaxis=dict(title='Year'),
              )

fig = dict(data=data, layout=layout)
# Render with the offline `iplot` imported from plotly.offline (the notebook
# calls init_notebook_mode); `py.iplot` is the online variant and needs
# Plotly cloud credentials.
iplot(fig, filename='styled-line')

NameError: name 'NaturalHazard' is not defined

Comment: The happiness score decreases as the number of people affected by natural disasters in the country increases.