# EDA and Cleaning HDI Data

In [27]:
# Import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Reading in HDI data 
hdi = pd.read_csv('./aid_data/human_development/human_development_index.csv', encoding='latin-1')

According to the United Nations Development Programme: "The HDI was created to emphasize that people and their capabilities should be the ultimate criteria for assessing the development of a country, not economic growth alone. The HDI can also be used to question national policy choices, asking how two countries with the same level of GNI per capita can end up with different human development outcomes. These contrasts can stimulate debate about government policy priorities. The Human Development Index (HDI) is a summary measure of average achievement in key dimensions of human development: a long and healthy life, being knowledgeable and have a decent standard of living. The HDI is the geometric mean of normalized indices for each of the three dimensions." (http://hdr.undp.org/en/content/human-development-index-hdi)

In [29]:
# Looking at my data frame 
hdi.head()

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
0,Algeria,0.646,0.655,0.666,0.676,0.685,0.694,0.699,0.708,0.711,0.72,0.73,0.738,0.737,0.746,0.749
1,Angola,0.394,0.404,0.419,0.428,0.44,0.453,0.466,0.482,0.494,0.508,0.51,0.525,0.537,0.547,0.557
2,Bahrain,0.792,0.792,0.792,0.793,0.792,0.792,0.793,0.796,0.796,0.794,0.796,0.798,0.8,0.807,0.81
3,Benin,0.398,0.41,0.419,0.426,0.434,0.44,0.447,0.455,0.462,0.468,0.473,0.479,0.489,0.5,0.505
4,Botswana,0.578,0.58,0.576,0.583,0.589,0.598,0.612,0.625,0.638,0.647,0.66,0.676,0.687,0.699,0.709


In [30]:
# Looking at null values and data types 
hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 16 columns):
Country    80 non-null object
2000       80 non-null object
2001       80 non-null object
2002       80 non-null object
2003       80 non-null object
2004       80 non-null object
2005       80 non-null object
2006       80 non-null object
2007       80 non-null object
2008       80 non-null object
2009       80 non-null object
2010       80 non-null object
2011       80 non-null object
2012       80 non-null object
2013       80 non-null object
2014       80 non-null object
dtypes: object(16)
memory usage: 10.1+ KB


In [31]:
# Changing all those values to numeric data types 
hdi[['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014']] = hdi[['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014']].apply(pd.to_numeric, errors='coerce')

In [32]:
# Looking at my new null values and data types 
hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 16 columns):
Country    80 non-null object
2000       72 non-null float64
2001       72 non-null float64
2002       72 non-null float64
2003       73 non-null float64
2004       74 non-null float64
2005       77 non-null float64
2006       77 non-null float64
2007       77 non-null float64
2008       77 non-null float64
2009       77 non-null float64
2010       78 non-null float64
2011       78 non-null float64
2012       78 non-null float64
2013       78 non-null float64
2014       78 non-null float64
dtypes: float64(15), object(1)
memory usage: 10.1+ KB


In [33]:
# Setting country as the index 
hdi = hdi.set_index('Country')

In [34]:
# Stacking the data frame 
hdi = hdi.stack(dropna=False)

In [35]:
# Looking at my new data frame 
hdi.head(20)

Country      
Algeria  2000    0.646
         2001    0.655
         2002    0.666
         2003    0.676
         2004    0.685
         2005    0.694
         2006    0.699
         2007    0.708
         2008    0.711
         2009    0.720
         2010    0.730
         2011    0.738
         2012    0.737
         2013    0.746
         2014    0.749
Angola   2000    0.394
         2001    0.404
         2002    0.419
         2003    0.428
         2004    0.440
dtype: float64

In [36]:
# Saving my HDI to a data frame 
hdi = hdi.to_frame()

In [37]:
# Looking at my data frame 
hdi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,2000,0.646
Algeria,2001,0.655
Algeria,2002,0.666
Algeria,2003,0.676
Algeria,2004,0.685


In [38]:
# Renaming my blank column to year 
hdi.rename(columns = {list(hdi)[0]: 'year'}, inplace = True)

In [39]:
# Saving the dataframe to a csv
hdi.to_csv('./aid_data/human_development/hdi.csv', index=True)

In [40]:
# Reading the new csv 
hdi = pd.read_csv('./aid_data/human_development/hdi.csv', encoding='latin-1')

In [41]:
# Checking out the head of my data frame 
hdi.head()

Unnamed: 0,Country,Unnamed: 1,year
0,Algeria,2000,0.646
1,Algeria,2001,0.655
2,Algeria,2002,0.666
3,Algeria,2003,0.676
4,Algeria,2004,0.685


In [42]:
# Renaming my columns 
hdi.rename(columns={'Unnamed: 1': 'year', 'year': 'hdi', 'Country': 'country'}, inplace=True)

In [43]:
# Checking out null values and data types 
hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
country    1200 non-null object
year       1200 non-null int64
hdi        1138 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 28.2+ KB


In [44]:
# Looking at the mean values 
hdi.mean()

year    2007.00000
hdi        0.55108
dtype: float64

In [45]:
# Looking at the mean by year 
hdi.groupby('year')['hdi'].mean()

year
2000    0.507431
2001    0.512597
2002    0.517708
2003    0.521890
2004    0.530500
2005    0.536338
2006    0.544234
2007    0.551636
2008    0.558948
2009    0.565390
2010    0.569372
2011    0.576308
2012    0.582705
2013    0.587179
2014    0.592167
Name: hdi, dtype: float64

In [46]:
# Filling in null values with the mean for that year 
hdi['hdi'] = hdi.groupby('year')['hdi'].transform(lambda x: x.fillna(x.mean()))

In [47]:
# Checking to see that my null values are filled in 
hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
country    1200 non-null object
year       1200 non-null int64
hdi        1200 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 28.2+ KB


In [48]:
# Looking at the unique country values 
hdi['country'].unique()

array(['Algeria', 'Angola', 'Bahrain', 'Benin', 'Botswana',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon',
       'Central African Republic', 'Chad', 'Comoros', 'Congo',
       'Congo (Democratic Republic of the)', "CÃ´te d'Ivoire", 'Djibouti',
       'Egypt', 'Equatorial Guinea', 'Eritrea', 'Ethiopia', 'Gabon',
       'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau',
       'Iran (Islamic Republic of)', 'Iraq', 'Israel', 'Jordan', 'Kenya',
       'Kuwait', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar',
       'Malawi', 'Maldives', 'Mali', 'Mauritania', 'Mauritius', 'Morocco',
       'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Oman',
       'Palestine, State of', 'Qatar', 'Rwanda', 'Sao Tome and Principe',
       'Saudi Arabia', 'Senegal', 'Sierra Leone', 'South Africa',
       'South Sudan', 'Sudan', 'Syrian Arab Republic',
       'Tanzania (United Republic of)', 'Togo', 'Tunisia', 'Turkey',
       'Uganda', 'United Arab Emirates', 'Yemen', 'Zambia', 'Zimbabwe',
  

In [49]:
# Renaming countries for consistency with other data 
hdi.replace({'country' : {'Congo (Democratic Republic of the)' : 'Democratic Republic of Congo',
                           "CÃ´te d'Ivoire" : "Cote D'Ivoire",
                           'Iran (Islamic Republic of)' : 'Iran',
                           'Palestine, State of' : 'Palestine',
                           'Syrian Arab Republic' : 'Syria',
                           'Tanzania (United Republic of)' : 'Tanzania',
                           }}, inplace=True)

In [50]:
# Removing these rows I will not use 
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Human Development'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Low human development'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Medium human development'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'High human development'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Very high human development'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Developing Countries'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Regions'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Arab States'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Sub-Saharan Africa'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Least Developed Countries'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Small Island Developing States'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'World'].index)
hdi = hdi.drop(hdi.loc[hdi['country'] == 'Maldives'].index)

In [51]:
# Looking at my new data frame 
hdi.head()

Unnamed: 0,country,year,hdi
0,Algeria,2000,0.646
1,Algeria,2001,0.655
2,Algeria,2002,0.666
3,Algeria,2003,0.676
4,Algeria,2004,0.685


In [52]:
# Saving my data to a csv
hdi.to_csv('./aid_data/human_development/hdi.csv', index=False)