In [32]:
#Import Dependencies
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np

# Loading and Cleaning Data

In [33]:
# Read the master life-expectancy table. `themajordata` is bound to the very
# same DataFrame object (an alias, not a copy), so the in-place Country
# cleanup below is visible through both names; the column rename afterwards
# returns a new frame and rebinds only `thedata`.
file = "resources/life_expectancy.csv"
thedata = pd.read_csv(file)
themajordata = thedata

# Normalise country names: strip spaces, slashes, commas and hyphens,
# lowercase the result, and keep only the text before any "(" qualifier.
country_names = thedata['Country']
for separator in (' ', '/', ',', '-'):
    country_names = country_names.replace(separator, '', regex=True)
thedata['Country'] = country_names.str.lower().str.split('(').str[0]

# Normalise every column label: lowercase with spaces removed.
thedata = thedata.rename(columns=lambda label: label.lower().replace(" ", ""))
thedata.head()

Unnamed: 0,country,year,status,lifeexpectancy,adultmortality,infantdeaths,alcohol,percentageexpenditure,hepatitisb,measles,...,polio,totalexpenditure,diphtheria,hiv/aids,gdp,population,thinness1-19years,thinness5-9years,incomecompositionofresources,schooling
0,afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [34]:
# Year range (inclusive on both ends) used to select columns from the other
# CSV files.
startyear = 2000
endyear = 2015

# Column selector for those files: the "country" key column followed by one
# string label per year, e.g. ["country", "2000", ..., "2015"].
# Built directly instead of via a list comprehension run only for its
# append() side effects, which also produced a throwaway list of None values.
yearlist = ["country"] + [str(year) for year in range(startyear, endyear + 1)]

In [41]:
# Indicator names; each one is both the stem of its CSV under resources/ and
# the label stored as that path's value in `files`.
values = ["fdi", "birth_per_woman", "water", "math4", "gini","out_of_pocket_share","aid_perperson","poverty","electricity","cellphone"]

# Map "resources/<name>.csv" -> "<name>", preserving the order of `values`.
files = {f"resources/{name}.csv": name for name in values}
print(files)

{'resources/fdi.csv': 'fdi', 'resources/birth_per_woman.csv': 'birth_per_woman', 'resources/water.csv': 'water', 'resources/math4.csv': 'math4', 'resources/gini.csv': 'gini', 'resources/out_of_pocket_share.csv': 'out_of_pocket_share', 'resources/aid_perperson.csv': 'aid_perperson', 'resources/poverty.csv': 'poverty', 'resources/electricity.csv': 'electricity', 'resources/cellphone.csv': 'cellphone'}


In [42]:
#Data storage
# Reset on every run so re-executing this cell does not accumulate duplicates.
all_data = []

# Ordered (old, new) substring pairs that rewrite country names from the
# indicator files into the spelling used by the cleaned life-expectancy table.
# They are applied as LITERAL text, in this order: several patterns contain
# "." which, treated as a regex, would match any character (e.g. "st." would
# also hit the "str" in "australia").
_COUNTRY_NAME_FIXES = [
    ("st.", "saint"),
    ("coted'ivoire", "côted'ivoire"),
    (",the", ""),
    # Mis-decoded apostrophe as it appeared in the original pattern ...
    ("korea,dem.peopleâ€™srep.", "democraticpeople'srepublicofkorea"),
    # ... and the same name with the apostrophe decoded correctly.
    ("korea,dem.people’srep.", "democraticpeople'srepublicofkorea"),
    ("egypt,arabrep.", "egypt"),
    ("korea,rep.", "republicofkorea"),
    ("northmacedonia", "theformeryugoslavrepublicofmacedonia"),
    (",fed.sts.", ""),
    (",rb", ""),
    ("tanzania", "unitedrepublicoftanzania"),
    ("yemen,rep.", "yemen"),
    ("laopdr", "laopeople'sdemocraticrepublic"),
    ("unitedstates", "unitedstatesofamerica"),
    ("moldov", "republicofmoldov"),
    ("congo,dem.rep.", "democraticrepublicofthecongo"),
    # Dotted form first so no stray "." is left behind; the undotted form is
    # kept for inputs that lack the trailing dot.
    ("congo,rep.", "congo"),
    ("congo,rep", "congo"),
]


def _harmonize_country_names(names):
    """Return `names` lowercased, with spaces removed and known aliases rewritten.

    Parameters
    ----------
    names : pandas.Series
        Raw country names as read from an indicator file.

    Returns
    -------
    pandas.Series
        A new Series; the input is not modified. Missing values stay missing.
    """
    cleaned = names.str.lower().str.replace(' ', '', regex=False)
    for old, replacement in _COUNTRY_NAME_FIXES:
        # The result must be assigned back: str.replace returns a new Series,
        # so calling it without using the result changes nothing.
        cleaned = cleaned.str.replace(old, replacement, regex=False)
    return cleaned


#File Loop
for key, value in files.items():
    # Only CSV inputs are handled; skip anything else instead of silently
    # reusing the previous iteration's frame.
    if not key.endswith("csv"):
        continue

    try:
        data = pd.read_csv(key)
    except pd.errors.ParserError:
        # Layout with preamble lines before the real header (WorldBank Data).
        data = None
    if data is None or "country" not in data.columns:
        #WorldBank Data
        data = pd.read_csv(key, header = [2])
        data = data.rename(columns={"Country Name":"country"})

    #change country names
    data["country"] = _harmonize_country_names(data["country"])

    # Keep only the requested year columns that this file actually has. This
    # covers files with an incomplete year range without a bare except, never
    # tries to parse non-year column names as integers, and never lets years
    # outside startyear..endyear through.
    columns = ["country"] + [year for year in yearlist[1:] if year in data.columns]
    data = pd.melt(data[columns], id_vars=["country"], value_vars=columns[1:])

    # Long format: one row per (country, year), indicator value in a column
    # named after the indicator.
    new = data.rename(columns={"variable":"year", "value":value})
    new["year"] = new["year"].astype("int64")
    all_data.append(new)

In [43]:
#Other Data
#US Aid Data
# Collapse the US aid records to one total per (country_name, fiscal_year),
# rename the columns to country / year / us_aid to mirror the key columns of
# the life expectancy dataset, and queue the frame at the end of all_data.
# NOTE(review): country_name is not lowercased or stripped here the way the
# other sources' country columns are -- confirm before merging on it.
file = "resources/us_aid.csv"
new = (
    pd.read_csv(file)
    .groupby(['country_name', 'fiscal_year'])['current_amount']
    .sum()
    .reset_index()
    .rename(columns={"country_name":"country", "fiscal_year":"year", "current_amount":"us_aid"})
)
all_data.append(new)

# Merging Data

In [50]:
# Fold the collected frames into a copy of the master table one at a time,
# joining on country and year with merge's default join type.
# NOTE(review): the final entry of all_data is deliberately left out, exactly
# as the original index range len(all_data)-1 did -- confirm that skipping the
# last appended frame is intended.
alldata = thedata.copy()
for frame in all_data[:-1]:
    alldata = alldata.merge(frame, on=["country", "year"])
alldata.head()

Unnamed: 0,country,year,status,lifeexpectancy,adultmortality,infantdeaths,alcohol,percentageexpenditure,hepatitisb,measles,...,fdi_y,birth_per_woman_y,water_y,math4_y,gini_y,out_of_pocket_share_y,aid_perperson_y,poverty_y,electricity_y,cellphone
0,algeria,2007,Developing,73.8,129.0,20,0.44,320.323924,9.0,0,...,1.249631,2.66,91.7,378.0,,21.5,11.561165,,532.0,27562721.0
1,algeria,2006,Developing,73.4,132.0,20,0.36,270.240196,8.0,944,...,1.573088,2.58,91.4,,,24.3,7.154365,,504.0,20997954.0
2,algeria,2005,Developing,72.9,136.0,19,0.5,2.548923,83.0,2302,...,1.120172,2.5,91.2,,,26.4,10.479122,,533.0,13661355.0
3,algeria,2004,Developing,72.3,14.0,19,0.45,220.393699,81.0,3289,...,1.033522,2.44,90.9,,,27.4,9.695596,,466.0,4882414.0
4,algeria,2003,Developing,71.7,146.0,20,0.34,25.018523,,15374,...,0.939909,2.41,90.7,,,20.3,7.41504,,452.0,1446927.0


In [51]:
# Create master CSV: write the merged table to resources/all_data.csv.
# to_csv is called with its defaults, so the DataFrame index is written out as
# an extra, unnamed first column.
alldata.to_csv('resources/all_data.csv')