In [1]:
#Import Dependencies
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np

# Loading and Cleaning Data

In [2]:
#loading the master file and variable imports
file = "resources/life_expectancy.csv"
thedata = pd.read_csv(file)
themajordata = thedata
#lowercase and remove spaces/parenthesis from country names
thedata['Country'] = thedata['Country'].replace(' ','', regex=True).\
                    replace('/','', regex=True).replace(',','', regex=True).replace('-','', regex=True).\
                    str.lower().str.split('(').str[0]

#rename columns
for x in range(len(thedata.columns)):
    thedata = thedata.rename(columns={thedata.columns[x]:thedata.columns[x].title().replace("  "," ")})
    if thedata.columns[x][0]==" ":
        thedata = thedata.rename(columns={thedata.columns[x]:thedata.columns[x][1:]})
    if thedata.columns[x][-1]==" ":
        thedata = thedata.rename(columns={thedata.columns[x]:thedata.columns[x][:-1]})
thedata.columns

Index(['Country', 'Year', 'Status', 'Life Expectancy', 'Adult Mortality',
       'Infant Deaths', 'Alcohol', 'Percentage Expenditure', 'Hepatitis B',
       'Measles', 'Bmi', 'Under-Five Deaths', 'Polio', 'Total Expenditure',
       'Diphtheria', 'Hiv/Aids', 'Gdp', 'Population', 'Thinness 1-19 Years',
       'Thinness 5-9 Years', 'Income Composition Of Resources', 'Schooling'],
      dtype='object')

In [3]:
#Create the Data Range selection for the other csv files
startyear=2000
endyear=2015
yearlist = ["Country"]
yearlistloop = [yearlist.append(str(x)) for x in range(startyear, endyear+1)]

In [4]:
#Create the list of csvs to loop through
files = {}
values = ["fdi", "birth_per_woman",  "water","military","cellphone"]
#values = ["fdi", "birth_per_woman", "roads", "water","sanitation", "roads","tax","military","democracy","armsimport", "armsexport",  "middleincome","math4", "gini","out_of_pocket_share","aid_perperson","poverty","electricity","cellphone"]
for i in range(len(values)):
     files[f"resources/{values[i]}.csv"] = values[i].title().replace("_"," ")
print(files)

{'resources/fdi.csv': 'Fdi', 'resources/birth_per_woman.csv': 'Birth Per Woman', 'resources/water.csv': 'Water', 'resources/military.csv': 'Military', 'resources/cellphone.csv': 'Cellphone'}


In [5]:
#Data storage
all_data=[]
listcountry= []
#File Loop
for key,value in files.items():
    if key[-3:]=="csv":
        try:
            data = pd.read_csv(key)
            data = data.rename(columns={"country":"Country"})
        except:
            #WorldBank Data
            data = pd.read_csv(key, header = [2])
            data = data.rename(columns={"Country Name":"Country"})
    data["Country"] = data["Country"].str.lower().str.replace(' ', '', regex=True)
    #change country names
    data["Country"].str.replace('st.', 'saint', regex=True)
    data["Country"].str.replace("coted'ivoire", "côted'ivoire", regex=True)
    data["Country"].str.replace(",the", '', regex=True)
    data["Country"].str.replace("korea,dem.peopleâ€™srep.", "democraticpeople'srepublicofkorea", regex=True)
    data["Country"].str.replace("russia", "russianfederation", regex=True)
    data["Country"].str.replace('egypt,arabrep.', 'egypt', regex=True)
    data["Country"].str.replace('korea,rep.', 'republicofkorea', regex=True)
    data["Country"].str.replace('northmacedonia', 'theformeryugoslavrepublicofmacedonia', regex=True)
    data["Country"].str.replace(',fed.sts.', '', regex=True)
    data["Country"].str.replace(',rb', '', regex=True)
    data["Country"].str.replace('tanzania', 'unitedrepublicoftanzania', regex=True)
    data["Country"].str.replace('yemen,rep.', 'yemen', regex=True)
    data["Country"].str.replace('laopdr', "laopeople'sdemocraticrepublic", regex=True)
    data["Country"].str.replace('unitedstates', 'unitedstatesofamerica', regex=True)
    data["Country"].str.replace('moldov', 'republicofmoldov', regex=True)
    data["Country"].str.replace('congo,dem.rep.', 'democraticrepublicofthecongo', regex=True)
    data["Country"].str.replace('congo,rep', 'congo', regex=True)
    listcountry.append(data["Country"].unique())
    try:
        data = pd.melt(data[yearlist], id_vars=["Country"], value_vars=yearlist[1:])
    except:
        #data incomplete for the year range
        columns = ["Country"]
        for col in data.columns[1:]:
            if int(col)>= startyear:
                columns.append(col)
        data = pd.melt(data[columns], id_vars=["Country"], value_vars=columns[1:])
    new = data.rename(columns={"variable":"Year", "value":value})
    new["Year"] = new["Year"].astype({'Year': 'int64'})
    all_data.append(new)

# Merging Data

In [8]:
#looping list to merge all the data
alldata = thedata.copy()
for x in range(len(all_data)-1): 
    alldata = alldata.merge(all_data[x], on=["Country","Year"], how='left')
alldata = pd.get_dummies(alldata, columns=["Status"])
alldata = alldata.drop(columns=['Status_Developing',"Income Composition Of Resources"])
alldata = alldata.rename(columns={'Status_Developed':'Status Developed'})
alldata.columns

Index(['Country', 'Year', 'Life Expectancy', 'Adult Mortality',
       'Infant Deaths', 'Alcohol', 'Percentage Expenditure', 'Hepatitis B',
       'Measles', 'Bmi', 'Under-Five Deaths', 'Polio', 'Total Expenditure',
       'Diphtheria', 'Hiv/Aids', 'Gdp', 'Population', 'Thinness 1-19 Years',
       'Thinness 5-9 Years', 'Schooling', 'Fdi', 'Birth Per Woman', 'Water',
       'Military', 'Status Developed'],
      dtype='object')

In [9]:
alldata.count()

Country                   2938
Year                      2938
Life Expectancy           2928
Adult Mortality           2928
Infant Deaths             2938
Alcohol                   2744
Percentage Expenditure    2938
Hepatitis B               2385
Measles                   2938
Bmi                       2904
Under-Five Deaths         2938
Polio                     2919
Total Expenditure         2712
Diphtheria                2919
Hiv/Aids                  2938
Gdp                       2490
Population                2286
Thinness 1-19 Years       2904
Thinness 5-9 Years        2904
Schooling                 2775
Fdi                       2439
Birth Per Woman           2560
Water                     2544
Military                  2086
Status Developed          2938
dtype: int64

In [10]:
#Create master CSV
alldata.to_csv('resources/all_data.csv')
alldatafinal = alldata.dropna(how="any")
alldatafinal.to_csv('resources/all_data_final.csv')