In [1]:
#Import Dependencies
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np

# Loading and Cleaning Data

In [2]:
#loading the master file and variable imports
file = "resources/life_expectancy.csv"
thedata = pd.read_csv(file)
themajordata = thedata
#lowercase and remove spaces/parenthesis from country names
thedata['Country'] = thedata['Country'].replace(' ','', regex=True).\
                    replace('/','', regex=True).replace(',','', regex=True).replace('-','', regex=True).\
                    str.lower().str.split('(').str[0]

#rename columns
for x in range(len(thedata.columns)):
    thedata = thedata.rename(columns={thedata.columns[x]:thedata.columns[x].lower().replace(" ","")})
thedata.head()

Unnamed: 0,country,year,status,lifeexpectancy,adultmortality,infantdeaths,alcohol,percentageexpenditure,hepatitisb,measles,...,polio,totalexpenditure,diphtheria,hiv/aids,gdp,population,thinness1-19years,thinness5-9years,incomecompositionofresources,schooling
0,afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
#Create the Data Range selection for the other csv files
startyear=2000
endyear=2015
yearlist = ["country"]
yearlistloop = [yearlist.append(str(x)) for x in range(startyear, endyear+1)]

In [4]:
#Create the list of csvs to loop through
files = {}
values = ["fdi", "birth_per_woman",  "water", "roads","military","cellphone"]
#values = ["fdi", "birth_per_woman", "roads", "water","sanitation", "roads","tax","military","democracy","armsimport", "armsexport",  "middleincome","math4", "gini","out_of_pocket_share","aid_perperson","poverty","electricity","cellphone"]
for i in range(len(values)):
     files[f"resources/{values[i]}.csv"] = values[i]
print(files)

{'resources/fdi.csv': 'fdi', 'resources/birth_per_woman.csv': 'birth_per_woman', 'resources/water.csv': 'water', 'resources/roads.csv': 'roads', 'resources/military.csv': 'military', 'resources/cellphone.csv': 'cellphone'}


In [5]:
#Data storage
all_data=[]
listcountry= []
#File Loop
for key,value in files.items():
    if key[-3:]=="csv":
        try:
            data = pd.read_csv(key)
        except:
            #WorldBank Data
            data = pd.read_csv(key, header = [2])
            data = data.rename(columns={"Country Name":"country"})

    data["country"] = data["country"].str.lower().str.replace(' ', '', regex=True)
    #change country names
    data["country"].str.replace('st.', 'saint', regex=True)
    data["country"].str.replace("coted'ivoire", "côted'ivoire", regex=True)
    data["country"].str.replace(",the", '', regex=True)
    data["country"].str.replace("korea,dem.peopleâ€™srep.", "democraticpeople'srepublicofkorea", regex=True)
    data["country"].str.replace('egypt,arabrep.', 'egypt', regex=True)
    data["country"].str.replace('korea,rep.', 'republicofkorea', regex=True)
    data["country"].str.replace('northmacedonia', 'theformeryugoslavrepublicofmacedonia', regex=True)
    data["country"].str.replace(',fed.sts.', '', regex=True)
    data["country"].str.replace(',rb', '', regex=True)
    data["country"].str.replace('tanzania', 'unitedrepublicoftanzania', regex=True)
    data["country"].str.replace('yemen,rep.', 'yemen', regex=True)
    data["country"].str.replace('laopdr', "laopeople'sdemocraticrepublic", regex=True)
    data["country"].str.replace('unitedstates', 'unitedstatesofamerica', regex=True)
    data["country"].str.replace('moldov', 'republicofmoldov', regex=True)
    data["country"].str.replace('congo,dem.rep.', 'democraticrepublicofthecongo', regex=True)
    data["country"].str.replace('congo,rep', 'congo', regex=True)
    listcountry.append(data["country"].unique())
    try:
        data = pd.melt(data[yearlist], id_vars=["country"], value_vars=yearlist[1:])
    except:
        #data incomplete for the year range
        columns = ["country"]
        for col in data.columns[1:]:
            if int(col)>= startyear:
                columns.append(col)
        data = pd.melt(data[columns], id_vars=["country"], value_vars=columns[1:])
    new = data.rename(columns={"variable":"year", "value":value})
    new["year"] = new["year"].astype({'year': 'int64'})
    all_data.append(new)
    new.head()

In [6]:
#Other Data
#US Aid Data
file = "resources/us_aid.csv"
new = pd.read_csv(file)
#group the us_aid data by country and year to match the life expactancy dataset
new = new.groupby(['country_name', 'fiscal_year'])['current_amount'].sum()
new = new.reset_index()
new = new.rename(columns={"country_name":"country", "fiscal_year":"year", "current_amount":"us_aid"})
new.head()
all_data.append(new)

# Merging Data

In [7]:
#looping list to merge all the data
alldata = thedata.copy()
for x in range(len(all_data)-1): 
    alldata = alldata.merge(all_data[x], on=["country","year"], how='left')

alldata = pd.get_dummies(alldata, columns=["status"])
alldata = alldata.drop(columns=['status_Developing'])
alldata.head() 

Unnamed: 0,country,year,lifeexpectancy,adultmortality,infantdeaths,alcohol,percentageexpenditure,hepatitisb,measles,bmi,...,thinness5-9years,incomecompositionofresources,schooling,fdi,birth_per_woman,water,roads,military,cellphone,status_Developed
0,afghanistan,2015,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,...,17.3,0.479,10.1,0.849679,4.8,61.5,,0.993455,19709038.0,0
1,afghanistan,2014,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,...,17.5,0.476,10.0,0.20979,4.98,58.8,,1.298013,18407168.0,0
2,afghanistan,2013,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,...,17.7,0.47,9.9,0.234965,5.17,56.2,,1.07695,16807156.0,0
3,afghanistan,2012,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,...,18.0,0.463,9.8,0.284095,5.38,53.5,,1.175417,15340115.0,0
4,afghanistan,2011,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,...,18.2,0.454,9.5,0.293039,5.6,50.9,,1.821346,13797879.0,0


In [8]:
alldata.count()

country                         2938
year                            2938
lifeexpectancy                  2928
adultmortality                  2928
infantdeaths                    2938
alcohol                         2744
percentageexpenditure           2938
hepatitisb                      2385
measles                         2938
bmi                             2904
under-fivedeaths                2938
polio                           2919
totalexpenditure                2712
diphtheria                      2919
hiv/aids                        2938
gdp                             2490
population                      2286
thinness1-19years               2904
thinness5-9years                2904
incomecompositionofresources    2771
schooling                       2775
fdi                             2439
birth_per_woman                 2560
water                           2544
roads                            606
military                        2086
cellphone                       2492
s

In [9]:
#Create master CSV
alldata.to_csv('resources/all_data.csv')
alldata.dropna(how="any").to_csv('resources/all_data_final.csv')

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [11]:
#Perform Regression on every variable

results ={}
for value in alldata.columns[3:]:
    #Choose columns
    fewdata = alldata[["country","year","lifeexpectancy",value]].copy().dropna(how="any")
    X = fewdata[value].values.reshape(-1, 1)
    y =fewdata["lifeexpectancy"].values.reshape(-1, 1)
    print(X.shape, y.shape)
    
    #Split Data into Train and Test
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model = LinearRegression()

    model.fit(X_train, y_train)
    training_score = model.score(X_train, y_train)
    testing_score = model.score(X_test, y_test)
    

    model.fit(X_train, y_train)
    training_score = model.score(X_train, y_train)
    testing_score = model.score(X_test, y_test)

    
    results[value] = {"Training Score":training_score,"Testing Score": testing_score, "Coefficent": model.coef_[0][0], "y intercept": model.intercept_[0]}
    print(f"{value.title()} Training Score: {training_score}")
    print(f"{value.title()} Testing Score: {testing_score}")
    print('Weight coefficients: ', model.coef_)
    print('y-axis intercept: ', model.intercept_) 

(2928, 1) (2928, 1)
Adultmortality Training Score: 0.47032501692888096
Adultmortality Testing Score: 0.5266710454005155
Weight coefficients:  [[-0.05294443]]
y-axis intercept:  [77.9167821]
(2928, 1) (2928, 1)
Infantdeaths Training Score: 0.03798116719483913
Infantdeaths Testing Score: 0.03984536410629402
Weight coefficients:  [[-0.01574837]]
y-axis intercept:  [69.63364861]
(2735, 1) (2735, 1)
Alcohol Training Score: 0.16296320546563225
Alcohol Testing Score: 0.15950323547734543
Weight coefficients:  [[0.97100642]]
y-axis intercept:  [64.47660515]
(2928, 1) (2928, 1)
Percentageexpenditure Training Score: 0.15120324726049716
Percentageexpenditure Testing Score: 0.12764719151795845
Weight coefficients:  [[0.00177727]]
y-axis intercept:  [67.82526716]
(2375, 1) (2375, 1)
Hepatitisb Training Score: 0.06895449609230975
Hepatitisb Testing Score: 0.05490031528543782
Weight coefficients:  [[0.089666]]
y-axis intercept:  [62.58740817]
(2928, 1) (2928, 1)
Measles Training Score: 0.0281017330130

In [12]:
scores = pd.DataFrame(results)
scores.transpose().sort_values(by=['Testing Score'], ascending=False)

Unnamed: 0,Training Score,Testing Score,Coefficent,y intercept
birth_per_woman,0.665854,0.703738,-5.028906,84.72992
water,0.680048,0.664555,0.4100001,35.240076
schooling,0.567887,0.556171,2.142702,43.58324
adultmortality,0.470325,0.526671,-0.05294443,77.916782
incomecompositionofresources,0.525356,0.524763,32.49864,48.93085
hiv/aids,0.306189,0.316904,-1.006561,70.881937
bmi,0.328057,0.304017,0.2732487,58.86216
status_Developed,0.228417,0.244501,12.0798,67.035044
thinness1-19years,0.224045,0.237092,-0.9999149,74.175271
roads,0.309013,0.231209,0.1548392,62.220588
