In [72]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np

In [73]:
# Read the merged life-expectancy dataset (path is relative to the notebook).
csv_path = 'resources/life_expect_merged.csv'
all_variables = pd.read_csv(csv_path)

# Preview the first rows to check which columns were loaded.
all_variables.head()

Unnamed: 0.1,Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,...,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,current_amount,pocket_health_spend
0,0,afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,...,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,17235710000.0,
1,1,afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,...,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,14333710000.0,
2,2,afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,...,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,19699100000.0,
3,3,afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,...,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,23204390000.0,
4,4,afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,...,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,24866610000.0,


In [74]:
# Target ('Life expectancy') plus the candidate predictors kept for modelling.
cols = [
    'Life expectancy',
    'Status',
    'percentage expenditure',
    'GDP',
    'Schooling',
    'pocket_health_spend',
    'Income composition of resources',
]

# Build the modelling frame in one pass, without mutating anything in place:
#   1. keep only the selected columns,
#   2. one-hot encode the categorical 'Status' column,
#   3. drop the 'Status_Developing' indicator so only 'Status_Developed' remains,
#   4. discard every row that still contains a NaN.
model_vars = (
    pd.get_dummies(all_variables[cols], columns=["Status"])
    .drop(columns=['Status_Developing'])
    .dropna(how='any')
)

model_vars.head()

Unnamed: 0,Life expectancy,percentage expenditure,GDP,Schooling,pocket_health_spend,Income composition of resources,Status_Developed
5,58.8,79.679367,553.32894,9.2,83.0,0.448,0
6,58.6,56.762217,445.893298,8.9,83.1,0.434,0
7,58.1,25.873925,373.361116,8.7,82.9,0.433,0
8,57.5,10.910156,369.835796,8.4,82.6,0.415,0
9,57.3,17.171518,272.56377,8.1,82.9,0.405,0


# Random Forest

In [75]:
# Predictor columns, in the order they are fed to the model.
predictor_cols = [
    'Status_Developed',
    'percentage expenditure',
    'GDP',
    'Schooling',
    'pocket_health_spend',
    'Income composition of resources',
]

# Feature matrix and regression target taken from the prepared frame.
X = model_vars.loc[:, predictor_cols]
y = model_vars.loc[:, 'Life expectancy']

# Print both shapes to confirm the row counts agree.
print(X.shape, y.shape)

(1293, 6) (1293,)


In [76]:
# Partition the data into train and test subsets. No test_size is passed, so
# scikit-learn's default split proportion applies; the fixed random_state
# keeps the partition the same on every run.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [77]:
# Fit a 200-tree random forest regressor on the training split.
# random_state is pinned (same seed value as the train/test split) so the
# forest's internal randomness is fixed; without it the score below and the
# feature importances change on every Restart & Run All.
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)  # fit() returns the estimator itself, so no reassignment is needed

# Evaluate on the held-out test split (for a regressor, score() reports R^2).
rf.score(X_test, y_test)

0.8952033978760823

In [78]:
# Pair each fitted importance with its column name, then order the pairs
# from most to least important.
feature_names = X.columns
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.8035292008171484, 'Income composition of resources'),
 (0.06207490658004745, 'pocket_health_spend'),
 (0.05328038031972832, 'GDP'),
 (0.04699744774080403, 'Schooling'),
 (0.033684983983891754, 'percentage expenditure'),
 (0.0004330805583799835, 'Status_Developed')]