# Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import scipy
from scipy.stats import norm
from scipy.stats import t
from scipy.stats import f
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn import metrics
from datetime import datetime
from sklearn.linear_model import Lasso, Ridge, LinearRegression
import pickle

# Let wide/long frames render in full instead of being truncated by pandas.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases reject the old name, so use whichever one this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Clean/Impute/Drop

In [51]:
# Load the raw data; `df` is kept untouched so the cleaning below can be re-run.
df = pd.read_csv('life.csv')

# Dropping and Imputing
df2 = df.drop(columns=['Hepatitis B'])

## Imputing
# Fill the gaps in these columns with the column median so that their rows
# survive the dropna() further down.
median_imputed = ['Population', 'GDP', 'Schooling', 'Total expenditure',
                  'Income composition of resources', 'Alcohol']
for col in median_imputed:
    df2[col] = df2[col].fillna(df2[col].median())

## Dropping
# Remove every row that still has a missing value in any remaining column;
# this also covers rows without a 'Life expectancy ' value.
df2 = df2.dropna()
assert not df2.isna().any().any(), 'missing values left after cleaning'

# Correlate the numeric columns only: the text columns cannot be correlated and
# make DataFrame.corr() raise on pandas >= 2.0.
corr = df2.select_dtypes(include='number').corr()

# Normalise the column names: lower-case, trimmed, with ' ', '-' and '/' -> '_'.
renamed = [name.lower().strip().replace(' ', '_').replace('-', '_').replace('/', '_')
           for name in df2.columns]
rename_dict = dict(zip(df2.columns, renamed))
df2 = df2.rename(columns=rename_dict)

# Abbreviate the longest normalised names.
shorten = {'life_expectancy': 'lifex',
           'percentage_expenditure': 'perc_expend',
           'total_expenditure': 'tot_expend',
           'population': 'pop',
           'income_composition_of_resources': 'income_comp'}
df2 = df2.rename(columns=shorten)

# Model with Dummies (Best Model)

In [54]:
# Predictor frame without the target and the identifier-like columns.
# NOTE(review): `features` is reassigned in In [61] before this value is read
# in the cells visible here — confirm whether this cell is still needed.
features = df2.drop(['lifex', 'country', 'year', 'status'], axis='columns')

In [55]:
# Regression target: the `lifex` column of the cleaned frame.
target = df2['lifex']

In [56]:
# One indicator column per country. dtype=int keeps the columns as 0/1 integers:
# pandas >= 2.0 would otherwise produce booleans, which the statsmodels formula
# interface treats as categorical terms rather than plain numeric regressors.
cou_dum = pd.get_dummies(df2['country'], dtype=int)

In [57]:
# Put the country indicator columns next to the cleaned data (aligned on the index).
df3 = pd.concat((df2, cou_dum), axis=1)

In [58]:
# Encode status as 0 for 'Developing' and 1 otherwise.
# The labels are read from df2 (which keeps the original strings and shares df3's
# index) so that re-running this cell gives the same result instead of turning
# every row into 1.
df3['status'] = np.where(df2['status'] == 'Developing', 0, 1)

In [59]:
# Drop the raw country labels and the 'Angola' indicator.
# NOTE(review): presumably Angola is left out to serve as the baseline category — confirm.
df3 = df3.drop(columns=['country', 'Angola'])

In [60]:
# Display the modelling frame: the cleaned columns followed by one indicator column per country.
df3

Unnamed: 0,year,status,lifex,adult_mortality,infant_deaths,alcohol,perc_expend,measles,bmi,under_five_deaths,polio,tot_expend,diphtheria,hiv_aids,gdp,pop,thinness__1_19_years,thinness_5_9_years,income_comp,schooling,Afghanistan,Albania,Algeria,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia (Plurinational State of),Bosnia and Herzegovina,Botswana,Brazil,Brunei Darussalam,Bulgaria,Burkina Faso,Burundi,Cabo Verde,Cambodia,Cameroon,Canada,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Croatia,Cuba,Cyprus,Czechia,Côte d'Ivoire,Democratic People's Republic of Korea,Democratic Republic of the Congo,Denmark,Djibouti,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Eritrea,Estonia,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hungary,Iceland,India,Indonesia,Iran (Islamic Republic of),Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kuwait,Kyrgyzstan,Lao People's Democratic Republic,Latvia,Lebanon,Lesotho,Liberia,Libya,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Mauritania,Mauritius,Mexico,Micronesia (Federated States of),Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,Norway,Oman,Pakistan,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Republic of Korea,Republic of Moldova,Romania,Russian Federation,Rwanda,Saint Lucia,Saint Vincent and the Grenadines,Samoa,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,Spain,Sri Lanka,Suriname,Swaziland,Sweden,Switzerland,Syrian Arab Republic,Tajikistan,Thailand,The former Yugoslav republic of Macedonia,Timor-Leste,Togo,Tonga,Trinidad and Tobago,Tunisia,Turkey,Turkmenistan,Uganda,Ukraine,United 
Arab Emirates,United Kingdom of Great Britain and Northern Ireland,United Republic of Tanzania,United States of America,Uruguay,Uzbekistan,Vanuatu,Venezuela (Bolivarian Republic of),Viet Nam,Yemen,Zambia,Zimbabwe
0,2015,0,65.0,263.0,62,0.01,71.279624,1154,19.1,83,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2014,0,59.9,271.0,64,0.01,73.523582,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2013,0,59.9,268.0,66,0.01,73.219243,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2012,0,59.5,272.0,69,0.01,78.184215,2787,17.6,93,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2011,0,59.2,275.0,71,0.01,7.097109,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,2004,0,44.3,723.0,27,4.36,0.000000,31,27.1,42,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2934,2003,0,44.5,715.0,26,4.06,0.000000,998,26.7,41,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2935,2002,0,44.8,73.0,25,4.43,0.000000,304,26.3,40,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2936,2001,0,45.3,686.0,25,1.72,0.000000,529,25.9,39,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [61]:
# Predictors = every column of the modelling frame except the target.
features = df3.drop('lifex', axis='columns')

In [62]:
# Build formula-safe column names: lower-case, trimmed, and with spaces, hyphens,
# slashes, parentheses and apostrophes each replaced by an underscore.
_to_underscore = str.maketrans({ch: '_' for ch in " -/()'"})
renamed = [col.lower().strip().translate(_to_underscore) for col in features.columns]

In [63]:
# Map each current column name to its formula-safe replacement.
rename_dict = {old: new for old, new in zip(features.columns, renamed)}

In [64]:
# Inspect the old-name -> new-name mapping before applying it.
rename_dict

{'year': 'year',
 'status': 'status',
 'adult_mortality': 'adult_mortality',
 'infant_deaths': 'infant_deaths',
 'alcohol': 'alcohol',
 'perc_expend': 'perc_expend',
 'measles': 'measles',
 'bmi': 'bmi',
 'under_five_deaths': 'under_five_deaths',
 'polio': 'polio',
 'tot_expend': 'tot_expend',
 'diphtheria': 'diphtheria',
 'hiv_aids': 'hiv_aids',
 'gdp': 'gdp',
 'pop': 'pop',
 'thinness__1_19_years': 'thinness__1_19_years',
 'thinness_5_9_years': 'thinness_5_9_years',
 'income_comp': 'income_comp',
 'schooling': 'schooling',
 'Afghanistan': 'afghanistan',
 'Albania': 'albania',
 'Algeria': 'algeria',
 'Antigua and Barbuda': 'antigua_and_barbuda',
 'Argentina': 'argentina',
 'Armenia': 'armenia',
 'Australia': 'australia',
 'Austria': 'austria',
 'Azerbaijan': 'azerbaijan',
 'Bahamas': 'bahamas',
 'Bahrain': 'bahrain',
 'Bangladesh': 'bangladesh',
 'Barbados': 'barbados',
 'Belarus': 'belarus',
 'Belgium': 'belgium',
 'Belize': 'belize',
 'Benin': 'benin',
 'Bhutan': 'bhutan',
 'Bolivia

In [65]:
# Apply the formula-safe names; rebinding instead of mutating in place.
features = features.rename(columns=rename_dict)

In [66]:
# Abbreviations for the longest column names.
shorten = dict(
    life_expectancy='lifex',
    percentage_expenditure='perc_expend',
    total_expenditure='tot_expend',
    population='pop',
    income_composition_of_resources='income_comp',
)

In [67]:
# Apply the abbreviations. The mapping printed by In [64] shows these columns
# already carry their short names, so this leaves the frame unchanged here.
features = features.rename(columns=shorten)

In [68]:
# Display the predictor frame after renaming.
features

Unnamed: 0,year,status,adult_mortality,infant_deaths,alcohol,perc_expend,measles,bmi,under_five_deaths,polio,tot_expend,diphtheria,hiv_aids,gdp,pop,thinness__1_19_years,thinness_5_9_years,income_comp,schooling,afghanistan,albania,algeria,antigua_and_barbuda,argentina,armenia,australia,austria,azerbaijan,bahamas,bahrain,bangladesh,barbados,belarus,belgium,belize,benin,bhutan,bolivia__plurinational_state_of_,bosnia_and_herzegovina,botswana,brazil,brunei_darussalam,bulgaria,burkina_faso,burundi,cabo_verde,cambodia,cameroon,canada,central_african_republic,chad,chile,china,colombia,comoros,congo,costa_rica,croatia,cuba,cyprus,czechia,côte_d_ivoire,democratic_people_s_republic_of_korea,democratic_republic_of_the_congo,denmark,djibouti,dominican_republic,ecuador,egypt,el_salvador,equatorial_guinea,eritrea,estonia,ethiopia,fiji,finland,france,gabon,gambia,georgia,germany,ghana,greece,grenada,guatemala,guinea,guinea_bissau,guyana,haiti,honduras,hungary,iceland,india,indonesia,iran__islamic_republic_of_,iraq,ireland,israel,italy,jamaica,japan,jordan,kazakhstan,kenya,kiribati,kuwait,kyrgyzstan,lao_people_s_democratic_republic,latvia,lebanon,lesotho,liberia,libya,lithuania,luxembourg,madagascar,malawi,malaysia,maldives,mali,malta,mauritania,mauritius,mexico,micronesia__federated_states_of_,mongolia,montenegro,morocco,mozambique,myanmar,namibia,nepal,netherlands,new_zealand,nicaragua,niger,nigeria,norway,oman,pakistan,panama,papua_new_guinea,paraguay,peru,philippines,poland,portugal,qatar,republic_of_korea,republic_of_moldova,romania,russian_federation,rwanda,saint_lucia,saint_vincent_and_the_grenadines,samoa,sao_tome_and_principe,saudi_arabia,senegal,serbia,seychelles,sierra_leone,singapore,slovakia,slovenia,solomon_islands,somalia,south_africa,spain,sri_lanka,suriname,swaziland,sweden,switzerland,syrian_arab_republic,tajikistan,thailand,the_former_yugoslav_republic_of_macedonia,timor_leste,togo,tonga,trinidad_and_tobago,tunisia,turkey,turkmenistan,uganda,ukraine,united_arab_em
irates,united_kingdom_of_great_britain_and_northern_ireland,united_republic_of_tanzania,united_states_of_america,uruguay,uzbekistan,vanuatu,venezuela__bolivarian_republic_of_,viet_nam,yemen,zambia,zimbabwe
0,2015,0,263.0,62,0.01,71.279624,1154,19.1,83,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2014,0,271.0,64,0.01,73.523582,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2013,0,268.0,66,0.01,73.219243,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2012,0,272.0,69,0.01,78.184215,2787,17.6,93,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2011,0,275.0,71,0.01,7.097109,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,2004,0,723.0,27,4.36,0.000000,31,27.1,42,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2934,2003,0,715.0,26,4.06,0.000000,998,26.7,41,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2935,2002,0,73.0,25,4.43,0.000000,304,26.3,40,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2936,2001,0,686.0,25,1.72,0.000000,529,25.9,39,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [69]:
# Independent copy of the predictors, used to build the frame given to the formula API.
features_with_life = features.copy(deep=True)

In [70]:
# Attach the target as a `lifex` column so that formulas can refer to it by name.
features_with_life = features_with_life.assign(lifex=df3['lifex'])

In [71]:
# Recursive feature elimination around a plain linear regression, keeping 199 columns.
# NOTE(review): 199 is hard-coded; check it against the number of columns in `features`.
linreg = LinearRegression()
selector = RFE(estimator=linreg, n_features_to_select=199).fit(features, target)

In [72]:
# Pair every column name with its RFE rank and keep the rank-1 (selected) columns.
zipped_rankings = list(zip(selector.ranking_, features))
best_predictors_dum = [column for rank, column in zipped_rankings if rank == 1]

In [73]:
# Right-hand side of the regression formula: the selected column names joined by '+'.
formula = '+'.join(best_predictors_dum)

In [74]:
# List the columns that RFE kept.
best_predictors_dum

['year',
 'status',
 'adult_mortality',
 'infant_deaths',
 'alcohol',
 'perc_expend',
 'measles',
 'bmi',
 'under_five_deaths',
 'polio',
 'tot_expend',
 'diphtheria',
 'hiv_aids',
 'gdp',
 'pop',
 'thinness__1_19_years',
 'thinness_5_9_years',
 'income_comp',
 'schooling',
 'afghanistan',
 'albania',
 'algeria',
 'antigua_and_barbuda',
 'argentina',
 'armenia',
 'australia',
 'austria',
 'azerbaijan',
 'bahamas',
 'bahrain',
 'bangladesh',
 'barbados',
 'belarus',
 'belgium',
 'belize',
 'benin',
 'bhutan',
 'bolivia__plurinational_state_of_',
 'bosnia_and_herzegovina',
 'botswana',
 'brazil',
 'brunei_darussalam',
 'bulgaria',
 'burkina_faso',
 'burundi',
 'cabo_verde',
 'cambodia',
 'cameroon',
 'canada',
 'central_african_republic',
 'chad',
 'chile',
 'china',
 'colombia',
 'comoros',
 'congo',
 'costa_rica',
 'croatia',
 'cuba',
 'cyprus',
 'czechia',
 'côte_d_ivoire',
 'democratic_people_s_republic_of_korea',
 'democratic_republic_of_the_congo',
 'denmark',
 'djibouti',
 'domini

#### Train/test split on the dummy-variable model

In [75]:
# Show the assembled right-hand side of the formula.
formula

'year+status+adult_mortality+infant_deaths+alcohol+perc_expend+measles+bmi+under_five_deaths+polio+tot_expend+diphtheria+hiv_aids+gdp+pop+thinness__1_19_years+thinness_5_9_years+income_comp+schooling+afghanistan+albania+algeria+antigua_and_barbuda+argentina+armenia+australia+austria+azerbaijan+bahamas+bahrain+bangladesh+barbados+belarus+belgium+belize+benin+bhutan+bolivia__plurinational_state_of_+bosnia_and_herzegovina+botswana+brazil+brunei_darussalam+bulgaria+burkina_faso+burundi+cabo_verde+cambodia+cameroon+canada+central_african_republic+chad+chile+china+colombia+comoros+congo+costa_rica+croatia+cuba+cyprus+czechia+côte_d_ivoire+democratic_people_s_republic_of_korea+democratic_republic_of_the_congo+denmark+djibouti+dominican_republic+ecuador+egypt+el_salvador+equatorial_guinea+eritrea+estonia+ethiopia+fiji+finland+france+gabon+gambia+georgia+germany+ghana+greece+grenada+guatemala+guinea+guinea_bissau+guyana+haiti+honduras+hungary+iceland+india+indonesia+iran__islamic_republic_of_+i

In [76]:
# Fit an ordinary-least-squares model of lifex on the selected predictors.
ols_spec = ols(formula='lifex~' + formula, data=features_with_life)
model = ols_spec.fit()

In [77]:
# Show the statsmodels fit summary.
# NOTE(review): the condition number reported below (~3e21) points to near-perfect
# collinearity among the predictors; `status` presumably never varies within a
# country and would then be confounded with the country indicators — confirm
# before interpreting individual coefficients.
model.summary()

0,1,2,3
Dep. Variable:,lifex,R-squared:,0.964
Model:,OLS,Adj. R-squared:,0.961
Method:,Least Squares,F-statistic:,361.2
Date:,"Mon, 24 Feb 2020",Prob (F-statistic):,0.0
Time:,14:21:55,Log-Likelihood:,-5806.9
No. Observations:,2888,AIC:,12010.0
Df Residuals:,2689,BIC:,13200.0
Df Model:,198,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-468.4540,21.375,-21.916,0.000,-510.368,-426.540
year,0.2584,0.011,24.088,0.000,0.237,0.279
status,25.9201,0.611,42.399,0.000,24.721,27.119
adult_mortality,-0.0020,0.000,-4.398,0.000,-0.003,-0.001
infant_deaths,0.0855,0.011,7.728,0.000,0.064,0.107
alcohol,-0.0577,0.023,-2.489,0.013,-0.103,-0.012
perc_expend,0.0001,5e-05,2.731,0.006,3.85e-05,0.000
measles,-1.164e-05,4.31e-06,-2.701,0.007,-2.01e-05,-3.19e-06
bmi,-0.0034,0.003,-1.167,0.243,-0.009,0.002

0,1,2,3
Omnibus:,777.386,Durbin-Watson:,1.135
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20132.364
Skew:,0.689,Prob(JB):,0.0
Kurtosis:,15.861,Cond. No.,3.04e+21


In [78]:
# Restrict the predictor frame to the columns RFE selected.
features_dum = features.loc[:, best_predictors_dum]

In [79]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features_dum,
    target,
    test_size=0.2,
    random_state=2,
)

In [80]:
# Fit scikit-learn's linear regression on the training split.
# This rebinds `model`, which until now held the statsmodels OLS result.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [81]:
def run_model(model, X_train, X_test, y_train, y_test, target_std=None):
    """Print R^2, RMSE and standardized RMSE for the training and testing splits.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    X_train, X_test : feature matrices of the two splits.
    y_train, y_test : true target values of the two splits.
    target_std : float, optional
        Standard deviation used to standardize the RMSE. When omitted, the
        sample standard deviation (ddof=1) of ``y_train`` and ``y_test`` pooled
        together is used, so the function does not rely on a notebook-level
        ``target`` variable.

    Returns
    -------
    None. The metrics are printed.
    """
    if target_std is None:
        pooled = np.concatenate([np.asarray(y_train, dtype=float).ravel(),
                                 np.asarray(y_test, dtype=float).ravel()])
        target_std = np.std(pooled, ddof=1)

    def _rmse(y_true, y_pred):
        # Root of the mean squared residual; both sides are flattened so that a
        # column-vector prediction cannot broadcast against a 1-D target.
        residuals = (np.asarray(y_true, dtype=float).ravel()
                     - np.asarray(y_pred, dtype=float).ravel())
        return np.sqrt(np.mean(residuals ** 2))

    print('Training R^2 :', model.score(X_train, y_train))
    train_rmse = _rmse(y_train, model.predict(X_train))
    print('Training Root Mean Square Error', train_rmse)
    print('Training Root Mean Square Error Standardized', train_rmse/target_std)
    print('\n----------------\n')
    print('Testing R^2 :', model.score(X_test, y_test))
    test_rmse = _rmse(y_test, model.predict(X_test))
    print('Testing Root Mean Square Error', test_rmse)
    # This line reports the test split; it was previously labelled "Training".
    print('Testing Root Mean Square Error Standardized', test_rmse/target_std)

#### Model Results

In [82]:
# Report train and test metrics for the fitted scikit-learn model.
run_model(
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Training R^2 : 0.9667355273146165
Training Root Mean Square Error 1.7223779349314652
Training Root Mean Square Error Standardized 0.18138998733068135

----------------

Testing R^2 : 0.9486844150946028
Testing Root Mean Square Error 2.194733385393005
Training Root Mean Square Error Standardized 0.23113548594461142


In [83]:
# Summary statistics of the `lifex` target column.
features_with_life.lifex.describe()

count    2888.000000
mean       69.349377
std         9.495441
min        36.300000
25%        63.475000
50%        72.200000
75%        75.800000
max        89.000000
Name: lifex, dtype: float64