Please note that the following code is inspired by https://www.kaggle.com/code/burningdzire/life-expectancy-who-linear-regression

In [1]:
# imports
import numpy as np
import joblib
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd


In [2]:
# Read the cleaned life-expectancy table and order the rows by country name.
# The ordering is purely cosmetic: it makes the preview below easier to scan.
data = pd.read_csv("le_cleaned.csv").sort_values('Country')
data.head(5)

Unnamed: 0,Country,Year,Status,Life_expectancy,Adult_mortality,Infant_deaths,Alcohol,Percentage_expenditure,HepatitisB,Under_five_deaths,Polio,Total_expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness_1-19_years,Thinness_5-9_years,Income_composition_of_resources,Schooling
321,Afghanistan,2013,Developing,59.9,268.0,66,0.01,4.293458,81.0,89,79.0,8.13,65.0,0.1,6.448486,31731688.0,12.0,12.1,0.47,9.9
1887,Afghanistan,2003,Developing,56.7,295.0,87,0.01,2.405958,81.0,122,79.0,8.82,65.0,0.1,5.29194,31731688.0,12.0,12.1,0.373,6.5
160,Afghanistan,2014,Developing,59.9,271.0,64,0.01,4.297606,81.0,86,79.0,8.18,65.0,0.1,6.41787,31731688.0,12.0,12.1,0.476,10.0
1744,Afghanistan,2004,Developing,57.0,293.0,87,0.02,2.727596,81.0,120,79.0,8.79,65.0,0.1,5.389717,31731688.0,12.0,12.1,0.381,6.8
1745,Albania,2004,Developing,73.0,17.0,1,4.54,5.401969,99.0,1,98.0,6.38,97.0,0.1,7.790112,2992547.0,1.8,1.9,0.681,10.9


In [3]:
# Country names in the order that defines their integer codes: the name at
# (1-based) position i is encoded as i.
country_names = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia (Plurinational State of)',
    'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
    'Burkina Faso', 'Burundi', "Côte d'Ivoire", 'Cabo Verde', 'Cambodia',
    'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile',
    'China', 'Colombia', 'Comoros', 'Congo', 'Costa Rica',
    'Croatia', 'Cuba', 'Cyprus', 'Czechia', "Democratic People's Republic of Korea",
    'Democratic Republic of the Congo', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador',
    'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
    'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
    'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece',
    'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana',
    'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India',
    'Indonesia', 'Iran (Islamic Republic of)', 'Iraq', 'Ireland', 'Israel',
    'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan',
    'Kenya', 'Kiribati', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic",
    'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya',
    'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia',
    'Maldives', 'Mali', 'Malta', 'Mauritania', 'Mauritius',
    'Mexico', 'Micronesia (Federated States of)', 'Mongolia', 'Montenegro', 'Morocco',
    'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands',
    'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway',
    'Oman', 'Pakistan', 'Panama', 'Papua New Guinea', 'Paraguay',
    'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar',
    'Republic of Korea', 'Republic of Moldova', 'Romania', 'Russian Federation', 'Rwanda',
    'Saint Lucia', 'Saint Vincent and the Grenadines', 'Samoa', 'Sao Tome and Principe', 'Saudi Arabia',
    'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore',
    'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa',
    'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname',
    'Swaziland', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Tajikistan',
    'Thailand', 'The former Yugoslav republic of Macedonia', 'Timor-Leste', 'Togo', 'Tonga',
    'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Uganda',
    'Ukraine', 'United Arab Emirates', 'United Kingdom of Great Britain and Northern Ireland',
    'United Republic of Tanzania', 'United States of America',
    'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela (Bolivarian Republic of)', 'Viet Nam',
    'Yemen', 'Zambia', 'Zimbabwe', 'Cook Islands', 'Dominica',
    'Marshall Islands', 'Monaco', 'Nauru', 'Niue', 'Palau',
    'Saint Kitts and Nevis', 'San Marino', 'Tuvalu',
]

# Derive each code from the name's position instead of maintaining a second,
# hand-typed 1..193 list that could drift out of sync with the names.
country_codes = {name: code for code, name in enumerate(country_names, start=1)}

# Development status is encoded as Developing -> 1, Developed -> 2.
status_codes = {'Developing': 1, 'Developed': 2}

# Encode both label columns as integers. Any label the mapping does not cover
# raises immediately instead of slipping through as a string (or NaN) and
# failing later, far from the cause, when a model is fitted.
for column, codes in (('Country', country_codes), ('Status', status_codes)):
    unexpected = sorted(set(data[column].unique()) - set(codes), key=str)
    if unexpected:
        raise ValueError(
            f"Unexpected {column} values {unexpected}; if this cell was already "
            "run on `data`, re-run the load cell first."
        )
    data[column] = data[column].map(codes)

# Hold out every 2015 row so it can serve later as an independent check
# against overfitting; `data` keeps all the other years.
is_2015 = data['Year'] == 2015
data_2015 = data.loc[is_2015]
data = data.loc[~is_2015]

# Only a cell's last expression is rendered automatically, so the hold-out
# preview is shown explicitly (`display` is provided by the IPython kernel).
display(data_2015.head())
data.head()

Unnamed: 0,Country,Year,Status,Life_expectancy,Adult_mortality,Infant_deaths,Alcohol,Percentage_expenditure,HepatitisB,Under_five_deaths,Polio,Total_expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness_1-19_years,Thinness_5-9_years,Income_composition_of_resources,Schooling
321,1,2013,1,59.9,268.0,66,0.01,4.293458,81.0,89,79.0,8.13,65.0,0.1,6.448486,31731688.0,12.0,12.1,0.47,9.9
1887,1,2003,1,56.7,295.0,87,0.01,2.405958,81.0,122,79.0,8.82,65.0,0.1,5.29194,31731688.0,12.0,12.1,0.373,6.5
160,1,2014,1,59.9,271.0,64,0.01,4.297606,81.0,86,79.0,8.18,65.0,0.1,6.41787,31731688.0,12.0,12.1,0.476,10.0
1744,1,2004,1,57.0,293.0,87,0.02,2.727596,81.0,120,79.0,8.79,65.0,0.1,5.389717,31731688.0,12.0,12.1,0.381,6.8
1745,2,2004,1,73.0,17.0,1,4.54,5.401969,99.0,1,98.0,6.38,97.0,0.1,7.790112,2992547.0,1.8,1.9,0.681,10.9


In [4]:
# 80/20 train/test partition; the fixed seed keeps the split repeatable.
train, test = train_test_split(data, random_state=0, test_size=0.2)

# Sanity check: both partitions should contain a similar mix of the
# categorical Status codes. (Country is not checked the same way because it
# has far too many distinct values to be treated as a real category here.)
for subset in (train, test):
    print(subset['Status'].value_counts())

1    1442
2     360
Name: Status, dtype: int64
1    366
2     85
Name: Status, dtype: int64


In [5]:
# Separate the regression target (Life_expectancy) from the predictor columns
# for both partitions: *_y holds the target, *_x holds everything else.
train_y, test_y = train['Life_expectancy'], test['Life_expectancy']

train_x = train.drop(columns='Life_expectancy')
test_x = test.drop(columns='Life_expectancy')

In [6]:
# Ordinary least-squares baseline fitted on the training partition
# (`fit` returns the estimator itself, so the call can be chained).
lr_model = linear_model.LinearRegression().fit(train_x, train_y)

# One learned coefficient per predictor column, in train_x column order.
print(lr_model.coef_)

[-1.15380831e-03  3.93484707e-02  1.52304849e+00 -1.65486782e-02
  9.41348007e-02  1.62895893e-03  1.87735307e-01 -8.97127366e-02
 -9.38151929e-02  6.91362202e-02  5.18875104e-02  5.50711772e-02
 -8.80388169e+00  9.35260261e-02  8.01227125e-09  2.14170288e-01
 -4.57167916e-01  1.24179050e+01  1.14795082e-01]


In [7]:
# Score the linear baseline on the test partition: mean squared error first,
# then R^2 scaled to a percentage, one value per line.
lr_test_pred = lr_model.predict(test_x)
print(
    mean_squared_error(test_y, lr_test_pred),
    r2_score(test_y, lr_test_pred) * 100,
    sep='\n',
)

14.587708762603366
78.31163574837227


Some notes:
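* A mean squared error of about 14.6 (an RMSE of roughly 3.8 years; note that MSE is not a percentage) is pretty good for a baseline. A test score alone can't tell us whether we're overfitting, though — that would need a comparison with the training score.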
* r2 score of 78% isn't the best

Let's see how the Random Forest Regressor model fares compared to Linear Regression

In [8]:
# more imports
from sklearn.ensemble import RandomForestRegressor

In [9]:
# The cleaned data and the train/test split from the earlier cells are reused
# unchanged; only the estimator differs. 100 trees, seeded for repeatability.
rfr_model = RandomForestRegressor(random_state=0, n_estimators=100).fit(train_x, train_y)

# Echo the fitted estimator as the cell's displayed value.
rfr_model

RandomForestRegressor(random_state=0)

In [10]:
# Score the random forest on the same test partition as the linear baseline:
# mean squared error first, then R^2 scaled to a percentage.
rfr_test_pred = rfr_model.predict(test_x)
print(
    mean_squared_error(test_y, rfr_test_pred),
    r2_score(test_y, rfr_test_pred) * 100,
    sep='\n',
)

3.406113146341456
94.93594067428258


Some notes:
* An MSE of about 3.4 (an RMSE of roughly 1.8 years; note that MSE is not a percentage) is quite good
* 95% r2 score is very very good

Overall, it seems better than linear regression. However, an error this low makes me wonder whether we may be overfitting... Let's take a look at the year of data we set aside (2015).

In [11]:
# Second opinion on overfitting: split the held-out 2015 rows into target and
# predictors, then score the already-fitted forest on them.
data_2015_y = data_2015['Life_expectancy']
data_2015_x = data_2015.drop(columns='Life_expectancy')

rfr_pred_2015 = rfr_model.predict(data_2015_x)
print(
    mean_squared_error(data_2015_y, rfr_pred_2015),
    r2_score(data_2015_y, rfr_pred_2015) * 100,
    sep='\n',
)


4.215621618750001
93.33680545155454


Some notes:
* Looks like the Random Forest Regressor model performed nearly as well on the year of data we set aside (MSE ≈ 4.2, r2 ≈ 93%) as it did on the original test set (MSE ≈ 3.4, r2 ≈ 95%). No overfitting (hopefully)!
* Given that its performance is also better than the Linear Regression model's, we'll move forward with it when developing the program

Logic for saving and loading the model for use in an application follows. Please note that the logic is inspired by: https://mljar.com/blog/save-load-random-forest/

In [12]:
# Persist the fitted random forest to disk so a standalone Python script can
# load it later without retraining.
rf_model_path = "./country_le_rf_model.joblib"
joblib.dump(rfr_model, rf_model_path)

['./country_le_rf_model.joblib']

In [15]:
# Round-trip check: reload the persisted forest and score it on the 2015
# hold-out again to make sure saving and loading preserved the model.
# NOTE: joblib files are pickle-based, so only load model files you created
# yourself or otherwise trust.
loaded_rf = joblib.load("./country_le_rf_model.joblib")

pred = loaded_rf.predict(data_2015_x)
print(mean_squared_error(data_2015_y, pred))
print(r2_score(data_2015_y, pred) * 100)

# Verify the round trip programmatically instead of eyeballing the printed
# scores: the reloaded model must reproduce the in-memory model's predictions.
np.testing.assert_allclose(pred, rfr_model.predict(data_2015_x))

4.215621618750001
93.33680545155454


Looks right to me!