# Split and normalize the dataset to fit and test

In [1]:
import pandas as pd
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb


In [2]:
df = pd.read_csv('cleaned_csv.csv')

df.columns

Index(['Year', 'Life expectancy', 'Adult Mortality', 'infant deaths',
       'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles', 'BMI',
       'under-five deaths',
       ...
       'Country-United Republic of Tanzania',
       'Country-United States of America', 'Country-Uruguay',
       'Country-Uzbekistan', 'Country-Vanuatu',
       'Country-Venezuela (Bolivarian Republic of)', 'Country-Viet Nam',
       'Country-Yemen', 'Country-Zambia', 'Country-Zimbabwe'],
      dtype='object', length=203)

## Split the dataset
The target column is 'Life expectancy'.

In [3]:
X = df.drop(['Life expectancy'], axis=1)
Y = df['Life expectancy']
X_train , X_test , Y_train , Y_test = train_test_split(X,Y, test_size = 0.2 , random_state = 1)

## Scale the dataset

In [4]:
y_scaler = preprocessing.MinMaxScaler()
x_scaler = preprocessing.MinMaxScaler()
X_train_scaled = x_scaler.fit_transform(X_train.values)
y_train_scaled = y_scaler.fit_transform(Y_train.values.reshape(-1, 1))
X_test_scaled = x_scaler.fit_transform(X_test.values)
y_test_scaled = y_scaler.fit_transform(Y_test.values.reshape(-1, 1))

## Models

### Lasso Regression

In [5]:
lasso_r = Lasso(alpha=0.01)
lasso_r.fit(X_train_scaled, y_train_scaled)
predictions = lasso_r.predict(X_test_scaled)

#### Test Lasso Regression

In [6]:
print('R2-squared:', lasso_r.score(X_test_scaled, y_test_scaled))
print('mean squared error:', mean_squared_error(y_test_scaled, predictions))
print('root squared mean squared error:', math.sqrt(mean_squared_error(y_test_scaled, predictions)))

R2-squared: 0.5713522254205673
mean squared error: 0.015184796365601667
root squared mean squared error: 0.1232266057537968


#### Test Lasso Regression using original scale

In [7]:
ypred = y_scaler.inverse_transform(predictions.reshape(-1, 1))
print(Y_test.shape)
ypred = ypred.reshape(Y_test.shape[0],)
print(ypred.shape)
print('mean squared error:', mean_squared_error(Y_test, ypred))
print('root squared mean squared error:', math.sqrt(mean_squared_error(Y_test, ypred)))

(586,)
(586,)
mean squared error: 37.961990914004176
root squared mean squared error: 6.16133028768984


### Ridge Regression

In [8]:
ridge_r = Ridge(alpha=0.01, random_state=938)
ridge_r.fit(X_train_scaled, y_train_scaled)
predictions = ridge_r.predict(X_test_scaled)

#### Test Ridge Regression

In [9]:
print('R2-squared:', ridge_r.score(X_test_scaled, y_test_scaled))
print('mean squared error:', mean_squared_error(y_test_scaled, predictions))
print('root squared mean squared error:', math.sqrt(mean_squared_error(y_test_scaled, predictions)))

R2-squared: 0.9476750828822106
mean squared error: 0.001853603957375453
root squared mean squared error: 0.04305350110473541


#### Test Ridge Regression using original scale

In [10]:
ypred = y_scaler.inverse_transform(predictions.reshape(-1, 1))
print(Y_test.shape)
ypred = ypred.reshape(Y_test.shape[0],)
print(ypred.shape)
print('mean squared error:', mean_squared_error(Y_test, ypred))
print('root squared mean squared error:', math.sqrt(mean_squared_error(Y_test, ypred)))

(586,)
(586,)
mean squared error: 4.634009893438632
root squared mean squared error: 2.15267505523677


### Random Forest

In [11]:
rf_model = RandomForestRegressor(n_estimators = 10, random_state = 123)
rf_model = rf_model.fit(X_train_scaled, y_train_scaled.values.ravel())
predictions = rf_model.predict(X_test_scaled)

  rf_model = rf_model.fit(X_train_scaled, y_train_scaled)


#### Test Random Forest

In [12]:
print('R2-squared:', rf_model.score(X_test_scaled, y_test_scaled))
print('mean squared error:', mean_squared_error(y_test_scaled, predictions))
print('root squared mean squared error:', math.sqrt(mean_squared_error(y_test_scaled, predictions)))

R2-squared: 0.943333932887834
mean squared error: 0.002007388678926423
root squared mean squared error: 0.04480389133687411


#### Test Random Forest using original scale

In [13]:
ypred = y_scaler.inverse_transform(predictions.reshape(-1, 1))
print(Y_test.shape)
ypred = ypred.reshape(Y_test.shape[0],)
print(ypred.shape)
print('mean squared error:', mean_squared_error(Y_test, ypred))
print('root squared mean squared error:', math.sqrt(mean_squared_error(Y_test, ypred)))

(586,)
(586,)
mean squared error: 5.018471697316057
root squared mean squared error: 2.2401945668437055


### LightGBM

In [14]:
gbm = lgb.LGBMRegressor()
gbm.fit(X_train_scaled, y_train_scaled.values.ravel())
predictions = gbm.predict(X_test_scaled)

  return f(**kwargs)


#### Test LightGBM

In [15]:
print('R2-squared:', gbm.score(X_test_scaled, y_test_scaled))
print('mean squared error:', mean_squared_error(y_test_scaled, predictions))
print('root squared mean squared error:', math.sqrt(mean_squared_error(y_test_scaled, predictions)))

R2-squared: 0.9470764760932004
mean squared error: 0.0018748095315863627
root squared mean squared error: 0.043299070793567415


#### Test LightGBM using original scale

In [16]:
ypred = y_scaler.inverse_transform(predictions.reshape(-1, 1))
print(Y_test.shape)
ypred = ypred.reshape(Y_test.shape[0],)
print(ypred.shape)
print('mean squared error:', mean_squared_error(Y_test, ypred))
print('root squared mean squared error:', math.sqrt(mean_squared_error(Y_test, ypred)))

(586,)
(586,)
mean squared error: 4.687023828965906
root squared mean squared error: 2.1649535396783706
