In [2]:
# importing packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [8]:
# Load the cleaned dataset produced during EDA.
# The path is relative, so the CSV must be in the notebook's working directory.

df = pd.read_csv('Cleaned_Life_Expectancy_Data.csv')

In [9]:
# Display the loaded frame (pandas elides the middle rows of a long frame)
df

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,1154,19.1,83,6.0,8.16,65.0,0.1,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,492,18.6,86,58.0,8.18,62.0,0.1,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,430,18.1,89,62.0,8.13,64.0,0.1,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,2787,17.6,93,67.0,8.52,67.0,0.1,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,3013,17.2,97,68.0,7.87,68.0,0.1,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2883,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,31,27.1,42,67.0,7.13,65.0,33.6,9.4,9.4,0.407,9.2
2884,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,998,26.7,41,7.0,6.52,68.0,36.7,9.8,9.9,0.418,9.5
2885,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,304,26.3,40,73.0,6.53,71.0,39.8,1.2,1.3,0.427,10.0
2886,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,529,25.9,39,76.0,6.16,75.0,42.1,1.6,1.7,0.427,9.8


# Test/Train Split

In [11]:
# Every column except the target is a candidate feature
feature_cols = df.columns.tolist()
feature_cols.remove('Life expectancy ')  # the column name carries a trailing space

In [12]:
# Split the frame into the feature matrix X and the target vector y
# (the target column name ends with a trailing space)
X = df[feature_cols]
y = df['Life expectancy ']

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=155,
)

In [14]:
# Preview the raw training features (target column already removed)
X_train.head()

Unnamed: 0,Country,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
1519,Libya,2000,Developing,148.0,3,0.01,457.320224,0,52.8,3,94.0,3.41,94.0,0.1,5.7,5.5,0.727,15.5
1336,Jordan,2007,Developing,12.0,4,0.6,373.262864,41,59.3,4,98.0,8.35,98.0,0.1,3.8,3.8,0.736,13.6
717,Democratic People's Republic of Korea,2002,Developing,167.0,14,3.08,0.0,0,26.2,18,99.0,4.33,64.0,0.1,5.9,5.9,0.587465,11.352032
704,Democratic People's Republic of Korea,2015,Developing,139.0,6,0.01,0.0,0,32.9,7,99.0,4.33,96.0,0.1,4.9,4.9,0.587465,11.352032
2257,Serbia,2008,Developing,132.0,1,9.54,96.332379,2,55.8,1,95.0,1.5,95.0,0.1,2.3,2.4,0.749,13.5


# Feature Engineering (Complex) - Train

In [22]:
# feature engineering complex model
def feature_eng(df, drop_cols=('Country', 'Year', 'infant deaths', 'Measles ', 'Total expenditure')):
    """Prepare raw features for the complex linear-regression model.

    Works on a copy, so the frame passed in is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. Must contain a 'Status' column and every column
        named in ``drop_cols``.
    drop_cols : iterable of str, optional
        Columns to remove. Defaults to the five columns excluded from the
        complex model; pass a different list to build another feature set
        without duplicating this function.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Status' encoded as 1 ('Developed') / 0 ('Developing')
        and ``drop_cols`` removed.

    Raises
    ------
    KeyError
        If 'Status' or any column in ``drop_cols`` is missing from ``df``.
    """
    status_codes = {'Developed': 1, 'Developing': 0}

    engineered = df.copy()
    # Series.map turns any label that is not a key of status_codes into NaN.
    engineered['Status'] = engineered['Status'].map(status_codes)

    # Non-inplace drop: returns a fresh frame instead of mutating `engineered`.
    return engineered.drop(columns=list(drop_cols))

In [23]:
# Apply the complex-model feature engineering to the training features.
# feature_eng works on a copy, so X_train itself stays unchanged.
X_train_fe = feature_eng(X_train)

In [24]:
# Robust-scale the engineered training features, keeping DataFrame output.
# NOTE: this cell overwrites X_train_fe and refits `rob`; re-run the
# feature-engineering cell above before executing it a second time.
rob = RobustScaler().set_output(transform="pandas")
X_train_fe = rob.fit_transform(X_train_fe)

In [25]:
# checking data types of X_train
# (this is the raw split, before feature_eng and scaling, so the object-typed
# 'Country' and 'Status' columns are still present)
X_train.dtypes

Country                             object
Year                                 int64
Status                              object
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Measles                              int64
 BMI                               float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
 HIV/AIDS                          float64
 thinness  1-19 years              float64
 thinness 5-9 years                float64
Income composition of resources    float64
Schooling                          float64
dtype: object

In [26]:
# Checking which columns remain in X_train_fe after dropping and scaling
X_train_fe.columns

Index(['Status', 'Adult Mortality', 'Alcohol', 'percentage expenditure',
       ' BMI ', 'under-five deaths ', 'Polio', 'Diphtheria ', ' HIV/AIDS',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [27]:
# Preview the scaled training features
X_train_fe.head()

Unnamed: 0,Status,Adult Mortality,Alcohol,percentage expenditure,BMI,under-five deaths,Polio,Diphtheria,HIV/AIDS,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
1519,0.0,0.039735,-0.515786,0.851139,0.251513,-0.037037,0.052632,0.052632,0.0,0.436364,0.385965,0.19403,0.820513
1336,0.0,-0.860927,-0.432068,0.668051,0.426362,0.0,0.263158,0.263158,0.0,0.090909,0.087719,0.227612,0.333333
717,0.0,0.165563,-0.08017,-0.144963,-0.464022,0.518519,0.315789,-1.526316,0.0,0.472727,0.45614,-0.326623,-0.243069
704,0.0,-0.019868,-0.515786,-0.144963,-0.283793,0.111111,0.315789,0.157895,0.0,0.290909,0.280702,-0.326623,-0.243069
2257,0.0,-0.066225,0.836467,0.064861,0.332213,-0.111111,0.105263,0.105263,0.0,-0.181818,-0.157895,0.276119,0.307692


In [28]:
# Fit a linear regression on the scaled training features, then report the
# RMSE on that same training data (in-sample fit).

lin_reg = LinearRegression()

lin_reg.fit(X_train_fe, y_train)

y_pred = lin_reg.predict(X_train_fe)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root is taken explicitly; this form works on every scikit-learn version.
math.sqrt(metrics.mean_squared_error(y_train, y_pred))

4.064691430035102

# Feature Engineering (Complex) - Test

In [29]:
# Apply the same complex-model feature engineering to the test features
# (X_test itself is left unchanged because feature_eng copies its input)
X_test_fe = feature_eng(X_test)

In [30]:
# applying scaling to X_test_fe
# Only transform() is called here: when the cells run top to bottom, `rob` is
# the scaler fitted on X_train_fe, so the test set does not influence the scaling.
X_test_fe = rob.transform(X_test_fe)

In [31]:
# Preview the scaled test features
X_test_fe.head()

Unnamed: 0,Status,Adult Mortality,Alcohol,percentage expenditure,BMI,under-five deaths,Polio,Diphtheria,HIV/AIDS,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
308,0.0,0.370861,0.04044,-0.144963,0.173504,0.259259,0.105263,0.105263,0.166667,-0.381818,-0.385965,-0.097015,0.384615
916,0.0,-0.370861,0.874778,0.00972,-0.991258,-0.148148,0.315789,0.315789,0.0,-0.436364,-0.438596,0.757463,1.179487
538,0.0,-0.635762,-0.457609,-0.125499,-0.754539,2.777778,-2.947368,-3.578947,8.5,-0.327273,-0.333333,-1.376866,-1.717949
1536,1.0,-0.523179,1.06066,-0.144963,0.480161,-0.148148,0.315789,0.315789,0.0,-0.418182,-0.421053,0.824627,0.410256
129,1.0,-0.503311,1.230933,18.042827,0.367182,-0.148148,0.263158,0.263158,0.0,-0.272727,-0.22807,0.809701,0.923077


In [33]:
# Calculating the RMSE of the complex model on the held-out test set

y_pred_test = lin_reg.predict(X_test_fe)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root is taken explicitly; this form works on every scikit-learn version.
RMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

print(f'The RMSE of our complex model {RMSE}')

The RMSE of our complex model 4.180115338556166


# Feature Engineering (Simple/Data Sensitive) - Train

In [40]:
# checking the columns of the data frame
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Measles ',
       ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', ' thinness  1-19 years',
       ' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

In [41]:
# feature engineering the simple model

def feature_eng_simple(df):
    """Build the reduced feature set used by the simple model.

    Encodes 'Status' as a float (1.0 for 'Developed', 0.0 for 'Developing')
    and removes the fifteen columns listed below. The input frame is not
    modified; a new DataFrame is returned.
    """
    unused_cols = [
        'Country', 'Year', 'infant deaths', 'Measles ', 'Total expenditure',
        ' BMI ', 'Adult Mortality', 'Alcohol', 'percentage expenditure',
        'under-five deaths ', 'Polio', 'Diphtheria ', ' HIV/AIDS',
        ' thinness  1-19 years', ' thinness 5-9 years',
    ]

    status_as_float = df['Status'].map({'Developed': 1, 'Developing': 0}).astype(float)

    # assign() returns a new frame with 'Status' replaced in its original position
    return df.assign(Status=status_as_float).drop(columns=unused_cols)

In [42]:
# Apply the simple-model feature engineering to the training features
# (X_train itself is left unchanged because feature_eng_simple copies its input)
X_train_fe_simple = feature_eng_simple(X_train)

In [43]:
# Robust-scale the simple training features, keeping DataFrame output.
# NOTE: this re-binds `rob`, replacing the scaler fitted for the complex model;
# cells that scale the complex features must not be re-run after this one.
rob = RobustScaler().set_output(transform="pandas")
X_train_fe_simple = rob.fit_transform(X_train_fe_simple)

In [44]:
# Preview the scaled simple-model training features
X_train_fe_simple.head()

Unnamed: 0,Status,Income composition of resources,Schooling
1519,0.0,0.19403,0.820513
1336,0.0,0.227612,0.333333
717,0.0,-0.326623,-0.243069
704,0.0,-0.326623,-0.243069
2257,0.0,0.276119,0.307692


In [45]:
# Fit a linear regression on the scaled simple features, then report the RMSE
# on that same training data (in-sample fit).

lin_reg_simple = LinearRegression()

lin_reg_simple.fit(X_train_fe_simple, y_train)

y_pred_simple = lin_reg_simple.predict(X_train_fe_simple)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root is taken explicitly; this form works on every scikit-learn version.
math.sqrt(metrics.mean_squared_error(y_train, y_pred_simple))

6.066801951438528

# Feature Engineering (Simple/Data Sensitive) - Test

In [46]:
# Apply the simple-model feature engineering to the test features
X_test_fe_simple = feature_eng_simple(X_test)

In [47]:
# Scale the simple test features with transform() only (no refit).
# `rob` must be the scaler fitted on X_train_fe_simple, so this cell has to run
# after the simple-model scaling cell that re-binds `rob`.
X_test_fe_simple = rob.transform(X_test_fe_simple)

In [48]:
# Preview the scaled simple-model test features
X_test_fe_simple.head()

Unnamed: 0,Status,Income composition of resources,Schooling
308,0.0,-0.097015,0.384615
916,0.0,0.757463,1.179487
538,0.0,-1.376866,-1.717949
1536,1.0,0.824627,0.410256
129,1.0,0.809701,0.923077


In [49]:
# Calculating the RMSE of the simple model on the held-out test set
y_pred_test_simple = lin_reg_simple.predict(X_test_fe_simple)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root is taken explicitly; this form works on every scikit-learn version.
RMSE_simple = math.sqrt(metrics.mean_squared_error(y_test, y_pred_test_simple))

print(f'The RMSE of our simple model {RMSE_simple}')

The RMSE of our simple model 5.9307621234637615


# Making a copy

In [None]:
# FEATURE ENGINEERING - WHO PROJECT GROUP 3.ipynb