#### Go back to root folder

In [1]:
%cd ..

d:\Projects\Coding\Python\academic\MLOps\Life Expectancy


# Phase 1. Setup & Data Loading

In [2]:
import pandas as pd

df = pd.read_csv(r'data\Life-Expectancy-Data-Updated.csv')

In [3]:
# checking shape: no. of (rows, columns)
df.shape

(2864, 21)

Dataset has: 
2938 rows and 22 columns (1 target, 21 features)

In [4]:
# check data types of all the features
df.dtypes

Country                         object
Region                          object
Year                             int64
Infant_deaths                  float64
Under_five_deaths              float64
Adult_mortality                float64
Alcohol_consumption            float64
Hepatitis_B                      int64
Measles                          int64
BMI                            float64
Polio                            int64
Diphtheria                       int64
Incidents_HIV                  float64
GDP_per_capita                   int64
Population_mln                 float64
Thinness_ten_nineteen_years    float64
Thinness_five_nine_years       float64
Schooling                      float64
Economy_status_Developed         int64
Economy_status_Developing        int64
Life_expectancy                float64
dtype: object


* `Country` and `Region` do not provide any value toward prediction. They are just identifiers so we drop them.  
* Each country defines itself as `Economy_status_Developed` or `Economy_status_Developing`, so it's not a reliable estimator.

In [5]:
# dropping identifiers
df = df.drop(['Country', 'Region', 'Year', 'Economy_status_Developed', 'Economy_status_Developing'], axis=1)
df.head()

Unnamed: 0,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Life_expectancy
0,11.1,13.0,105.824,1.32,97,65,27.8,97,97,0.08,11006,78.53,4.9,4.8,7.8,76.5
1,2.7,3.3,57.9025,10.35,97,94,26.0,97,97,0.09,25742,46.44,0.6,0.5,9.7,82.8
2,51.5,67.9,201.0765,1.57,60,35,21.2,67,64,0.13,1076,1183.21,27.1,28.0,5.0,65.4
3,32.8,40.5,222.1965,5.68,93,74,25.3,92,93,0.79,4146,0.75,5.7,5.5,7.9,67.0
4,3.4,4.3,57.951,2.89,97,89,27.0,94,94,0.08,33995,7.91,1.2,1.1,12.8,81.7


---
##### Target Variable: `Life expectancy`
---

In [6]:
# seperate features and target columns
x = df.drop('Life_expectancy', axis=1)
y = df['Life_expectancy']

In [7]:
# cross-check feature matrix to see if target variable has been dropped
x

Unnamed: 0,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling
0,11.1,13.0,105.8240,1.320,97,65,27.8,97,97,0.08,11006,78.53,4.9,4.8,7.8
1,2.7,3.3,57.9025,10.350,97,94,26.0,97,97,0.09,25742,46.44,0.6,0.5,9.7
2,51.5,67.9,201.0765,1.570,60,35,21.2,67,64,0.13,1076,1183.21,27.1,28.0,5.0
3,32.8,40.5,222.1965,5.680,93,74,25.3,92,93,0.79,4146,0.75,5.7,5.5,7.9
4,3.4,4.3,57.9510,2.890,97,89,27.0,94,94,0.08,33995,7.91,1.2,1.1,12.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2859,97.0,224.9,291.8240,0.092,72,64,20.8,41,34,0.49,399,11.33,12.8,12.9,1.1
2860,23.9,28.6,235.2330,6.560,97,97,25.3,96,95,0.02,2515,2.67,2.2,2.3,9.1
2861,17.7,28.9,134.8950,1.560,62,95,21.9,97,97,0.02,2130,19.39,15.4,15.5,10.3
2862,7.9,9.9,204.0120,11.000,94,95,26.1,97,95,0.05,7424,3.44,3.3,3.3,11.1


# Phase 3. Preprocessing and Feature Engineering


### 3.1 Split the Data into Train-Test Splits

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, test_size=0.2, random_state=64)

### 3.2 Scaling the dataset

In [9]:
# StandardScaler becuase numeric datatypes
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train) # becomes an array
x_test_scaled = scaler.transform(x_test)

# convert back to DataFrame
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)
x_train_scaled

Unnamed: 0,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling
2583,-0.440774,-0.474668,-0.099959,1.060557,0.848538,0.991377,-0.105674,0.825295,0.813171,-0.355628,-0.412512,-0.102460,-0.198893,-0.138653,0.714239
1415,-0.837406,-0.773452,-0.454289,1.015241,0.537418,0.669047,0.260451,0.691663,0.360601,-0.372429,-0.406729,-0.210577,-0.537745,-0.535860,0.809308
2236,-0.646435,-0.635377,-0.608821,-1.071768,0.910762,1.098820,0.214685,0.825295,0.813171,-0.355628,0.530977,-0.247120,0.501400,0.413023,-0.553343
887,1.351414,1.118848,0.526122,-0.263650,-0.520389,-0.727716,-0.471799,-0.444203,-0.673847,1.093488,-0.634176,-0.233308,0.433629,0.390955,-1.028686
2068,-0.139628,-0.246053,-0.223110,-1.164915,-1.142628,-1.103768,1.541888,-0.778282,-1.191071,-0.322025,-0.398201,-0.025347,0.094778,0.037883,-0.331516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,0.029308,-0.105714,-0.403521,-0.787290,-0.831508,-0.673995,0.763872,-0.110125,-0.091970,-0.343027,-0.295985,0.191030,0.094778,0.059950,-0.680101
54,-0.903511,-0.818722,-0.721082,1.559022,0.724090,1.098820,0.535044,0.624848,0.619212,-0.368229,0.020361,-0.232877,-0.695876,-0.690329,0.714239
2167,2.728608,2.943243,1.296022,-1.205195,-2.636002,-0.727716,-1.158284,-3.450908,-3.324620,-0.322025,-0.665074,-0.188709,0.591760,0.523358,-1.757546
998,1.733356,1.490065,0.752841,-1.201167,-1.142628,-3.252634,-1.295581,-1.914148,-1.837601,-0.368229,-0.658007,-0.079370,3.257392,3.215536,-1.599098


### 3.3 Checking for Multicolinearity via VIF


Verify findings from the heatmap with `VIF` (Variance Inflation Factor).

**VIF criteria**
| VIF  | Meaning               |
| ---- | --------------------- |
| 1–5  | OK                    |
| ~ 5  | moderate collinearity |
| > 10 | problem               |

In [10]:

# variance inflation factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = [variance_inflation_factor(x_train_scaled.values, i) for i in range(x_train_scaled.shape[1])]
vif_df = pd.DataFrame({'Variable': x_train_scaled.columns, 'VIF': vif})

vif_df

Unnamed: 0,Variable,VIF
0,Infant_deaths,42.539269
1,Under_five_deaths,42.469692
2,Adult_mortality,7.498083
3,Alcohol_consumption,1.927322
4,Hepatitis_B,2.621862
5,Measles,1.590894
6,BMI,2.642062
7,Polio,12.64185
8,Diphtheria,14.06368
9,Incidents_HIV,2.877361


Highly Collinear pairs:
* `Infant_deaths` and `Under_five_deaths`
* `Polio` AND `Diphtheria`
* `Thinness_five_nine_years` and `Thinness_ten_nineteen_years`

Others (VIF>5):
* `Adult_mortality`

If we proceed by dropping a feature from each pair, we would loose valuable information.  
So, we will combine them.
* **AVG**: taking a simple average will suffice as all three pairs of features represent one category such as:
    * `Polio` AND `Diphtheria` -> `avg_vaccinations`
    * `Thinness_five_nine_years` and `Thinness_ten_nineteen_years` -> `avg_child_thinness` (covers 5-19 years)
* **PCA**: we will loose interpretability as it is hard to explain the impact of principle components.
    * PCA is used when accrurate predictions takes higher priority than interpretability.

We drop `Infant_deaths` as it's a mathematical subset of `Under_five_death`

### Handling Multicollinearity

In [None]:
# avg the features
df['vaccine_index'] = (df['Polio'] + df['Diphtheria']) / 2
df['Thinness_Index'] = (df['Thinness_ten_nineteen_years'] + df['Thinness_five_nine_years']) / 2

# drop features
df = df.drop(['Polio', 'Diphtheria', 'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Infant_deaths'], axis=1)
df

<bound method NDFrame.head of       Under_five_deaths  Adult_mortality  Alcohol_consumption  Hepatitis_B  \
0                  13.0         105.8240                1.320           97   
1                   3.3          57.9025               10.350           97   
2                  67.9         201.0765                1.570           60   
3                  40.5         222.1965                5.680           93   
4                   4.3          57.9510                2.890           97   
...                 ...              ...                  ...          ...   
2859              224.9         291.8240                0.092           72   
2860               28.6         235.2330                6.560           97   
2861               28.9         134.8950                1.560           62   
2862                9.9         204.0120               11.000           94   
2863                2.6          50.5745                6.840           88   

      Measles   BMI  Incidents_HI

In [12]:
# define feature matrix X after the changes
x = df.drop('Life_expectancy', axis=1)
y = df['Life_expectancy']

In [13]:
# train test split on new data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=64)

# scale the values
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

### Re-calculate VIF

In [14]:

# variance inflation factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = [variance_inflation_factor(x_train_scaled.values, i) for i in range(x_train_scaled.shape[1])]
vif_df = pd.DataFrame({'Variable': x_train_scaled.columns, 'VIF': vif})

vif_df

Unnamed: 0,Variable,VIF
0,Under_five_deaths,7.747027
1,Adult_mortality,7.409039
2,Alcohol_consumption,1.848373
3,Hepatitis_B,2.469954
4,Measles,1.556384
5,BMI,2.606899
6,Incidents_HIV,2.844514
7,GDP_per_capita,1.911197
8,Population_mln,1.135165
9,Schooling,4.13206


### Handling the Remaining Collinearity

Collinear pairs: `Adult_mortality` and `Under_five_deaths`  
  
Usually, VIF ~ 7 is accepted depending on the importance and relevance of the feature.  
  
* Otion 1:  
    What if we keep it?
    * R-square will be high

But, If we are trying to understand the factors affecting Health, then we must remove both of these variables.
  
    > Cos, it's pretty abvious that if death rate is high then "Life expectancy" will be low and vice-versa.  
    
* **PCA:**
    * If we absolutely need to increase the r-square. 
    * PCA retains the variance provided by these variables.

In [15]:
# dropping
df = df.drop(['Adult_mortality', 'Under_five_deaths'], axis=1)

x = df.drop('Life_expectancy', axis=1)
y = df['Life_expectancy']

In [27]:
# redifine
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=64)

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

x_train_scaled = pd.DataFrame(x_train, columns = x_train.columns, index = x_train.index)
x_test_scaled = pd.DataFrame(x_test, columns = x_test.columns, index = x_test.index)

LESGOOO! (^-^)

## 5. Feature Selection

In [28]:
# total number of remaining features present

print(f'Total no. of remaining features: {x_train_scaled.columns.value_counts().sum()}')
print(f'features: {x_train_scaled.columns}')

Total no. of remaining features: 10
features: Index(['Alcohol_consumption', 'Hepatitis_B', 'Measles', 'BMI', 'Incidents_HIV',
       'GDP_per_capita', 'Population_mln', 'Schooling', 'vaccine_index',
       'Thinness_Index'],
      dtype='object')


In [29]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

model = LinearRegression()

# Forward selection
forward_selector = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    direction='forward',
    cv=5,
    n_jobs=-1
)

forward_selector.fit(x_train_scaled, y_train)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,-1

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [30]:
# backward elimination
backward_eliminator = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    direction='backward',
    cv=5,
    n_jobs=-1
)
backward_eliminator.fit(x_train_scaled, y_train)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'backward'
,scoring,
,cv,5
,n_jobs,-1

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [32]:
# checking variables selected through different methods

forward_features = forward_selector.get_feature_names_out()
backward_features = backward_eliminator.get_feature_names_out()

print(f'Forward Features: {forward_features}')
print(f'Backward Features: {backward_features}')

Forward Features: ['BMI' 'Incidents_HIV' 'GDP_per_capita' 'Schooling' 'vaccine_index']
Backward Features: ['BMI' 'Incidents_HIV' 'GDP_per_capita' 'Schooling' 'vaccine_index']


In [33]:
# define selected features
selected_features = forward_features
print(f'Selected Features: {selected_features}')

Selected Features: ['BMI' 'Incidents_HIV' 'GDP_per_capita' 'Schooling' 'vaccine_index']


### Update Train and Test Data with Selected Features

In [34]:
x_train_selected = x_train[selected_features]
x_test_selected = x_test[selected_features]

print(f'x_train_selected: {x_train_selected.shape}')
print(f'x_test_selected: {x_test_selected.shape}')

x_train_selected: (2291, 5)
x_test_selected: (573, 5)


### Create a new scaler with the new features

In [35]:
# new scaler instance
scaler_final = StandardScaler()

# scale selected features
x_train_scaled = scaler_final.fit_transform(x_train_selected)
x_test_scaled = scaler_final.transform(x_test_selected)

# convert to df to retain column names
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train_selected.columns, index=x_train_selected.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test_selected.columns, index=x_test_selected.index)

## 6. Model Training & Evaluation

In [36]:
# initialize the model
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [37]:
# train the model
model.fit(x_train_scaled, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [38]:
# generate predictions
y_pred = model.predict(x_test_scaled)

In [39]:
# evaluate performance
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

mae = mean_absolute_error(y_pred, y_test)
rmse = root_mean_squared_error(y_pred, y_test)
r2 = r2_score(y_pred, y_test)

print('Evaluation Reports:')
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (r2) / Coefficient of Determination : {r2:.2f}')

Evaluation Reports:
Mean Absolute Error (MAE): 3.36
Root Mean Squared Error (RMSE): 4.09
R-squared (r2) / Coefficient of Determination : 0.77


## 7. Store the Model

In [57]:
# store as pickle
import pickle

# create a package (dictionary) to store model and scaler
package = {
    'scaler': scaler_final,
    'model': model
}

with open('model/lifeexp_linreg.pkl', 'wb') as f:
    pickle.dump(package, f)

## 8. Load the Model and Test

In [58]:
# load the model
with open('lifeexp_linreg.pkl', 'rb') as f:
    package = pickle.load(f)

scaler_new = package['scaler']
model_new = package['model']

In [59]:
# Profile 1: High Life Expectancy (e.g., Belgium, Japan, Germany)
high_life_expectancy_profile = {
    'BMI': 55.4,              # Developed nations have higher average BMI
    'Incidents_HIV': 0.1,     # Very low HIV incidence
    'GDP_per_capita': 42000.0,# High economic output
    'Schooling': 19.2,        # High years of schooling
    'vaccine_index': 98.5     # Near perfect vaccination coverage (avg of Polio/Diphtheria)
}

# Profile 2: Low Life Expectancy (e.g., Sierra Leone, Malawi in early 2000s)
low_life_expectancy_profile = {
    'BMI': 22.1,              # Lower BMI often indicates undernutrition
    'Incidents_HIV': 9.4,     # High HIV incidence impacts life expectancy severely
    'GDP_per_capita': 480.0,  # Low economic resources
    'Schooling': 6.5,         # Fewer years of education
    'vaccine_index': 62.0     # Inconsistent vaccination coverage
}

sample_1_df = pd.DataFrame([high_life_expectancy_profile])
sample_2_df = pd.DataFrame([low_life_expectancy_profile])

sample_1_scaled = scaler_new.transform(sample_1_df)
sample_2_scaled = scaler_new.transform(sample_2_df)

In [60]:
# test the model
y_pred_1 = model_new.predict(sample_1_scaled)
y_pred_2 = model_new.predict(sample_2_scaled)

print(f'Real: High Dev Country; Predicted Life Expectancy: {y_pred_1} years')
print(f'Real: High Low Country; Predicted Life Expectancy: {y_pred_2} years')

Real: High Dev Country; Predicted Life Expectancy: [106.27131528] years
Real: High Low Country; Predicted Life Expectancy: [46.72500744] years




> **NOTE** Warning cos I didn't convert the scale values back to df.

In [61]:
# Create a dataframe to view coefficients clearly
coef_df = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': model.coef_
})
print(f"Intercept: {model.intercept_}")
print(coef_df)

Intercept: 68.87481449148844
          Feature  Coefficient
0             BMI     1.550745
1   Incidents_HIV    -3.722752
2  GDP_per_capita     2.023827
3       Schooling     2.394788
4   vaccine_index     2.814026


In [62]:
print('Model Coefficients:')
print(selected_features, model.coef_, " + ", model.intercept_)

Model Coefficients:
['BMI' 'Incidents_HIV' 'GDP_per_capita' 'Schooling' 'vaccine_index'] [ 1.55074489 -3.72275195  2.02382749  2.39478773  2.81402558]  +  68.87481449148844


In [63]:
print('Linear regression Model Equation:')
print(f'{model.intercept_}', end=' ')

for i in range(len(selected_features)):
    print(f'+ ({model.coef_[i]:.2f} * {selected_features[i]})', end=' ')

Linear regression Model Equation:
68.87481449148844 + (1.55 * BMI) + (-3.72 * Incidents_HIV) + (2.02 * GDP_per_capita) + (2.39 * Schooling) + (2.81 * vaccine_index) 