## Training

In [2]:
# Restore `df` from IPython's %store database; it must have been saved
# beforehand with `%store df` (presumably by the preprocessing notebook).
%store -r df

In [3]:
import sklearn as sk

## Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
# Linear-regression model created with scikit-learn's default settings.
model = LinearRegression()

In [6]:
# List the available columns. Several names carry leading/trailing spaces
# (e.g. 'Life expectancy ', ' BMI '), so they must be typed exactly as shown.
df.columns

Index(['Country', 'Year', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling', 'Status_Developed',
       'Status_Developing'],
      dtype='object')

In [7]:
# Feature matrix: every column except 'Country' and the target column.
X = df.drop(columns=['Country', 'Life expectancy '])

In [8]:
# Regression target; note that the column name ends with a trailing space.
y = df['Life expectancy ']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Fit the linear model on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

In [12]:
# Predict on both splits so train and test error can be compared later.
y_pred_train = model.predict(X=X_train)
y_pred_test = model.predict(X=X_test)

In [13]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the held-out test split.
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)

print(f"Test MSE: {mse_test:.4f}")


Test MSE: 12.0188


> Test MSE before transformation is 14.2255 

> After transformation: 13.2045

> After scaling: 12.0188

In [14]:
# Mean squared error of the linear model on the data it was fitted on.
mse_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)

print(f"Train MSE: {mse_train:.4f}")

Train MSE: 13.8513


> Train MSE before transformation is 15.4997

> After transformation: 15.3956

> After scaling: 13.8513

In [15]:
from sklearn.metrics import r2_score

# Coefficient of determination for the linear model on each split.
r2_train, r2_test = (
    r2_score(y_train, y_pred_train),
    r2_score(y_test, y_pred_test),
)

print(
    f"Train R² Score: {r2_train:.4f}",
    f"Test R² Score: {r2_test:.4f}",
    sep="\n",
)

Train R² Score: 0.8484
Test R² Score: 0.8613


> Before transformation: Train R² Score: 0.8304, Test R² Score: 0.8359

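> After transformation: Train R² Score: 0.8304, Test R² Score: 0.8359 *(note: these are identical to the pre-transformation scores even though the MSE changed, so this line looks copy-pasted; on the same split the recorded MSEs imply roughly Train ≈ 0.8315, Test ≈ 0.8476 — re-run to confirm)*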

> After scaling: Train R² Score: 0.8484, Test R² Score: 0.8613

In [16]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error for the linear model on each split
# (same units as the target).
mae_train, mae_test = (
    mean_absolute_error(y_train, y_pred_train),
    mean_absolute_error(y_test, y_pred_test),
)

print(
    f"Train MAE: {mae_train:.4f}",
    f"Test MAE: {mae_test:.4f}",
    sep="\n",
)

Train MAE: 2.7678
Test MAE: 2.5973


> After scaling: Train MAE: 2.7678, Test MAE: 2.5973

In [17]:
from sklearn.metrics import mean_squared_error

# Train and test MSE of the linear model side by side
# (recomputes the values already printed by the earlier MSE cells).
mse_train, mse_test = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)

print(
    f"Train MSE: {mse_train:.4f}",
    f"Test MSE: {mse_test:.4f}",
    sep="\n",
)


Train MSE: 13.8513
Test MSE: 12.0188


> before transformation: Train MSE: 15.4997, Test MSE: 14.2255

> after transformation: Train MSE: 15.3956, Test MSE: 13.2045

> After scaling: Train MSE: 13.8513, Test MSE: 12.0188

## Random Forest

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
# Random forest with default hyperparameters. The seed is fixed (matching the
# seed used for the train/test split) so that bootstrap sampling and feature
# selection are reproducible on Restart & Run All. Without it every run trains
# a slightly different forest, so metrics recorded from an unseeded run will
# differ a little from a seeded re-run.
rf = RandomForestRegressor(random_state=42)

In [20]:
# Fit the random forest on the same training split used for the linear model.
rf.fit(X_train, y_train)

In [21]:
# Test-set predictions from the random forest (the next cell computes the same
# predictions again under the name y_pred_test_rf).
y_pred = rf.predict(X_test)

In [22]:
# Random-forest predictions on both splits, kept under model-specific names so
# they do not overwrite the linear-regression predictions.
y_pred_train_rf = rf.predict(X=X_train)
y_pred_test_rf = rf.predict(X=X_test)

In [23]:
# Test-set metrics for the random forest: mean squared error and R².
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Printed unrounded, MSE first and R² second.
print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')

Mean Squared Error: 2.693833658163258
R² Score: 0.9689156570419928


In [24]:
from sklearn.metrics import r2_score

# R² of the random forest on each split; a large train/test gap would
# indicate over-fitting.
r2_train, r2_test = (
    r2_score(y_train, y_pred_train_rf),
    r2_score(y_test, y_pred_test_rf),
)

print(
    f"Train R² Score: {r2_train:.4f}",
    f"Test R² Score: {r2_test:.4f}",
    sep="\n",
)

Train R² Score: 0.9942
Test R² Score: 0.9689


In [25]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the random forest on each split.
mae_train, mae_test = (
    mean_absolute_error(y_train, y_pred_train_rf),
    mean_absolute_error(y_test, y_pred_test_rf),
)

print(
    f"Train MAE: {mae_train:.4f}",
    f"Test MAE: {mae_test:.4f}",
    sep="\n",
)

Train MAE: 0.4462
Test MAE: 1.0845


In [26]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the random forest on each split.
mse_train, mse_test = (
    mean_squared_error(y_train, y_pred_train_rf),
    mean_squared_error(y_test, y_pred_test_rf),
)

print(
    f"Train MSE: {mse_train:.4f}",
    f"Test MSE: {mse_test:.4f}",
    sep="\n",
)


Train MSE: 0.5284
Test MSE: 2.6938


In [30]:
from sklearn.dummy import DummyRegressor

# Naive reference model: always predicts the mean of the training target.
# Its test MAE is the number the real models have to beat.
baseline = DummyRegressor(strategy="mean")
baseline.fit(X_train, y_train)

y_baseline_pred = baseline.predict(X_test)
baseline_mae = mean_absolute_error(y_true=y_test, y_pred=y_baseline_pred)

print(f"Baseline MAE: {baseline_mae}")

Baseline MAE: 7.650446519033146


- Save the model and the train/test splits for hyperparameter tuning

In [29]:
import os
from pathlib import Path

import joblib

# Persist the fitted random forest (`rf`) and the exact train/test split so the
# hyperparameter-tuning step can reload them.
# The output directory defaults to the original location, but it can be
# redirected with the LIFE_EXPECTANCY_MODELS_DIR environment variable so the
# notebook also runs on machines where that absolute path does not exist.
MODELS_DIR = Path(
    os.environ.get(
        "LIFE_EXPECTANCY_MODELS_DIR",
        "/home/davidkibet/Desktop/Life Expectancy ML/models",
    )
)
SPLITS_DIR = MODELS_DIR / "train_test_sets"

# joblib.dump does not create missing directories, so make sure they exist.
SPLITS_DIR.mkdir(parents=True, exist_ok=True)

# Save the fitted random-forest model
joblib.dump(rf, MODELS_DIR / "rf_model.pkl")

# Save training and test sets
joblib.dump(X_train, SPLITS_DIR / "X_train.pkl")
joblib.dump(X_test, SPLITS_DIR / "X_test.pkl")
joblib.dump(y_train, SPLITS_DIR / "y_train.pkl")
joblib.dump(y_test, SPLITS_DIR / "y_test.pkl")

['/home/davidkibet/Desktop/Life Expectancy ML/models/train_test_sets/y_test.pkl']