### Model Training

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/Colab Notebooks/Insurance-Premium-Prediction

/content/drive/My Drive/Colab Notebooks/Insurance-Premium-Prediction


#### Importing NumPy, Pandas, Matplotlib, Seaborn and the os Module

In [3]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#### Import the CSV Data as Pandas DataFrame

In [4]:
# Read insurance.csv from the current working directory into a DataFrame.
df = pd.read_csv("./insurance.csv")

#### Show Top 5 Records

In [5]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


### Preprocessing

In [6]:
# Split the column names by dtype: object (categorical) columns vs. numeric columns.
cat_columns = list(df.select_dtypes(include=object).columns)
numeric_columns = list(df.select_dtypes(include=np.number).columns)

# 'expenses' is the prediction target, not a feature, so take it out of the numeric list.
numeric_columns.remove('expenses')

In [7]:
from sklearn.preprocessing import LabelEncoder

# Keep every fitted encoder, keyed by column name, so the same string -> integer
# mapping can be re-applied (or inverted) later instead of being discarded when
# the loop moves on to the next column.
label_encoders = {}

for col in cat_columns:

    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

    # Show which integer code each original label received; plain ints keep the
    # printed mapping free of np.int64(...) wrappers.
    le_name_mapping = {
        label: int(code)
        for label, code in zip(le.classes_, le.transform(le.classes_))
    }
    print(le_name_mapping)

{'female': np.int64(0), 'male': np.int64(1)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'northeast': np.int64(0), 'northwest': np.int64(1), 'southeast': np.int64(2), 'southwest': np.int64(3)}


In [8]:
# Min-max scale the numeric feature columns, one dedicated scaler per column.
# NOTE(review): the scalers are fitted on the whole DataFrame, i.e. before the
# train/test split performed later, so the learned min/max also cover test rows.

from sklearn.preprocessing import MinMaxScaler

scaler_age = MinMaxScaler()
scaler_bmi = MinMaxScaler()
scaler_children = MinMaxScaler()

# Each scaler keeps its own name so later cells can transform new inputs with it.
for column_name, column_scaler in [
    ('age', scaler_age),
    ('bmi', scaler_bmi),
    ('children', scaler_children),
]:
    # The scaler is fed a single-column 2-D array built from the Series.
    column_values = df[column_name].to_numpy().reshape(-1, 1)
    df[column_name] = column_scaler.fit_transform(column_values)

### Modelling

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def regression_metrics(y_true, y_pred):
    """Return the (MSE, MAE, R2) triple for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mean_squared_error, mean_absolute_error, r2_score)`` in that order.
    """
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Split data into features (X) and target (y)
X = df.drop(columns=["expenses"])
y = df["expenses"]

# Split data into train and test sets (30% held out, fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize models; the boosted models are seeded so re-running the cell
# reproduces the same fits, matching the seed used for the tuned model later on.
linear_reg = LinearRegression()
boosting_reg = GradientBoostingRegressor(random_state=42)
xgb_reg = XGBRegressor(random_state=42)

# Fit models to training data
linear_reg.fit(X_train, y_train)
boosting_reg.fit(X_train, y_train)
xgb_reg.fit(X_train, y_train)

# Predictions on train data
y_train_pred_linear = linear_reg.predict(X_train)
y_train_pred_boosting = boosting_reg.predict(X_train)
y_train_pred_xgb = xgb_reg.predict(X_train)

# Predictions on test data
y_pred_linear = linear_reg.predict(X_test)
y_pred_boosting = boosting_reg.predict(X_test)
y_pred_xgb = xgb_reg.predict(X_test)

# Evaluate performance on Linear Regression model
mse_linear, mae_linear, r2_linear = regression_metrics(y_test, y_pred_linear)
mse_linear_train, mae_linear_train, r2_linear_train = regression_metrics(y_train, y_train_pred_linear)

# Evaluate performance on Gradient Boosting Model
mse_boosting, mae_boosting, r2_boosting = regression_metrics(y_test, y_pred_boosting)
mse_boosting_train, mae_boosting_train, r2_boosting_train = regression_metrics(y_train, y_train_pred_boosting)

# Evaluate performance on XGBoost Model
mse_xgb, mae_xgb, r2_xgb = regression_metrics(y_test, y_pred_xgb)
mse_xgb_train, mae_xgb_train, r2_xgb_train = regression_metrics(y_train, y_train_pred_xgb)

# Train metrics first, then test metrics; every value is rounded to 4 decimals
# so the three models can be compared at a glance.
print(f"Linear Regression - Train - MSE: {mse_linear_train:.4f}, MAE: {mae_linear_train:.4f}, r2-score: {r2_linear_train:.4f}")

print(f"Gradient Boosting Regression Train - MSE: {mse_boosting_train:.4f}, MAE: {mae_boosting_train:.4f}, r2-score: {r2_boosting_train:.4f}")
print(f"XGBoost Regressor Train - MSE: {mse_xgb_train:.4f}, MAE: {mae_xgb_train:.4f}, r2-score: {r2_xgb_train:.4f}")
print()
print(f"Linear Regression - Test - MSE: {mse_linear:.4f}, MAE: {mae_linear:.4f}, r2-score: {r2_linear:.4f}")
print(f"Gradient Boosting Regression - Test - MSE: {mse_boosting:.4f}, MAE: {mae_boosting:.4f}, r2-score: {r2_boosting:.4f}")
print(f"XGBoost Regressor - Test - MSE: {mse_xgb:.4f}, MAE: {mae_xgb:.4f}, r2-score: {r2_xgb:.4f}")

Linear Regression - Train - MSE: 37749430.5017, MAE: 4251.2193, r2-score: 0.7423
Gradient Boosting Regression Train - MSE: 14385421.0301, MAE: 2099.2990, r2-score: 0.9018
XGBoost Regressor Train - MSE: 529622.6415, MAE: 434.5469, r2-score: 0.9964

Linear Regression - Test - MSE: 33802383.5549, MAE: 4154.7004, r2-score: 0.7695
Gradient Boosting Regression - Test - MSE: 19854401.1184, MAE: 2514.8970, r2-score: 0.8645899765708828
XGBoost Regressor - Test - MSE: 26176911.8287, MAE: 2909.2587, r2-score: 0.8214694957107828


### Hyperparameter Tuning using GridSearch

In [10]:
# Fine tuning XGBoost Regressor
from sklearn.model_selection import GridSearchCV

# Search space: 6 depths x 4 learning rates x 5 ensemble sizes x 3 subsample
# ratios = 360 candidates, each scored with 5-fold cross-validation.
param_grid = {
    'max_depth': [5, 10, 15, 20, 25, 30],
    'learning_rate': [0.001, 0.01, 0.1, 0.5],
    'n_estimators': [100, 150, 200, 250, 300],
    'subsample': [0.6, 0.7, 0.8],
}

gbr = XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # negated MSE: values closer to zero are better
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best Negative Mean Squared Error", grid_search.best_score_)

Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.6}
Best Negative Mean Squared Error -23068741.4292893


In [11]:
# Copy the winning hyperparameters instead of aliasing grid_search.best_params_,
# so adding the seed below does not modify the search object's own attribute.
best_params = dict(grid_search.best_params_)

# Fix the seed so the retrained model is reproducible.
best_params['random_state'] = 42

In [12]:
# Retrain XGBoost on the training split using the tuned hyperparameters.
xgb_reg_best = XGBRegressor(**best_params)
xgb_reg_best.fit(X_train, y_train)

# Predictions for the held-out split and for the training split.
y_pred_xgb_best = xgb_reg_best.predict(X_test)
y_train_pred_xgb_best = xgb_reg_best.predict(X_train)

# Test-set metrics, computed in the order MSE, MAE, R2.
mse_xgb_best, mae_xgb_best, r2_xgb_best = (
    metric(y_test, y_pred_xgb_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Training-set metrics, same order, for spotting over-fitting against the test figures.
mse_xgb_train_best, mae_xgb_train_best, r2_xgb_train_best = (
    metric(y_train, y_train_pred_xgb_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Best XGBoost Regressor Train - MSE: {mse_xgb_train_best:.4f}, MAE: {mae_xgb_train_best:.4f}, R2_score: {r2_xgb_train_best:.4f}")

print(f"Best XGBoost Regressor - Test - MSE: {mse_xgb_best:.4f}, MAE: {mae_xgb_best:.4f}, R2_score: {r2_xgb_best:.4f}")

Best XGBoost Regressor Train - MSE: 14398094.6348, MAE: 2241.2945, R2_score: 0.9017
Best XGBoost Regressor - Test - MSE: 18835527.2565, MAE: 2523.1229, R2_score: 0.8715


#### Saving Best Performing Model

In [13]:
import joblib

# Save the model
model_filename = "xgb_reg_model_best.pkl"
joblib.dump(xgb_reg_best, model_filename)
print(f"Model saved as {model_filename}")

# The model was trained on min-max scaled age/bmi/children, so outside this
# session it is only usable together with the fitted scalers; persist them too.
scalers_filename = "feature_scalers.pkl"
joblib.dump(
    {"age": scaler_age, "bmi": scaler_bmi, "children": scaler_children},
    scalers_filename,
)
print(f"Feature scalers saved as {scalers_filename}")

Model saved as xgb_reg_model_best.pkl


#### Getting predictions for new unseen data.

In [14]:
# Load the model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load artifacts produced by this notebook or another trusted source.
loaded_model = joblib.load("xgb_reg_model_best.pkl")

# Take user input (features) for prediction. Categorical features are entered
# as the integer codes printed by the label-encoding step.
# NOTE(review): the key order presumably has to match the training feature
# column order - verify against X_train.columns.
user_input = {
    "age": float(input("Enter age: ")),
    "sex": int(input("Enter sex (0 for female, 1 for male): ")),
    "bmi": float(input("Enter BMI: ")),
    "children": int(input("Enter number of children: ")),
    "smoker": int(input("Enter smoker status (0 for no, 1 for yes): ")),
    "region": int(input("Enter region (0 for northeast, 1 for northwest, 2 for southeast, 3 for southwest): "))
}

# Convert user input to a single-row DataFrame
user_df = pd.DataFrame([user_input])

# Normalize numerical columns with the scalers fitted on the training data
# (transform only - the scalers must not be re-fitted on the new row).
for column_name, column_scaler in [
    ('age', scaler_age),
    ('bmi', scaler_bmi),
    ('children', scaler_children),
]:
    user_df[column_name] = column_scaler.transform(
        user_df[column_name].to_numpy().reshape(-1, 1)
    )

# Make predictions with the model reloaded from disk, so this cell actually
# exercises the saved artifact rather than the in-memory estimator.
predicted_expenses = loaded_model.predict(user_df)
print(f"Predicted expenses: ${predicted_expenses[0]:.2f}")

Enter age: 34
Enter sex (0 for female, 1 for male): 1
Enter BMI: 22
Enter number of children: 2
Enter smoker status (0 for no, 1 for yes): 1
Enter region (0 for northeast, 1 for northwest, 2 for southeast, 3 for southwest): 2
Predicted expenses: $17793.15
