### Model Training

#### Importing the Pandas, NumPy, Matplotlib and Seaborn Libraries

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

#### Import the CSV Data as Pandas DataFrame

In [13]:
# Location of the dataset, given relative to the working directory instead of
# a machine-specific absolute "C:/Users/..." path, so the notebook can run on
# any checkout of the project.
# NOTE(review): assumes the kernel's working directory is the folder that
# contains this notebook and its `data/` sub-folder — adjust DATA_PATH otherwise.
DATA_PATH = "data/insurance.csv"

df = pd.read_csv(DATA_PATH)

#### Show Top 5 Records

In [14]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


### Preprocessing

In [4]:
# Partition the column names by dtype: object-dtype columns are treated as
# categorical, numeric-dtype columns as numeric features.
cat_columns = list(df.select_dtypes(include=object).columns)
numeric_columns = list(df.select_dtypes(include=np.number).columns)

# 'expenses' is the prediction target, not a feature, so drop it from the list.
numeric_columns.remove('expenses')

In [5]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column of `df` in place.
#
# Each fitted encoder is stored in `label_encoders`, keyed by column name, so
# the exact same category -> integer mapping can be re-applied to new data
# later. Previously `le` was overwritten on every iteration and only the
# encoder of the last column survived the loop.
label_encoders = {}

for col in cat_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

    # Show the learned mapping for this column (category label -> integer code).
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(le_name_mapping)

{'female': 0, 'male': 1}
{'no': 0, 'yes': 1}
{'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3}


In [6]:
# Min-max scale each numeric feature column of `df` in place, using one
# dedicated scaler per column. The three scaler objects stay available under
# their own names because the prediction cell re-uses them on new inputs.
# NOTE(review): the scalers are fitted on the whole frame, i.e. before the
# train/test split performed in the modelling cell.

from sklearn.preprocessing import MinMaxScaler

scaler_age, scaler_bmi, scaler_children = MinMaxScaler(), MinMaxScaler(), MinMaxScaler()

for column_name, column_scaler in (
    ("age", scaler_age),
    ("bmi", scaler_bmi),
    ("children", scaler_children),
):
    # The scaler expects a 2-D array, hence the reshape to a single column.
    column_values = df[column_name].to_numpy().reshape(-1, 1)
    df[column_name] = column_scaler.fit_transform(column_values)

### Modelling

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Seed shared by the split and by the estimators that use randomness, so that
# re-running the notebook reproduces the same metrics.
RANDOM_STATE = 42


def _mse_mae(y_true, y_pred):
    """Return the pair ``(MSE, MAE)`` of ``y_pred`` measured against ``y_true``."""
    return mean_squared_error(y_true, y_pred), mean_absolute_error(y_true, y_pred)


# Split data into features (X) and target (y); "expenses" is the value being predicted.
X = df.drop(columns=["expenses"])
y = df["expenses"]

# Split data into train and test sets (30% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

# Initialize models. The tree-based estimators are seeded explicitly; left
# unseeded, their results can differ from one run to the next.
linear_reg = LinearRegression()
tree_reg = DecisionTreeRegressor(random_state=RANDOM_STATE)
boosting_reg = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Fit models to training data
linear_reg.fit(X_train, y_train)
tree_reg.fit(X_train, y_train)
boosting_reg.fit(X_train, y_train)

# Predictions on train data
y_train_pred_linear = linear_reg.predict(X_train)
y_train_pred_tree = tree_reg.predict(X_train)
y_train_pred_boosting = boosting_reg.predict(X_train)

# Predictions on test data
y_pred_linear = linear_reg.predict(X_test)
y_pred_tree = tree_reg.predict(X_test)
y_pred_boosting = boosting_reg.predict(X_test)

# Evaluate performance on Linear Regression model
mse_linear, mae_linear = _mse_mae(y_test, y_pred_linear)
mse_linear_train, mae_linear_train = _mse_mae(y_train, y_train_pred_linear)

# Evaluate performance on Decision Tree Model
mse_tree, mae_tree = _mse_mae(y_test, y_pred_tree)
mse_tree_train, mae_tree_train = _mse_mae(y_train, y_train_pred_tree)

# Evaluate performance on Gradient Boosting Model
mse_boosting, mae_boosting = _mse_mae(y_test, y_pred_boosting)
mse_boosting_train, mae_boosting_train = _mse_mae(y_train, y_train_pred_boosting)

# Report train metrics first, then test metrics, so the two can be compared per model.
print(f"Linear Regression - Train - MSE: {mse_linear_train:.4f}, MAE: {mae_linear_train:.4f}")
print(f"Decision Tree Regression Train - MSE: {mse_tree_train:.4f}, MAE: {mae_tree_train:.4f}")
print(f"Gradient Boosting Regression Train - MSE: {mse_boosting_train:.4f}, MAE: {mae_boosting_train:.4f}")
print("")
print(f"Linear Regression - Test - MSE: {mse_linear:.4f}, MAE: {mae_linear:.4f}")
print(f"Decision Tree Regression - Test - MSE: {mse_tree:.4f}, MAE: {mae_tree:.4f}")
print(f"Gradient Boosting Regression - Test - MSE: {mse_boosting:.4f}, MAE: {mae_boosting:.4f}")

Linear Regression - Train - MSE: 37749430.5017, MAE: 4251.2193
Decision Tree Regression Train - MSE: 0.0000, MAE: 0.0000
Gradient Boosting Regression Train - MSE: 14385421.0301, MAE: 2099.2990

Linear Regression - Test - MSE: 33802383.5549, MAE: 4154.7004
Decision Tree Regression - Test - MSE: 41743394.7115, MAE: 2943.5257
Gradient Boosting Regression - Test - MSE: 19911832.1148, MAE: 2524.5173


### Hyperparameter Tuning using GridSearch

In [8]:
# Fine-tune a gradient boosting regressor (XGBoost's XGBRegressor) with a
# 5-fold cross-validated grid search scored on negative mean squared error.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Candidate values explored for each hyperparameter.
param_grid = {
    'max_depth': [5, 10, 15, 20, 25, 30],
    'learning_rate': [0.001, 0.01, 0.1, 0.5],
    'n_estimators': [100, 150, 200, 250, 300],
    'subsample': [0.6, 0.7, 0.8],
}

gbr = XGBRegressor(objective='reg:squarederror', random_state=42)

# Build the search and fit it on the training split in one expression.
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best negative MSE score:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.6}
Best negative MSE score: -23679971.247744087


In [9]:
# Constructor arguments for the final model: the tuned hyperparameters plus
# the fixed objective and seed.
# The dict is built as a *copy*: binding `grid_search.best_params_` directly
# and then adding keys would also modify the dict stored on `grid_search`, so
# it would no longer contain only the tuned hyperparameters.
best_params = {
    **grid_search.best_params_,
    'objective': 'reg:squarederror',
    'random_state': 42,
}

In [10]:
# Train an XGBRegressor with the tuned hyperparameters on the training split.
boosting_reg_best = XGBRegressor(**best_params)
boosting_reg_best.fit(X_train, y_train)

# Predict both splits up front, then score them with the same three metrics.
y_train_pred_boosting_best = boosting_reg_best.predict(X_train)
y_pred_boosting_best = boosting_reg_best.predict(X_test)

# Train-split scores: MSE, MAE, R2 (in that order).
mse_boosting_train_best, mae_boosting_train_best, r2_boosting_train_best = (
    metric(y_train, y_train_pred_boosting_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Test-split scores: MSE, MAE, R2 (in that order).
mse_boosting_best, mae_boosting_best, r2_boosting_best = (
    metric(y_test, y_pred_boosting_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Best Gradient Boosting Regression Train - MSE: {mse_boosting_train_best:.4f}, MAE: {mae_boosting_train_best:.4f}, R2_score: {r2_boosting_train_best:.4f}")
print(f"Best Gradient Boosting Regression - Test - MSE: {mse_boosting_best:.4f}, MAE: {mae_boosting_best:.4f}, R2_score: {r2_boosting_best:.4f}")

Best Gradient Boosting Regression Train - MSE: 15484631.4151, MAE: 1975.4561, R2_score: 0.8943
Best Gradient Boosting Regression - Test - MSE: 18834723.7576, MAE: 2098.2237, R2_score: 0.8715


### Getting predictions for new unseen data.

In [11]:
import joblib

# Save the model
# The dump call must actually run: the message below claims the file was
# written, and the next cell loads the model back from this file name.
# NOTE(review): only the model is persisted here; the fitted scalers used on
# new inputs (scaler_age, scaler_bmi, scaler_children) live only in this kernel.
model_filename = "boosting_reg_model_best.pkl"
joblib.dump(boosting_reg_best, model_filename)
print(f"Model saved as {model_filename}")

Model saved as boosting_reg_model_best.pkl


In [12]:
# Restore the persisted model from disk.
loaded_model = joblib.load("boosting_reg_model_best.pkl")

# One entry per feature, in the column order the model is given:
# (feature name, type the typed-in text is converted to, prompt shown to the user).
feature_prompts = [
    ("age", float, "Enter age: "),
    ("sex", int, "Enter sex (0 for female, 1 for male): "),
    ("bmi", float, "Enter BMI: "),
    ("children", int, "Enter number of children: "),
    ("smoker", int, "Enter smoker status (0 for no, 1 for yes): "),
    ("region", int, "Enter region (0 for northeast, 1 for northwest, 2 for southeast, 3 for southwest): "),
]

# Ask for every feature in order and convert each answer to its numeric type.
user_input = {name: convert(input(prompt)) for name, convert, prompt in feature_prompts}

# A single-row frame holding the user's record.
user_df = pd.DataFrame([user_input])

# Apply the scalers fitted earlier in the notebook to the numeric features.
for feature, fitted_scaler in (
    ("age", scaler_age),
    ("bmi", scaler_bmi),
    ("children", scaler_children),
):
    user_df[feature] = fitted_scaler.transform(user_df[feature].to_numpy().reshape(-1, 1))

# Score the record and report the single predicted value.
predicted_expenses = loaded_model.predict(user_df)
print(f"Predicted expenses: ${predicted_expenses[0]:.2f}")

Enter age: 19
Enter sex (0 for female, 1 for male): 0
Enter BMI: 27.9
Enter number of children: 0
Enter smoker status (0 for no, 1 for yes): 1
Enter region (0 for northeast, 1 for northwest, 2 for southeast, 3 for southwest): 3
Predicted expenses: $16753.32
