## **Notebook Objective**

This notebook:

* Trains multiple regression models
* Compares linear vs ensemble methods
* Evaluates performance using robust metrics
* Selects a champion model for interpretation

In [None]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

import matplotlib.pyplot as plt
import seaborn as sns

# Render every float in DataFrame/Series output with four decimal places.
pd.options.display.float_format = "{:.4f}".format

In [None]:
import joblib

# Locations of the modelling inputs, given relative to the working directory.
DATA_PATH = "../data/raw/insurance.csv"
PREPROCESSOR_PATH = "../data/processed/preprocessor.pkl"

# Tabular data that the feature/target selection below reads from.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only point PREPROCESSOR_PATH at artefacts produced by this project.
preprocessor = joblib.load(filename=PREPROCESSOR_PATH)

In [None]:
# Model inputs, grouped by how they are listed for the preprocessor.
numerical_features = [
    "age", "age_squared", "bmi", "children", "smoker_bmi_interaction"
]

categorical_features = [
    "sex", "region", "bmi_category"
]

# Fail fast with an actionable message: several of the inputs above are derived
# features, so a frame that has not been through feature engineering cannot be
# used here. Still a KeyError, matching what plain column selection would raise.
required_columns = numerical_features + categorical_features + ["charges"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Columns missing from {DATA_PATH}: {missing_columns}. "
        "Run the feature-engineering step (or point DATA_PATH at its output) "
        "before training."
    )

X = df[numerical_features + categorical_features]

# The target is modelled on the log1p scale; predictions are mapped back with
# np.expm1 when they are compared against actual charges.
y = np.log1p(df["charges"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Estimator or pipeline that has already been fitted.
    X_test : features passed straight to ``model.predict``.
    y_test : true target values, on the same scale the model predicts.

    Returns
    -------
    dict
        ``{"MAE": ..., "RMSE": ..., "R2": ...}`` computed between ``y_test``
        and the model's predictions. No scale conversion is applied, so the
        errors are in whatever units ``y_test`` uses.
    """
    y_pred = model.predict(X_test)

    scores = {}
    scores["MAE"] = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    squared_error = mean_squared_error(y_test, y_pred)
    scores["RMSE"] = np.sqrt(squared_error)
    scores["R2"] = r2_score(y_test, y_pred)
    return scores

In [None]:
# Baseline: ordinary least squares on the preprocessed features.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Pipeline
# keeps the step objects it is given, so passing `preprocessor` directly would make
# every pipeline in the notebook share, and refit, one transformer instance.
lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", LinearRegression())
    ]
)

lr_pipeline.fit(X_train, y_train)
lr_results = evaluate_model(lr_pipeline, X_test, y_test)
lr_results

In [None]:
# L2-regularised linear model (alpha=1.0).
# clone() gives this pipeline its own unfitted copy of the preprocessor. Pipeline
# keeps the step objects it is given, so passing `preprocessor` directly would make
# every pipeline in the notebook share, and refit, one transformer instance.
ridge_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", Ridge(alpha=1.0))
    ]
)

ridge_pipeline.fit(X_train, y_train)
ridge_results = evaluate_model(ridge_pipeline, X_test, y_test)
ridge_results

In [None]:
# L1-regularised linear model (alpha=0.001).
# clone() gives this pipeline its own unfitted copy of the preprocessor. Pipeline
# keeps the step objects it is given, so passing `preprocessor` directly would make
# every pipeline in the notebook share, and refit, one transformer instance.
lasso_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", Lasso(alpha=0.001))
    ]
)

lasso_pipeline.fit(X_train, y_train)
lasso_results = evaluate_model(lasso_pipeline, X_test, y_test)
lasso_results

In [None]:
# Random forest: 300 trees capped at depth 10, seeded for repeatable results.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Pipeline
# keeps the step objects it is given, so passing `preprocessor` directly would make
# every pipeline in the notebook share, and refit, one transformer instance.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", RandomForestRegressor(
            n_estimators=300,
            max_depth=10,
            random_state=42
        ))
    ]
)

rf_pipeline.fit(X_train, y_train)
rf_results = evaluate_model(rf_pipeline, X_test, y_test)
rf_results

In [None]:
# Gradient boosting: 300 depth-3 trees with learning rate 0.05, seeded for
# repeatable results.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Pipeline
# keeps the step objects it is given, so passing `preprocessor` directly would make
# every pipeline in the notebook share, and refit, one transformer instance.
gb_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", GradientBoostingRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=3,
            random_state=42
        ))
    ]
)

gb_pipeline.fit(X_train, y_train)
gb_results = evaluate_model(gb_pipeline, X_test, y_test)
gb_results

In [None]:
# Collect each model's metric dict under its display name: one row per model,
# one column per metric.
metrics_by_model = {
    "Linear Regression": lr_results,
    "Ridge Regression": ridge_results,
    "Lasso Regression": lasso_results,
    "Random Forest": rf_results,
    "Gradient Boosting": gb_results,
}
results_df = pd.DataFrame.from_dict(metrics_by_model, orient="index")

# Show the comparison with the lowest-RMSE model first.
results_df.sort_values(by="RMSE")

In [None]:
# Choose the champion from the comparison table rather than hard-coding it, so
# the diagnostics below always describe the model with the lowest test RMSE.
pipelines_by_name = {
    "Linear Regression": lr_pipeline,
    "Ridge Regression": ridge_pipeline,
    "Lasso Regression": lasso_pipeline,
    "Random Forest": rf_pipeline,
    "Gradient Boosting": gb_pipeline,
}
best_model_name = results_df["RMSE"].idxmin()
best_model = pipelines_by_name[best_model_name]
preds = best_model.predict(X_test)

# Residuals are on the log1p scale because that is the scale of y_test.
residuals = y_test - preds

ax = sns.scatterplot(x=preds, y=residuals)
ax.axhline(0, linestyle="--", color="red")  # zero-error reference line
ax.set_xlabel("Predicted log1p(charges)")
ax.set_ylabel("Residual (actual - predicted, log1p scale)")
ax.set_title(f"Residuals vs Predictions ({best_model_name})")
plt.show()

In [None]:
# Distribution of the champion's residuals (actual - predicted) on the log1p scale.
residuals = y_test - preds

ax = sns.histplot(residuals, kde=True)
# Label the axis explicitly; otherwise seaborn falls back to the Series name
# inherited from the target column, which does not describe a residual.
ax.set_xlabel("Residual (actual - predicted, log1p scale)")
ax.set_title("Prediction Error Distribution")
plt.show()

In [None]:
# Undo the log1p target transform so both columns are back in the units of the
# original charges.
actual_charges = np.expm1(y_test)
predicted_charges = np.expm1(preds)

# Side-by-side look at the first ten test rows.
charges_preview = {
    "Actual Charges": actual_charges[:10],
    "Predicted Charges": predicted_charges[:10],
}
pd.DataFrame(charges_preview)

In [None]:
# NOTE(review): every value below is hard-coded, not derived from the metrics
# computed in this notebook. Re-check them whenever the model comparison changes.
model_summary = dict([
    ("Best Performing Model", "Gradient Boosting Regressor"),
    ("Primary Metric", "RMSE (log-scale)"),
    ("Handles Nonlinearity", True),
    ("Captures Interactions", True),
    ("Deployment Ready", True),
])

# One-column table: summary keys as the index, their values under "Value".
pd.Series(model_summary, name="Value").to_frame()

## **Key Conclusions**

* Ensemble models significantly outperform linear baselines
* Log-transformed target improves stability
* Gradient Boosting offers the best bias–variance tradeoff
* Model is suitable for pricing, segmentation, and forecasting