In [7]:
import json

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [8]:
# --- Load data ---
# Read the insurance table; 'charges' is the regression target, every other
# column is a predictor.
df = pd.read_csv('insurance.csv')
y = df['charges']
X = df.drop('charges', axis=1)

# --- Split ---
# Hold out 20% of the rows for the final test evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Preprocessing ---
# Columns are routed by type: categoricals are one-hot encoded (first level
# dropped), numericals are standardised.
categorical_features = ['sex', 'smoker', 'region']
numerical_features = ['age', 'bmi', 'children']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features),
        ('num', StandardScaler(), numerical_features),
    ]
)

In [9]:
# --- Define pipelines and parameter grids ---
# Maps a display name to an (unfitted estimator, hyperparameter grid) pair.
# Grid keys carry the `model__` prefix so that they address the 'model' step
# of the Pipeline each estimator is wrapped in during the grid search.
model_pipelines = {
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        dict(
            model__n_estimators=[100, 200],
            model__max_depth=[None, 10, 20],
        ),
    ),
    'Gradient Boosting': (
        GradientBoostingRegressor(random_state=42),
        dict(
            model__n_estimators=[100, 200],
            model__learning_rate=[0.05, 0.1],
            model__max_depth=[3, 4, 5],
        ),
    ),
    'XGBoost': (
        XGBRegressor(random_state=42),
        dict(
            model__n_estimators=[100, 200],
            model__learning_rate=[0.05, 0.1],
            model__max_depth=[3, 4, 5],
        ),
    ),
    'CatBoost': (
        CatBoostRegressor(verbose=0, random_state=42),
        dict(
            model__iterations=[300, 500],
            model__learning_rate=[0.05, 0.1],
            model__depth=[4, 6],
        ),
    ),
    'SVR': (
        SVR(),
        dict(
            model__C=[10, 100],
            model__epsilon=[1, 10],
        ),
    ),
    'KNN': (
        KNeighborsRegressor(),
        dict(model__n_neighbors=[3, 5, 7]),
    ),
}

# --- Perform Grid Search ---
# For every candidate model: wrap it behind the shared preprocessor, run a
# 5-fold grid search that minimises MSE, keep the refitted winner and report it.
best_models = {}
for name, spec in model_pipelines.items():
    estimator, param_grid = spec
    search = GridSearchCV(
        Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)]),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)

    best_models[name] = search.best_estimator_
    # The scorer is negated MSE, so flip the sign to print a plain MSE.
    print(f"{name} best params: {search.best_params_}")
    print(f"{name} best score: {-search.best_score_:.2f}")

Random Forest best params: {'model__max_depth': 10, 'model__n_estimators': 200}
Random Forest best score: 24423371.27
Gradient Boosting best params: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 100}
Gradient Boosting best score: 21578501.57
XGBoost best params: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 100}
XGBoost best score: 21407061.15
CatBoost best params: {'model__depth': 4, 'model__iterations': 300, 'model__learning_rate': 0.05}
CatBoost best score: 21543406.33
SVR best params: {'model__C': 100, 'model__epsilon': 1}
SVR best score: 147071212.90
KNN best params: {'model__n_neighbors': 3}
KNN best score: 43892184.34


In [10]:
# --- Performance Summary Function ---
def summarize_cv_performance(model, X, y):
    """Return the 5-fold cross-validated (MAE, RMSE, R²) of ``model``.

    All three metrics are collected from a single ``cross_validate`` run, so
    the estimator is fitted once per fold instead of once per fold *per
    metric*.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) to evaluate; passed straight to
        ``cross_validate``.
    X, y
        Features and target to cross-validate on.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` where ``mae`` and ``r2`` are the means of the
        per-fold scores and ``rmse`` is the square root of the mean per-fold
        MSE.
    """
    scores = cross_validate(
        model, X, y,
        cv=5,
        scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'),
        n_jobs=-1,
    )
    # The error scorers are negated ("higher is better"), so flip the sign back.
    mae = -scores['test_neg_mean_absolute_error'].mean()
    rmse = np.sqrt(-scores['test_neg_mean_squared_error'].mean())
    r2 = scores['test_r2'].mean()
    return mae, rmse, r2

# --- Perform Grid Search and Summarize With Cross Validation ---
# Every candidate is scored on MAE, MSE and R² during the search itself, so the
# cross-validated metrics of the winning configuration are read directly from
# `cv_results_`. Re-running cross-validation on the best estimator afterwards
# would repeat the same folds with the same hyperparameters.
cv_scoring = {
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'r2': 'r2',
}

best_models = {}
performance_summary = {}
for name, (model, params) in model_pipelines.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    # refit='mse' keeps model selection on (negated) MSE; the other two
    # metrics are recorded for reporting only.
    grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring=cv_scoring,
                        refit='mse', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_

    # Row of cv_results_ that belongs to the selected hyperparameters.
    best = grid.best_index_
    # Error scorers are negated, so flip the sign back before reporting.
    mae = -grid.cv_results_['mean_test_mae'][best]
    rmse = np.sqrt(-grid.cv_results_['mean_test_mse'][best])
    r2 = grid.cv_results_['mean_test_r2'][best]
    print(f"\n{name} Best Parameters: {grid.best_params_}")
    print(f"{name} Cross-Validated MAE: {mae:.2f}")
    print(f"{name} Cross-Validated RMSE: {rmse:.2f}")
    print(f"{name} Cross-Validated R²: {r2:.4f}")
    
    


Random Forest Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 200}
Random Forest Cross-Validated MAE: 2770.14
Random Forest Cross-Validated RMSE: 4942.00
Random Forest Cross-Validated R²: 0.8273

Gradient Boosting Best Parameters: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 100}
Gradient Boosting Cross-Validated MAE: 2636.92
Gradient Boosting Cross-Validated RMSE: 4645.27
Gradient Boosting Cross-Validated R²: 0.8472

XGBoost Best Parameters: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 100}
XGBoost Cross-Validated MAE: 2609.18
XGBoost Cross-Validated RMSE: 4626.78
XGBoost Cross-Validated R²: 0.8484

CatBoost Best Parameters: {'model__depth': 4, 'model__iterations': 300, 'model__learning_rate': 0.05}
CatBoost Cross-Validated MAE: 2616.37
CatBoost Cross-Validated RMSE: 4641.49
CatBoost Cross-Validated R²: 0.8475

SVR Best Parameters: {'model__C': 100, 'model__epsilon': 1}
SVR Cross-Validated MAE: 6484.53
S

In [12]:
# --- Train & Evaluate ---
# Score each tuned pipeline on the held-out test split, record the metrics in
# `performance_summary` (created in the grid-search cell) and write them to JSON.
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the RMSE on every version.
    # Cast to built-in floats so the values serialise to JSON cleanly.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2 = float(r2_score(y_test, y_pred))
    print(f"{name} — Tuned Test RMSE: {rmse:.2f}, R²: {r2:.4f}")

    performance_summary[name] = {
        "rmse": round(rmse, 2),
        "r2": round(r2, 4)
    }

# 🔹 Save to JSON
with open("model_performance_summary.json", "w") as f:
    json.dump(performance_summary, f, indent=4)

print("✅ Model performance summary saved to 'model_performance_summary.json'")


Random Forest — Tuned Test RMSE: 4551.10, R²: 0.8666
Gradient Boosting — Tuned Test RMSE: 4294.46, R²: 0.8812
XGBoost — Tuned Test RMSE: 4236.27, R²: 0.8844
CatBoost — Tuned Test RMSE: 4248.40, R²: 0.8837
SVR — Tuned Test RMSE: 12112.94, R²: 0.0549
KNN — Tuned Test RMSE: 6648.87, R²: 0.7152
✅ Model performance summary saved to 'model_performance_summary.json'


In [14]:
# --- Train & Evaluate, Printing Best Params ---
# Each entry of `best_models` is already a fitted preprocessor + model
# pipeline, so it is evaluated and saved as-is. Wrapping it in another
# Pipeline would apply the preprocessor twice.
trained_pipelines = {}

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Full hyperparameter set of the tuned estimator (tuned values and defaults)
    model_params = model.named_steps['model'].get_params()

    print(f"\n{name} — Tuned Test RMSE: {rmse:.2f}, R²: {r2:.4f}")
    print(f"{name} Best Parameters:")
    for param, val in model_params.items():
        print(f"  {param}: {val}")

    # Save the trained pipeline
    trained_pipelines[name] = model  # model IS the full fitted pipeline
    # File name is the display name lower-cased with spaces replaced by
    # underscores, plus '_pipeline.pkl'.
    joblib.dump(model, f'{name.replace(" ", "_").lower()}_pipeline.pkl')




Random Forest — Tuned Test RMSE: 4551.10, R²: 0.8666
Random Forest Best Parameters:
  bootstrap: True
  ccp_alpha: 0.0
  criterion: squared_error
  max_depth: 10
  max_features: 1.0
  max_leaf_nodes: None
  max_samples: None
  min_impurity_decrease: 0.0
  min_samples_leaf: 1
  min_samples_split: 2
  min_weight_fraction_leaf: 0.0
  n_estimators: 200
  n_jobs: None
  oob_score: False
  random_state: 42
  verbose: 0
  warm_start: False

Gradient Boosting — Tuned Test RMSE: 4294.46, R²: 0.8812
Gradient Boosting Best Parameters:
  alpha: 0.9
  ccp_alpha: 0.0
  criterion: friedman_mse
  init: None
  learning_rate: 0.05
  loss: squared_error
  max_depth: 3
  max_features: None
  max_leaf_nodes: None
  min_impurity_decrease: 0.0
  min_samples_leaf: 1
  min_samples_split: 2
  min_weight_fraction_leaf: 0.0
  n_estimators: 100
  n_iter_no_change: None
  random_state: 42
  subsample: 1.0
  tol: 0.0001
  validation_fraction: 0.1
  verbose: 0
  warm_start: False

XGBoost — Tuned Test RMSE: 4236.27,

In [10]:
# Dump the dict of fitted pipelines (keyed by model display name) for inspection.
# NOTE(review): the saved output below shows a nested 'regressor' Pipeline, which
# does not match what the training cell currently stores — re-run to refresh it.
print(trained_pipelines)

{'Random Forest': Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'smoker', 'region']),
                                                 ('num', StandardScaler(),
                                                  ['age', 'bmi',
                                                   'children'])])),
                ('regressor',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('cat',
                                                                   OneHotEncoder(drop='first'),
                                                                   ['sex',
                                                                    'smoker',
                                                                    'region']),
                                          