In [7]:
import optuna
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
import pandas as pd
import numpy as np


In [8]:
df = pd.read_csv(r"C:\Users\aarya\OneDrive\Desktop\College stuff\VIT\SY\S4\Data Science\CP\insurance.csv")

df['charges'] = np.log1p(df['charges'])  # log target
df['bmi_over_30'] = (df['bmi'] > 30).astype(int)
df['age_bmi_interaction'] = df['age'] * df['bmi']
df['smoker_age_interaction'] = df['age'] * (df['smoker'] == 'yes').astype(int)
df['children_over_2'] = (df['children'] > 2).astype(int)

X = df.drop('charges', axis=1)
y = df['charges']

numeric_features = ['age', 'bmi', 'children', 'age_bmi_interaction', 'smoker_age_interaction']
categorical_features = ['sex', 'smoker', 'region']
binary_features = ['bmi_over_30', 'children_over_2']

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(drop='first'), categorical_features),
    ('bin', 'passthrough', binary_features)
])


In [9]:
# Define columns
numeric_features = ['age', 'bmi', 'children', 'age_bmi_interaction', 'smoker_age_interaction']
categorical_features = ['sex', 'smoker', 'region']
binary_features = ['bmi_over_30', 'children_over_2']

# Column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(drop='first'), categorical_features),
    ('bin', 'passthrough', binary_features)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
def objective_rf(trial):
    model = RandomForestRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        random_state=42
    )
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    score = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
    return -score

study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(objective_rf, n_trials=30)
print("Best RF Params:", study_rf.best_params)


[I 2025-04-10 15:13:23,105] A new study created in memory with name: no-name-efe72a35-dee1-4d3e-9479-75f834f4561e


[I 2025-04-10 15:13:24,654] Trial 0 finished with value: 0.2196499580394497 and parameters: {'n_estimators': 184, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.2196499580394497.
[I 2025-04-10 15:13:27,260] Trial 1 finished with value: 0.2023736347677491 and parameters: {'n_estimators': 253, 'max_depth': 14, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 0.2023736347677491.
[I 2025-04-10 15:13:29,483] Trial 2 finished with value: 0.20833447921579779 and parameters: {'n_estimators': 269, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 1 with value: 0.2023736347677491.
[I 2025-04-10 15:13:31,425] Trial 3 finished with value: 0.2016606003360429 and parameters: {'n_estimators': 156, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.201660600336042

Best RF Params: {'n_estimators': 240, 'max_depth': 14, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt'}


In [12]:
def objective_gbr(trial):
    model = GradientBoostingRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        subsample=trial.suggest_float('subsample', 0.7, 1.0),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        random_state=42
    )
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    score = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
    return -score

study_gbr = optuna.create_study(direction='minimize')
study_gbr.optimize(objective_gbr, n_trials=30)
print("Best GBR Params:", study_gbr.best_params)


[I 2025-04-10 15:15:48,105] A new study created in memory with name: no-name-2d756dd6-bea4-4f63-a291-018594975244
[I 2025-04-10 15:15:51,548] Trial 0 finished with value: 0.22861038898592811 and parameters: {'n_estimators': 210, 'learning_rate': 0.13786710764990923, 'max_depth': 7, 'subsample': 0.8436986236849565, 'min_samples_split': 4}. Best is trial 0 with value: 0.22861038898592811.
[I 2025-04-10 15:15:56,590] Trial 1 finished with value: 0.23953224450683358 and parameters: {'n_estimators': 294, 'learning_rate': 0.15849716061463578, 'max_depth': 8, 'subsample': 0.7871396659168188, 'min_samples_split': 6}. Best is trial 0 with value: 0.22861038898592811.
[I 2025-04-10 15:15:59,958] Trial 2 finished with value: 0.2243166270625075 and parameters: {'n_estimators': 213, 'learning_rate': 0.08236003118570509, 'max_depth': 8, 'subsample': 0.7050612498739229, 'min_samples_split': 8}. Best is trial 2 with value: 0.2243166270625075.
[I 2025-04-10 15:16:03,188] Trial 3 finished with value: 0.2

Best GBR Params: {'n_estimators': 237, 'learning_rate': 0.02998569539846014, 'max_depth': 3, 'subsample': 0.7572133914606762, 'min_samples_split': 7}


In [13]:
def objective_xgb(trial):
    model = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        subsample=trial.suggest_float('subsample', 0.7, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.7, 1.0),
        random_state=42
    )
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    score = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
    return -score

study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=30)
print("Best XGB Params:", study_xgb.best_params)


[I 2025-04-10 15:18:04,767] A new study created in memory with name: no-name-35ede6fe-991f-488c-b0ed-dd62045ec85e
[I 2025-04-10 15:18:07,265] Trial 0 finished with value: 0.24363285096999604 and parameters: {'n_estimators': 300, 'learning_rate': 0.1290723635772775, 'max_depth': 9, 'subsample': 0.8853262311950173, 'colsample_bytree': 0.8205174670171949}. Best is trial 0 with value: 0.24363285096999604.
[I 2025-04-10 15:18:07,662] Trial 1 finished with value: 0.19439534499438024 and parameters: {'n_estimators': 137, 'learning_rate': 0.055553216920638304, 'max_depth': 3, 'subsample': 0.9645983856051954, 'colsample_bytree': 0.9277748836443069}. Best is trial 1 with value: 0.19439534499438024.
[I 2025-04-10 15:18:08,214] Trial 2 finished with value: 0.19910204029204534 and parameters: {'n_estimators': 254, 'learning_rate': 0.05766736883077769, 'max_depth': 3, 'subsample': 0.9857120957428618, 'colsample_bytree': 0.8133055440617685}. Best is trial 1 with value: 0.19439534499438024.
[I 2025-04

Best XGB Params: {'n_estimators': 128, 'learning_rate': 0.04075690635874908, 'max_depth': 4, 'subsample': 0.7831770079912321, 'colsample_bytree': 0.9501874235117532}


In [14]:
rf = RandomForestRegressor(
    n_estimators=240,
    max_depth=14,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

gbr = GradientBoostingRegressor(
    n_estimators=237,
    learning_rate=0.02998569539846014,
    max_depth=3,
    subsample=0.7572133914606762,
    min_samples_split=7,
    random_state=42
)

xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=128,
    learning_rate=0.04075690635874908,
    max_depth=4,
    subsample=0.7831770079912321,
    colsample_bytree=0.9501874235117532,
    random_state=42
)

In [15]:
meta_model = GradientBoostingRegressor(
    n_estimators=50, learning_rate=0.1, max_depth=2, random_state=42
)

In [16]:
stacked_reg = StackingRegressor(
    estimators=[('rf', rf), ('gbr', gbr), ('xgb', xgb_model)],
    final_estimator=meta_model,
    passthrough=False,
    n_jobs=-1
)

In [17]:
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', stacked_reg)
])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
final_pipeline.fit(X_train, y_train)

In [20]:
y_pred_log = final_pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

In [21]:
mae = mean_absolute_error(y_test_actual, y_pred)
mse = mean_squared_error(y_test_actual, y_pred)
r2 = r2_score(y_test_actual, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

Mean Absolute Error (MAE): 1996.09
Mean Squared Error (MSE): 19154794.71
R-squared (R²): 0.88


In [22]:
import joblib

joblib.dump(final_pipeline, 'insurance_model.pkl')


['insurance_model.pkl']