In [1]:
# train_models.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import joblib
import warnings

# Suppress every warning for the rest of the session: with no category or
# message given, the 'ignore' action applies to all Warning subclasses.
# NOTE(review): this also hides deprecation/numerical warnings from the
# libraries imported above — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [2]:
# --- 1. Load and Preprocess Data ---
# Read the processed insurance table and replace the target column with its
# log1p value, i.e. log(1 + expenses). Anything predicted on this scale has to
# be mapped back with np.expm1 to get an amount in the original units.
df = pd.read_csv('insurance_processed.csv').assign(
    expenses=lambda frame: np.log1p(frame['expenses'])
)


In [3]:
# Columns expanded into dummy indicators. drop_first=True omits the first
# level of each column, so that level is the implicit baseline.
categorical_cols = ['sex', 'smoker', 'region']

# Encode, then discard 'bmi_category' (per the original note it was only
# used for EDA).
df_encoded = (
    pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    .drop(columns=['bmi_category'])
)

# Separate the (log-scaled) target from the feature matrix.
y = df_encoded['expenses']
X = df_encoded.drop(columns=['expenses'])

# Hold out 20% of the rows with a fixed seed. X_train / y_train contain the
# remaining 80% of the data, not the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data loaded and preprocessed.")

Data loaded and preprocessed.


In [7]:
# --- 2. Train and Save Models ---

# Create the four regressors up front. The notebook-level names are kept so
# the fitted estimators stay reachable after this cell has run.
lr_model = LinearRegression()
ridge_model = Ridge(alpha=1.0)
gbr_model = GradientBoostingRegressor(random_state=42)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# One row per model: (estimator, output file, message printed when done).
# Rows are processed top to bottom.
training_plan = (
    (lr_model, 'linear_regression_model.pkl',
     "Linear Regression model trained and saved."),
    (ridge_model, 'ridge_regression_model.pkl',
     "Ridge Regression model trained and saved."),
    (gbr_model, 'gradient_boosting_model.pkl',
     "Gradient Boosting model trained and saved."),
    (rf_model, 'random_forest_model.pkl',
     "Random Forest model trained and saved."),
)

# Fit each estimator on the training split, pickle it with joblib, then
# report completion.
for estimator, pickle_path, done_message in training_plan:
    estimator.fit(X_train, y_train)
    joblib.dump(estimator, pickle_path)
    print(done_message)



Linear Regression model trained and saved.
Ridge Regression model trained and saved.
Gradient Boosting model trained and saved.
Random Forest model trained and saved.


In [9]:
# Persist the feature column index of X (names and their order) so the app
# can arrange its inputs the same way.
feature_columns = X.columns
joblib.dump(feature_columns, 'model_columns.pkl')
print("Model columns saved.")

# Final status line; the leading newline leaves a blank line before it.
print("\nAll models have been trained and saved successfully!")

Model columns saved.

All models have been trained and saved successfully!
