# Model Training and Evaluation Notebook

In [11]:
# Make the project root importable so that `src.*` modules resolve from this notebook.
# The root is read from the optional INSURANCE_PROJECT_ROOT environment variable;
# when that is unset it falls back to the parent of the current working directory,
# which matches the "../data" and "../models" relative paths used in later cells.
# (A hard-coded absolute path would only work on one machine.)
import sys
import os

PROJECT_ROOT = os.path.abspath(os.environ.get("INSURANCE_PROJECT_ROOT", os.pardir))
if PROJECT_ROOT not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(PROJECT_ROOT)

In [15]:
# -----------------------------------------------
# 🧩 Import libraries 
# -----------------------------------------------
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
from src.preprocessing import load_data, encode_data, split_by_smoker

In [57]:
# -----------------------------------------------
# ⚙️ Training and evaluation function
# -----------------------------------------------
def train_and_evaluate(df, model_name="Model", target="charges",
                       numeric_cols=("age", "bmi", "children"),
                       test_size=0.2, random_state=42):
    """
    Train a Linear Regression model on the given dataframe and print evaluation metrics.

    Steps:
    - Separates the ``target`` column from the remaining feature columns
    - Splits into train/test sets
    - Standardises ``numeric_cols``; the scaler is fitted on the training rows
      only and then applied to the test rows
    - Fits a LinearRegression on the training features
    - Prints MAE, RMSE and R2 computed on the test set

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target`` and every column in ``numeric_cols``. All other
        columns are passed to the model unscaled.
        NOTE(review): they are assumed to be numeric already (e.g. encoded
        upstream) -- confirm against the preprocessing code.
    model_name : str
        Label used in the printed report.
    target : str
        Name of the column to predict. Defaults to ``'charges'``.
    numeric_cols : sequence of str (or a single column name)
        Columns to standardise. Defaults to ``('age', 'bmi', 'children')``.
        Must not be empty.
    test_size : float or int
        Forwarded to ``train_test_split``. Defaults to ``0.2``.
    random_state : int or None
        Seed forwarded to ``train_test_split``. Defaults to ``42``.

    Returns
    -------
    (model, scaler)
        The fitted ``LinearRegression`` and the ``StandardScaler`` fitted on the
        training rows of ``numeric_cols``.

    Raises
    ------
    KeyError
        If ``target`` or any of ``numeric_cols`` is not a column of ``df``.
    """
    # A bare string would otherwise be split into single characters by list()
    if isinstance(numeric_cols, str):
        numeric_cols = [numeric_cols]
    else:
        numeric_cols = list(numeric_cols)

    # Fail early, naming every missing column at once
    missing = [col for col in [target, *numeric_cols] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in dataframe: {missing}")

    # Split into X and y
    X = df.drop(columns=[target])
    y = df[target]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Scale only the numeric columns; work on copies so the split frames stay untouched
    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    # transform (not fit_transform): the test rows must not influence the scaling statistics
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

    # Train model
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = model.predict(X_test_scaled)

    # Evaluation
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"\n🔹 Results for {model_name}:")
    print(f"   MAE  = {mae:.2f}")
    print(f"   RMSE = {rmse:.2f}")
    print(f"   R²   = {r2:.4f}")

    return model, scaler


In [45]:
# -----------------------------------------------
# 🚀 Load and prepare the dataset
# -----------------------------------------------
# Read the CSV and pass it straight through encode_data; `df` is the frame
# that the following cells split and train on.
df = encode_data(load_data("../data/insurance.csv"))


In [47]:
# Partition the encoded frame with the project's split_by_smoker helper.
# The result is unpacked as (smoker subset, non-smoker subset).
smoker_split = split_by_smoker(df)
smoker_df, nonsmoker_df = smoker_split


In [69]:
# -----------------------------------------------
# 🧪 Train and evaluate models
# -----------------------------------------------
# One (frame, report label) entry per model; the order here is the order
# in which the metric reports are printed.
training_runs = (
    (df, "All Data"),
    (smoker_df, "Smokers"),
    (nonsmoker_df, "Non-Smokers"),
)
fitted_pairs = [train_and_evaluate(frame, label) for frame, label in training_runs]

# Each entry is the (model, scaler) pair returned by train_and_evaluate
(model_all, scaler_all), (model_smoker, scaler_smoker), (model_nonsmoker, scaler_nonsmoker) = fitted_pairs




🔹 Results for All Data:
   MAE  = 4181.19
   RMSE = 5796.28
   R²   = 0.7836

🔹 Results for Smokers:
   MAE  = 4774.73
   RMSE = 6697.87
   R²   = 0.7010

🔹 Results for Non-Smokers:
   MAE  = 2422.00
   RMSE = 4363.45
   R²   = 0.4577


In [75]:
# Make sure the destination folder exists before anything is written
os.makedirs("../models", exist_ok=True)

# Destination path -> fitted object, listed in write order
artifacts = {
    "../models/model_all.pkl": model_all,
    "../models/scaler_all.pkl": scaler_all,
    "../models/model_smokers.pkl": model_smoker,
    "../models/scaler_smokers.pkl": scaler_smoker,
    "../models/model_nonsmokers.pkl": model_nonsmoker,
    "../models/scaler_nonsmokers.pkl": scaler_nonsmoker,
}
for destination, fitted_object in artifacts.items():
    written = joblib.dump(fitted_object, destination)

# Leave the final dump's return value as the cell's displayed result
written


['../models/scaler_nonsmokers.pkl']