# Insurance Cost Regression Study

## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm
import copy
from copy import deepcopy
from pandas.plotting import scatter_matrix
from scipy.stats import ttest_ind

## Model - Random Forest

In [2]:
# Read the raw insurance records into a DataFrame
insurance_df = pd.read_csv("datasets/insurance.csv")

# Hold out 20% of the rows; the fixed seed keeps the partition reproducible
train_data, val_data = train_test_split(
    insurance_df,
    test_size=0.2,
    random_state=42,
)

# Write both partitions into the same "datasets/" folder, omitting the index column
for partition, csv_path in (
    (train_data, "datasets/train_data.csv"),
    (val_data, "datasets/val_data.csv"),
):
    partition.to_csv(csv_path, index=False)

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Preprocessing: encode categorical variables.
# The encoded frame replaces `insurance_df`, so the step is guarded on the raw
# 'region' column still being present. Without the guard, re-running this cell
# would map the already-encoded 0/1 values to NaN (Series.map yields NaN for
# keys absent from the dict) and then fail because 'region' no longer exists.
if 'region' in insurance_df.columns:
    encoded_columns = {
        'sex': insurance_df['sex'].map({'male': 0, 'female': 1}),
        'smoker': insurance_df['smoker'].map({'no': 0, 'yes': 1}),
    }

    # Fail loudly on labels the mappings do not cover instead of silently
    # feeding NaN into the model.
    for column_name, encoded in encoded_columns.items():
        unmapped = insurance_df.loc[encoded.isna(), column_name].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Unexpected values in column '{column_name}': {list(unmapped)}"
            )

    # Build the encoded frame in one expression so `insurance_df` is only
    # rebound once every encoding step has succeeded.
    insurance_df = pd.get_dummies(
        insurance_df.assign(**encoded_columns),
        columns=['region'],
        drop_first=True,
    )

# Handle missing values (if any): only 'age' is imputed, using its median
insurance_df['age'] = insurance_df['age'].fillna(insurance_df['age'].median())

# Split the dataset into features (X) and target (y)
X = insurance_df.drop('charges', axis=1)
y = insurance_df['charges']

# Split the data into train and test sets (80:20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Set up the parameter grid for fine-tuning
# (3 * 4 * 3 * 3 * 3 = 324 candidate combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Use GridSearchCV to find the best hyperparameters.
# Scoring is negated MSE, so the "best" candidate is the one with the lowest
# cross-validated mean squared error.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    verbose=0  # Suppress verbose output
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Retrieve the winning model. GridSearchCV refits it on X_train by default
# (refit=True), so no additional training step is needed here.
best_rf = grid_search.best_estimator_

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")

# ---- Prediction for New Inputs ----

# Define a new input for prediction
new_input = pd.DataFrame({
    'age': [30],
    'sex': [1],  # 1 = female, 0 = male
    'bmi': [28.5],
    'children': [2],
    'smoker': [1],  # 1 = yes, 0 = no
    'region_northwest': [0],
    'region_southeast': [1],
    'region_southwest': [0]
})

# Ensure the input matches the feature structure: select the training columns
# in training order. A column missing from `new_input` raises KeyError here
# rather than producing a misaligned prediction.
new_input = new_input[list(X.columns)]

new_prediction = best_rf.predict(new_input)
print(f"Predicted Insurance Charge: {new_prediction[0]}")

Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Root Mean Squared Error (RMSE): 4383.584668804266
R^2 Score: 0.8762256260393726
Predicted Insurance Charge: 21421.72211734341


In [4]:
import joblib

# Serialize the `best_rf` estimator to 'insurance_model.pkl' in the working directory
joblib.dump(
    value=best_rf,
    filename='insurance_model.pkl',
)

['insurance_model.pkl']