In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV,
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error

In [4]:
df = pd.read_csv('cleaned_insurance_data.csv')
df.drop(columns='Unnamed: 0',inplace=True)
df.sample(2)

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
1009,51,1,27.74,1,0,9957.7216,1.0,0.0,0.0,0.0
105,20,1,28.025,1,1,17560.37975,0.0,1.0,0.0,0.0


In [12]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges',
       'region_northeast', 'region_northwest', 'region_southeast',
       'region_southwest'],
      dtype='object')

In [5]:
df['charges'].min()

1121.8739

In [6]:
df['charges'].max()

63770.42801

# Preprocessing

- Scaling techniques are typically applied to numerical data in order to make the data comparable and reduce the impact of differences in scales or units. Categorical data does not have a scales

In [None]:
#def scaling_cols(df, cols):
    scaler = StandardScaler()
    df[cols] = scaler.fit_transform(df[cols])

In [None]:
scaling_cols(df, ['age','bmi','charges'])

In [None]:
df

### Model Building

- while fitting the model, some times it takes few minutes/hours, this will speed up.

- if you want to use default sklearn
    - from sklearnex import unpatch_sklearn
      unpatch_sklearn()

In [7]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [8]:
X = np.array(df[[col for col in df.columns if col!='charges']])
y = np.array(df['charges'])
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, random_state=42)

- I want to use `LinearRegression` which is good at capturing `linear relationship`, `Ridge` which deals with `multicolinearity` & `overfitting` and tree based models to capture `non-linear relationship`.
-  performing model building and tuning in a pipeline is a common approach to streamline the machine learning workflow. 

In [10]:
# Define the pipeline with the preprocessing and modeling steps
pipeline = Pipeline([
     # define preprocessor steps
    ('model', TransformedTargetRegressor(regressor=RandomForestRegressor())) # define initial model and target transformer
])

# Define the models to use
models = [
    LinearRegression(),
    Ridge(),
    RandomForestRegressor()
]

# Define the scoring metrics
scoring = {
    'mse': mean_squared_error,
    'mae': mean_absolute_error,
    'r2': r2_score
}

# Train and evaluate each model in the pipeline
for model in models:
    # Set the model in the pipeline
    pipeline.named_steps['model'].regressor = model
    
    # Train the model on the training data
    pipeline.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = pipeline.predict(X_val)
    
    # Calculate the evaluation metrics
    mse = scoring['mse'](y_val, y_pred)
    mae = scoring['mae'](y_val, y_pred)
    r2 = scoring['r2'](y_val, y_pred)
    
    # Print the evaluation metrics for the model
    print(f"Model: {type(model).__name__}")
    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"R2 Score: {r2}")
    print('\n')

Model: LinearRegression
MSE: 33596915.85136147
MAE: 4181.194473753642
R2 Score: 0.7835929767120723


Model: Ridge
MSE: 33642947.835070975
MAE: 4193.348192170582
R2 Score: 0.7832964719788793


Model: RandomForestRegressor
MSE: 21210326.240391213
MAE: 2494.7238521332715
R2 Score: 0.8633784248245846




- we can see that RF performed well, mae says on an average the predictions off by 2494 which is 

# Tuning

In [13]:
param_grid = {
    'model__regressor__n_estimators': [100, 200, 300],
    'model__regressor__max_depth': [None, 5, 10, 15],
    'model__regressor__min_samples_split': [2, 5, 10],
    'model__regressor__min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'model__regressor__max_depth': 5, 'model__regressor__min_samples_leaf': 4, 'model__regressor__min_samples_split': 10, 'model__regressor__n_estimators': 200}
Pipeline(steps=[('model',
                 TransformedTargetRegressor(regressor=RandomForestRegressor(max_depth=5,
                                                                            min_samples_leaf=4,
                                                                            min_samples_split=10,
                                                                            n_estimators=200)))])


In [14]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2 Score: {r2}")

MSE: 18938846.859077193
MAE: 2492.00455159118
R2 Score: 0.8780096514986292
