In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("data/insurance.csv")

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [4]:
cat_cols = ['sex', 'region', 'smoker']
num_cols = [x for x in df.columns if x not in cat_cols]

In [5]:
one_hot = pd.get_dummies(df[cat_cols])
one_hot_df = pd.concat([df[num_cols], one_hot], axis=1)
one_hot_df.head()

Unnamed: 0,age,bmi,children,expenses,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_no,smoker_yes
0,19,27.9,0,16884.92,True,False,False,False,False,True,False,True
1,18,33.8,1,1725.55,False,True,False,False,True,False,True,False
2,28,33.0,3,4449.46,False,True,False,False,True,False,True,False
3,33,22.7,0,21984.47,False,True,False,True,False,False,True,False
4,32,28.9,0,3866.86,False,True,False,True,False,False,True,False


In [None]:
label_encoder = LabelEncoder()
label_encod_df = df.copy()
for i in cat_cols:
    label_encod_df[i] = label_encoder.fit_transform(label_encod_df[i])
label_encod_df.head()    

In [10]:
X_oh = one_hot_df.drop(columns = 'expenses')
X_le = label_encod_df.drop(columns = 'expenses')
y = df.expenses

In [18]:
train_oh_X, test_oh_X, train_oh_y, test_oh_y = train_test_split(X_oh, y, test_size=0.25, random_state=1234)
train_le_X, test_le_X, train_le_y, test_le_y = train_test_split(X_le, y, test_size=0.25, random_state=1234)

In [12]:
models = {
    "Linear Regressor": LinearRegression(),
    "Support Vector Regressor": SVR(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Random Foresst Regressor": RandomForestRegressor()
}

In [16]:
def evaluate_models(train_X, test_X, train_y, test_y):
    model_list = []
    scores = []

    for key, value in models.items():
        model = value
        model.fit(train_X, train_y)
        print(key)
        print("-------------------------------------")
        pred_train = model.predict(train_X)
        pred_test = model.predict(test_X)
        print("MAE")
        print(f"Train: {mean_absolute_error(train_y, pred_train)}")
        print(f"Test: {mean_absolute_error(test_y, pred_test)}")
        print("======================================")
        print("MSE")
        print(f"Train: {mean_squared_error(train_y, pred_train)}")
        print(f"Test: {mean_squared_error(test_y, pred_test)}")
        print("======================================")
        print("R2 Scores")
        print(f"Train: {r2_score(train_y, pred_train)}")
        print(f"Test: {r2_score(test_y, pred_test)}")
        print("--------------------------------------")
        model_list.append(value)
        scores.append(r2_score(test_y, pred_test))
        
    return model_list, scores

In [17]:
model_list_oh, scores_oh = evaluate_models(train_X=train_oh_X, 
                test_X=test_oh_X, 
                train_y=train_oh_y,
                test_y=test_oh_y)

Linear Regressor
-------------------------------------
MAE
Train: 4112.190101772367
Test: 4206.097478103472
MSE
Train: 36642823.24246682
Test: 36877242.556230456
R2 Scores
Train: 0.7525050535603803
Test: 0.7402826315056259
--------------------------------------
Support Vector Regressor
-------------------------------------
MAE
Train: 8338.481939881674
Test: 8252.653096647608
MSE
Train: 163708826.99581227
Test: 155979826.75064683
R2 Scores
Train: -0.10573104863995963
Test: -0.09852763747494575
--------------------------------------
Decision Tree Regressor
-------------------------------------
MAE
Train: 19.084127617148553
Test: 2743.3453731343284
MSE
Train: 182648.2693441675
Test: 36699033.76739284
R2 Scores
Train: 0.9987663471414442
Test: 0.7415377122673966
--------------------------------------
Random Foresst Regressor
-------------------------------------
MAE
Train: 1011.8298037786636
Test: 2572.521683044776
MSE
Train: 3534444.546942316
Test: 21867090.48487423
R2 Scores
Train: 0.9761

In [19]:
model_list_le, scores_le = evaluate_models(train_X=train_le_X,
                                          test_X=test_le_X,
                                          train_y=train_le_y,
                                          test_y=test_le_y)

Linear Regressor
-------------------------------------
MAE
Train: 4116.515609921461
Test: 4201.969751584186
MSE
Train: 36683753.97004317
Test: 36694053.69193585
R2 Scores
Train: 0.7522285970176568
Test: 0.7415727857165738
--------------------------------------
Support Vector Regressor
-------------------------------------
MAE
Train: 8333.689795787339
Test: 8247.328796326681
MSE
Train: 163717614.28751016
Test: 155953556.81803346
R2 Scores
Train: -0.10579040024269282
Test: -0.09834262478701317
--------------------------------------
Decision Tree Regressor
-------------------------------------
MAE
Train: 19.084127617148553
Test: 2832.1416716417907
MSE
Train: 182648.2693441675
Test: 37569045.237619705
R2 Scores
Train: 0.9987663471414442
Test: 0.735410435010625
--------------------------------------
Random Foresst Regressor
-------------------------------------
MAE
Train: 998.8373977068796
Test: 2631.724147164179
MSE
Train: 3565704.8458401957
Test: 22574958.550491903
R2 Scores
Train: 0.9759

__Linear Regressor__ has the most similar results for train and test data. And __Random Forest Regressor__ has a better accuracy for test data. We shall try to come up with some better parameters to increase accuracy of test data.

In [58]:
params = {"max_depth": range(2, 20, 2),
          "n_estimators": range(100, 400, 10),
          "max_features": ['auto', 'sqrt'],
          "min_samples_split": [2, 5, 10],
          "min_samples_leaf": [1, 2, 4]
}

In [59]:
from sklearn.model_selection import RandomizedSearchCV

In [60]:
rf = RandomForestRegressor()

In [61]:
rf_grid = RandomizedSearchCV(
    estimator = rf,
    param_distributions = params,
    n_jobs = -1,
    verbose = 2,
    random_state=123

)

In [62]:
rf_grid.fit(train_oh_X, train_oh_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
                   param_distributions={'max_depth': range(2, 20, 2),
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': range(100, 400, 10)},
                   random_state=123, verbose=2)

In [63]:
rf_grid.best_params_

{'n_estimators': 360,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 6}

In [64]:
rf_opt = RandomForestRegressor(n_estimators=360,
                              min_samples_leaf=4,
                              min_samples_split=10,
                              max_features='auto',
                              max_depth=6)

In [65]:
rf_opt.fit(train_le_X, train_le_y)

RandomForestRegressor(max_depth=6, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=360)

In [66]:
preds = rf_opt.predict(test_le_X)

In [67]:
r2 = r2_score(test_le_y, preds)

In [68]:
r2

0.8604202345833606