In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv("insurance.csv") # expense prediction dataset
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


## Splitting data into input and output features

In [8]:
x = data.drop("expenses",axis=1)
y = data["expenses"]

In [9]:
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest
1334,18,female,31.9,0,no,northeast
1335,18,female,36.9,0,no,southeast
1336,21,female,25.8,0,no,southwest


In [10]:
y

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1338, dtype: float64

## Data Transformation Pipeline

In [11]:
cat_columns = x.select_dtypes("object").columns
num_columns = x.select_dtypes(["int","float"]).columns
cat_columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [12]:
num_columns

Index(['age', 'bmi', 'children'], dtype='object')

In [13]:
cat_pipeline = Pipeline([
    ("one_hot_encoding",OneHotEncoder()),
    ("standard_scaling",StandardScaler(with_mean=False)) # with_mean=False, means z = x/SD not, z=(x-u)/SD
                    # It is important when dealing with sparse matrices
])
num_pipeline = Pipeline([
    ("standard_scaling",StandardScaler())
])

preprocessor_pipeline = ColumnTransformer([
    ("one_hot_encoder",cat_pipeline,cat_columns),
    ("standard_scaler",num_pipeline,num_columns)
])

In [14]:
x_preprocessed = preprocessor_pipeline.fit_transform(x)

In [15]:
x_preprocessed.shape

(1338, 11)

## Model training

In [16]:
x_train,x_test,y_train,y_test = train_test_split(x_preprocessed,y,test_size=0.2,random_state=42)

In [17]:
# x_train
# x_test
# y_train
y_test

764      9095.07
887      5272.18
890     29330.98
1293     9301.89
259     33750.29
          ...   
109     47055.53
575     12222.90
535      6067.13
543     63770.43
846      9872.70
Name: expenses, Length: 268, dtype: float64

In [18]:
models = {
    "Linear":LinearRegression(),
    "Decision_Tree":DecisionTreeRegressor(),
    "Random_Forest":RandomForestRegressor(),
    "GradientBoost":GradientBoostingRegressor(),
    "AdaBoost":AdaBoostRegressor(),
    "XGBoost":XGBRegressor()  
}
params = {
    "Decision_Tree": {
        "criterion": [
            "squared_error",
            "friedman_mse",
            "absolute_error",
            "poisson",
        ],
    },
    "Random_Forest": {
        "criterion": [
            "squared_error",
            "friedman_mse",
            "absolute_error",
            "poisson",
        ],
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
    "GradientBoost": {
        "learning_rate": [0.1, 0.01, 0.05, 0.001],
        # "subsample": [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        # "criterion": [
        #     "squared_error",
        #     "friedman_mse",
        # ],
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
    "Linear": {},
    "XGBoost": {
        "learning_rate": [0.1, 0.01, 0.05, 0.001],
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
    "AdaBoost": {
        "learning_rate": [0.1, 0.01, 0.5, 0.001],
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
}


In [76]:
def evaluate_models(x_train,y_train,x_test,y_test,models,params):
    report = {}
    for k,v in models.items():
        # k = model name, v = model obj
        par = params[k]
        # print(par)
        gs = GridSearchCV(v,par,cv=3)
        gs.fit(x_train,y_train)
        # print(gs.best_params_)
        
        v.set_params(**gs.best_params_)
        model = v.fit(x_train,y_train)
        # print(model.get_params())
        
        y_test_pred = model.predict(x_test)
        r2score = r2_score(y_test,y_test_pred)
        report[r2score] = [k,model] 
        
    return report # report = dict having model name as key and [model obj,its accuracy] as value

In [77]:
report = evaluate_models(x_train=x_train,y_train=y_train,x_test=x_test,y_test=y_test,models=models,params=params)
report

{0.7835726930039901: ['Linear', LinearRegression()],
 0.7228836135959156: ['Decision_Tree', DecisionTreeRegressor()],
 0.8612051780213935: ['Random_Forest',
  RandomForestRegressor(criterion='poisson', n_estimators=256)],
 0.8758109683980995: ['GradientBoost',
  GradientBoostingRegressor(n_estimators=32)],
 0.8575907107350864: ['AdaBoost',
  AdaBoostRegressor(learning_rate=0.001, n_estimators=16)],
 0.8696148057880264: ['XGBoost',
  XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               feature_weights=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.1, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
 

In [78]:
# takes report and returns the name and obj of the model having best r2score
def find_best_model(report):
    max_acc = max(report.keys())
    best_model_name,best_model_obj = report[max_acc]
    return (best_model_name,best_model_obj,max_acc)

In [79]:
best_model_name,best_model_obj,acc = find_best_model(report=report)

In [80]:
best_model_name,best_model_obj,acc

('GradientBoost',
 GradientBoostingRegressor(n_estimators=32),
 0.8758109683980995)

In [81]:
# just cross checking the above functions
r2_score(y_test,best_model_obj.predict(x_test))

0.8758109683980995

In [56]:
# dtr = DecisionTreeRegressor()
# gs = GridSearchCV(dtr,params['Decision_Tree'],cv=3)
# gs.fit(x_train,y_train)

In [57]:
# gs.best_params_

In [58]:
# dtr.set_params(**gs.best_params_)
# mod = dtr.fit(x_train,y_train)

In [59]:
# mod