# **Change Working Directory**

In [1]:
import os
from pathlib import Path

project_root = Path('..').resolve()
os.chdir(project_root)

print('Current working directory:', os.getcwd())

Current working directory: C:\Users\ibnum\Desktop\Material Learning\Portfolio Bagas\ML\Insurance Charges


# **Import Modules**

In [2]:
from src.config import config
from src.data import split_data, load_data
from src.preprocessing import preprocesssing
from src.model import build_model, evaluate_model, save_load
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# **Load Cleaned Data**

In [3]:
df = load_data.read_data(file_path=config.CLEAN_DATA).drop(columns=['Unnamed: 0'])
df.head()

Data loaded succesfully from data/processed/insurance_cleaned.csv!
Data shape: (1337, 8)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# **Preprocessing**

In [4]:
# build pipeline object
num_pipe, cat_pipe = preprocesssing.build_pipeline()

# build preprocessing object
preprocessor = preprocesssing.build_preprocessing(num_pipe=num_pipe, cat_pipe=cat_pipe)

In [5]:
# split train test
X_train, X_test, y_train, y_test = split_data.split_train_test(data=df)

X_train shape: (1069, 6)
X_test shape: (268, 6)
y_train shape: (1069,)
y_test shape: (268,)


# **Modeling**

## 1. Baseline Model

In [6]:
y_base_pred = build_model.build_baseline(X_train=X_train, y_train=y_train)
rmse_base = evaluate_model.evaluate_baseline(y_train=y_train, y_pred=y_base_pred)

## 2. Predict & Evaluate Models (CV-Train)

### 2.1 K-Nearest Neighbors

In [7]:
knn_cv = build_model.build_cv_train(
    estimator=KNeighborsRegressor(),
    preprocessor=preprocessor,
    params=config.KNN_PARAMS,
    X_train=X_train,
    y_train=y_train
)

knn_best_param, knn_best_model, knn_rmse_cv, knn_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=knn_cv,
    X_train=X_train,
    y_train=y_train
)

Model KNeighborsRegressor has been created succesfully, time elapsed: 0.12 minutes.


### 2.2 Linear Regression

In [8]:
lr_cv = build_model.build_cv_train(
    estimator=LinearRegression(),
    params={},
    preprocessor=preprocessor,
    X_train=X_train,
    y_train=y_train
)

lr_best_param, lr_best_model, lr_rmse_cv, lr_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=lr_cv,
    X_train=X_train,
    y_train=y_train
)

Model LinearRegression has been created succesfully, time elapsed: 0.0 minutes.


### 2.3 Lasso

In [9]:
lasso_cv = build_model.build_cv_train(
    estimator=Lasso(),
    params=config.LR_PARAMS,
    preprocessor=preprocessor,
    X_train=X_train,
    y_train=y_train
)

lasso_best_param, lasso_best_model, lasso_rmse_cv, lasso_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=lasso_cv,
    X_train=X_train,
     y_train=y_train
)

Model Lasso has been created succesfully, time elapsed: 0.01 minutes.


### 2.4 Ridge

In [10]:
ridge_cv = build_model.build_cv_train(
    estimator=Ridge(),
    params=config.LR_PARAMS,
    preprocessor=preprocessor,
    X_train=X_train,
    y_train=y_train
)

ridge_best_param, ridge_best_model, ridge_rmse_cv, ridge_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=ridge_cv,
    X_train=X_train,
    y_train=y_train
)

Model Ridge has been created succesfully, time elapsed: 0.01 minutes.


### 2.5 Elastic Net

In [11]:
elastic_cv = build_model.build_cv_train(
    estimator=ElasticNet(),
    params=config.LR_PARAMS,
    preprocessor=preprocessor,
    X_train=X_train,
    y_train=y_train
)

elastic_best_param, elastic_best_model, elastic_rmse_cv, elastic_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=elastic_cv,
    X_train=X_train,
    y_train=y_train
)

Model ElasticNet has been created succesfully, time elapsed: 0.01 minutes.


### 2.6 Random Forest

In [12]:
rf_cv = build_model.build_cv_train(
    estimator=RandomForestRegressor(random_state=config.RANDOM_STATE),
    params=config.RF_PARAMS,
    preprocessor=preprocessor,
    X_train=X_train,
    y_train=y_train
)

rf_best_param, rf_best_model, rf_rmse_cv, rf_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=rf_cv,
    X_train=X_train,
    y_train=y_train
)

Model RandomForestRegressor has been created succesfully, time elapsed: 0.9 minutes.


### 2.7 XGBoost

In [13]:
xgb_cv = build_model.build_cv_train(
    estimator=XGBRegressor(random_state=config.RANDOM_STATE, verbose=config.VERBOSE),
    params=config.XGBOOST_PARAMS,
    preprocessor=preprocessor,
    X_train=X_train,
    y_train=y_train
)

xgb_best_param, xgb_best_model, xgb_rmse_cv, xgb_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=xgb_cv,
    X_train=X_train,
    y_train=y_train
)

Model XGBRegressor has been created succesfully, time elapsed: 0.84 minutes.


### 2.8 CatBoost

In [14]:
cb_cv = build_model.build_cv_train(
    estimator=CatBoostRegressor(random_state=config.RANDOM_STATE, verbose=config.VERBOSE),
    params=config.CATBOOST_PARAMS,
    preprocessor=preprocessor,
    X_train=X_train,
    y_train=y_train
)

cb_best_param, cb_best_model, cb_rmse_cv, cb_rmse_train = evaluate_model.evaluate_cv_train(
    estimator=cb_cv,
    X_train=X_train,
    y_train=y_train
)

Model CatBoostRegressor has been created succesfully, time elapsed: 2.14 minutes.


### Summary CV-Train Results

In [15]:
summ_cv_train = pd.DataFrame(data={
    'models': ['Baseline', 'KNN', 'Linear Regression', 'Lasso', 'Ridge', 'ElasticNet', 'Random Forest', 'XGBoost', 'CatBoost'],
    'rmse_cv': [rmse_base, knn_rmse_cv, lr_rmse_cv, lasso_rmse_cv, ridge_rmse_cv, elastic_rmse_cv, rf_rmse_cv, xgb_rmse_cv, cb_rmse_cv],
    'rmse_train': [rmse_base, knn_rmse_train, lr_rmse_train, lasso_rmse_train, ridge_rmse_train, elastic_rmse_train, rf_rmse_train, xgb_rmse_train, cb_rmse_train],
    'best_params': ['-', knn_best_param, lr_best_param, lasso_best_param, ridge_best_param, elastic_best_param, rf_best_param, xgb_best_param, cb_best_param]
})

summ_cv_train.sort_values(by='rmse_train', ascending=True).reset_index(drop=True)

Unnamed: 0,models,rmse_cv,rmse_train,best_params
0,Random Forest,4688.169716,3245.741203,"{'model__n_estimators': 500, 'model__min_sampl..."
1,XGBoost,4491.848182,4089.685408,"{'model__reg_lambda': 0.25, 'model__reg_alpha'..."
2,CatBoost,4498.303368,4156.181798,"{'model__learning_rate': 0.01, 'model__iterati..."
3,KNN,6234.007642,5357.701789,{'model__n_neighbors': 9}
4,Linear Regression,6147.224085,6071.969165,{}
5,Ridge,6146.84582,6072.100706,{'model__alpha': 1.438449888287663}
6,ElasticNet,6146.815162,6072.300483,{'model__alpha': 0.004281332398719396}
7,Lasso,6131.77473,6092.86623,{'model__alpha': 112.88378916846884}
8,Baseline,12020.63421,12020.63421,-


As shown from the summary above, all of the models performed better than the baseline. It's clear that all of the ensemble models are working better than the rest of models. Having seen that, I decide to choose my top 3 models to predict and evaluate using Test Data, which are Random Forest, XGBoost, and CatBoost. Let's see how these models could perform on Test Data

## 3. Predict & Evaluate Models (Test Data)

In [16]:
# random forest
rf_y_pred = build_model.build_test(
    estimator=rf_best_model,
    X_test=X_test
)

rf_rmse_test = evaluate_model.evaluate_test(
    y_test=y_test,
    y_pred=rf_y_pred
)

# XGBoost
xgb_y_pred = build_model.build_test(
    estimator=xgb_best_model,
    X_test=X_test
)

xgb_rmse_test = evaluate_model.evaluate_test(
    y_test=y_test,
    y_pred=xgb_y_pred
)

# CatBoost
cb_y_pred = build_model.build_test(
    estimator=cb_best_model,
    X_test=X_test
)

cb_rmse_test = evaluate_model.evaluate_test(
    y_test=y_test,
    y_pred=cb_y_pred
)

Model RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=2,
                      n_estimators=500, random_state=123) has been created succesfully, time elapsed: 0.0 minutes.
Model XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=500,
             n_jobs=None, num_parallel_tree=None, ...) has been created succesfully, time elapsed: 0.0 mi

In [17]:
summary_train_test = pd.DataFrame(data={
    'models': ['Random Forest', 'XGBoost', 'CatBoost'],
    'rmse_train': [rf_rmse_train, xgb_rmse_train, cb_rmse_train],
    'rmse_test': [rf_rmse_test, xgb_rmse_test, cb_rmse_test],
    'gap train-test': [abs(rf_rmse_train-rf_rmse_test), abs(xgb_rmse_train-xgb_rmse_test), abs(cb_rmse_train-cb_rmse_test)]
})

summary_train_test.sort_values(by='rmse_test', ascending=True).reset_index(drop=True)

Unnamed: 0,models,rmse_train,rmse_test,gap train-test
0,XGBoost,4089.685408,4431.852193,342.166785
1,CatBoost,4156.181798,4433.272793,277.090995
2,Random Forest,3245.741203,4685.117715,1439.376512


I consider CatBoost as the best model towards this case, because it has the least rmse in test data. It also has a least gap between train and test score that indicates this model is working decent in generalizing the unseen data. Meanwhile, RandomForest is indicated as an overfitting model.

# Prediction Overview

In [36]:
# X_test to df
X_test_df = pd.DataFrame(
    data=X_test,
    columns=X_test.columns,
    index=X_test.index
).reset_index(drop=True)

# y_test and y_pred to series
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_test_series = pd.Series(cb_y_pred).reset_index(drop=True)

# concat all X_test, y_test, and y_pred
prediction_df = pd.concat([X_test_df, y_test_series, y_pred_test_series], axis=1).rename(columns={0: 'charges_pred'})
prediction_df[:21]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,charges_pred
0,23,male,17.385,1,no,northwest,2775.19215,3812.027915
1,25,female,28.595,0,no,northeast,3213.62205,4771.968725
2,25,male,24.13,0,yes,northwest,15817.9857,17775.483495
3,21,female,25.8,0,no,southwest,2007.945,3108.952513
4,32,male,30.8,3,no,southwest,5253.524,5725.487053
5,32,female,33.155,3,no,northwest,6128.79745,7090.234097
6,42,female,32.87,0,no,northeast,7050.0213,8308.331007
7,48,male,29.6,0,no,southwest,21232.18226,9462.389622
8,35,male,24.13,1,no,northwest,5125.2157,6432.096699
9,41,male,30.78,3,yes,northeast,39597.4072,36841.458947


# Save Model

In [None]:
# save catboost best model
save_load.save_object(obj=cb_best_model, path=config.MODEL_PATH)

Saving object. . . .
Your object has been saved succesfully and stored into: artifacts/model/best_model.pkl
