In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import catboost as cat
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Load the insurance dataset and split its columns into feature groups by dtype.
df = pd.read_csv("../../datasets/insurance_dataset/insurance.csv")

target = "charges"

# Object-typed columns are treated as categorical predictors.
cat_features = df.select_dtypes(include='object').columns.tolist()

# Every non-object column is numeric; the regression target is taken out of the list.
num_features = df.select_dtypes(exclude='object').columns.tolist()
num_features.remove(target)

In [5]:
# Display the column labels of the loaded frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

# Dataset preprocessing

In [11]:
# Impute missing values per feature group: 0 for numeric columns,
# the placeholder level 'Other' for categorical columns.
for columns, fill_value in ((num_features, 0), (cat_features, 'Other')):
    df.loc[:, columns] = df.loc[:, columns].fillna(fill_value)

In [12]:
# One-hot encode the categorical columns as a dense array; the first level
# of each column is dropped (drop='first').
enc = OneHotEncoder(drop='first', sparse_output=False)
encoded_values = enc.fit_transform(df[cat_features])

# Wrap the encoded array in a frame labelled with the encoder's output names
# and store the indicators as nullable 16-bit integers.
encoded_cat_df = pd.DataFrame(
    encoded_values,
    columns=enc.get_feature_names_out(cat_features),
).astype('Int16')
encoded_cat_features = encoded_cat_df.columns.tolist()
encoded_cat_df.shape

(1338, 5)

In [13]:
# Append the one-hot columns to the original frame (column-wise, aligned on the index).
encoded_df = pd.concat((df, encoded_cat_df), axis="columns")

In [15]:
# Binary segment label derived from the regression target:
# 1 when charges are at least 15,000, otherwise 0.
clf_target = 'class'
encoded_df[clf_target] = (encoded_df[target] >= 15_000).astype('int64')
encoded_df[clf_target].value_counts()

class
0    980
1    358
Name: count, dtype: int64

In [16]:
# Hold out 33% of the rows for testing; both targets (charges and the
# derived class label) travel together through the split.
feature_columns = num_features + cat_features + encoded_cat_features
X_train, X_test, y_train, y_test = train_test_split(
    encoded_df[feature_columns],
    encoded_df[[target, clf_target]],
    test_size=0.33,
    random_state=2025,
)

# Re-attach the targets to their features and restart the row index at 0
# for each split.
df_train, df_test = (
    pd.concat([features, labels], axis=1).reset_index(drop=True)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [17]:
# Inputs for the linear models: numeric columns followed by the one-hot columns.
linear_features = [*num_features, *encoded_cat_features]

In [18]:
# Fit the min-max scaler on the training split only, then apply the same
# transformation to both splits.
scaler = MinMaxScaler()
scaler.fit(df_train[linear_features])

scaled_df_train = pd.concat(
    [
        pd.DataFrame(scaler.transform(df_train[linear_features]), columns=linear_features),
        df_train[[target, clf_target]],  # targets are carried over unscaled
    ],
    axis=1,
)

scaled_df_test = pd.concat(
    [
        pd.DataFrame(scaler.transform(df_test[linear_features]), columns=linear_features),
        df_test[[target, clf_target]],  # targets are carried over unscaled
    ],
    axis=1,
)

# Simple Linear Regression

In [19]:
# Empty results table; one row per (algorithm, dataset_type) is appended later.
result_metrics = pd.DataFrame(
    columns=['algorithm', 'dataset_type', 'R2', 'MSE', 'MAE', 'MAPE']
)

In [20]:
def get_metrics(y_true, y_pred, algorithm, dataset_type, res_df=None):
    """Print regression metrics and append them as one row to a results table.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    algorithm : str
        Label stored in the ``algorithm`` column and echoed in the printout.
    dataset_type : str
        Label stored in the ``dataset_type`` column (e.g. 'train' / 'test').
    res_df : pandas.DataFrame or None, optional
        Existing results table to extend. ``None`` or an empty frame starts
        a new table. The passed frame is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the previous rows (if any) followed by the new
        row, with columns algorithm, dataset_type, R2, MSE, MAE, MAPE and a
        fresh 0..n-1 index.
    """
    columns = ['algorithm', 'dataset_type', 'R2', 'MSE', 'MAE', 'MAPE']

    r2 = metrics.r2_score(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)

    print(f"{algorithm}, -- Type Dataset - {dataset_type}")
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"MAE = {mae}")
    print(f"MAPE = {mape}")

    curr_res = pd.DataFrame(
        [[algorithm, dataset_type, r2, mse, mae, mape]],
        columns=columns,
    )

    # With nothing to extend, return the new row directly: this supports the
    # documented default (res_df=None) and avoids concatenating with an empty
    # frame, which pandas warns about.
    if res_df is None or len(res_df) == 0:
        return curr_res

    # pd.concat builds a new frame, so the caller's table is left untouched;
    # ignore_index gives each row a unique label instead of repeating 0.
    return pd.concat([res_df, curr_res], ignore_index=True)

In [21]:
# Baseline: one linear regression fitted on all scaled training rows.
simple_model = LinearRegression()
simple_model.fit(X=scaled_df_train[linear_features], y=scaled_df_train[target])

In [22]:
# Baseline predictions for the training and test splits.
preds_train, preds_test = (
    simple_model.predict(split[linear_features])
    for split in (scaled_df_train, scaled_df_test)
)

In [23]:
# Record baseline performance on the training split.
# Note: each run of this cell appends another row to result_metrics.
result_metrics = get_metrics(
    y_true=scaled_df_train[target],
    y_pred=preds_train,
    algorithm='simple_regression',
    dataset_type='train',
    res_df=result_metrics,
)

simple_regression, -- Type Dataset - train
R2 = 0.7596803105755882
MSE = 34809575.141775966
MAE = 4042.614403249624
MAPE = 0.397542424375605


  res_df = pd.concat([res_df,


In [24]:
# Record baseline performance on the held-out test split.
# Note: each run of this cell appends another row to result_metrics.
result_metrics = get_metrics(
    y_true=scaled_df_test[target],
    y_pred=preds_test,
    algorithm='simple_regression',
    dataset_type='test',
    res_df=result_metrics,
)

simple_regression, -- Type Dataset - test
R2 = 0.7309284587016552
MSE = 40273908.54301847
MAE = 4365.496446587046
MAPE = 0.4161397740861992


In [25]:
# Display the metrics collected so far.
result_metrics

Unnamed: 0,algorithm,dataset_type,R2,MSE,MAE,MAPE
0,simple_regression,train,0.75968,34809580.0,4042.614403,0.397542
0,simple_regression,test,0.730928,40273910.0,4365.496447,0.41614


# Combine with classifier

## fit clf

In [26]:
# Classifier that predicts the charge segment (class 0 / 1) from the same
# scaled features the linear models use.
clf = RandomForestClassifier(random_state=2025)
clf.fit(X=scaled_df_train[linear_features], y=scaled_df_train[clf_target])

# Predicted segment for every training and test row.
clf_pred_train, clf_pred_test = (
    clf.predict(split[linear_features])
    for split in (scaled_df_train, scaled_df_test)
)

In [27]:
# Classifier quality on the training split.
train_report = metrics.classification_report(
    y_true=scaled_df_train[clf_target],
    y_pred=clf_pred_train,
)
print(train_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       663
           1       1.00      1.00      1.00       233

    accuracy                           1.00       896
   macro avg       1.00      1.00      1.00       896
weighted avg       1.00      1.00      1.00       896



In [28]:
# Classifier quality on the held-out test split.
test_report = metrics.classification_report(
    y_true=scaled_df_test[clf_target],
    y_pred=clf_pred_test,
)
print(test_report)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       317
           1       0.97      0.76      0.85       125

    accuracy                           0.93       442
   macro avg       0.94      0.88      0.90       442
weighted avg       0.93      0.93      0.92       442



## fit regression

In [29]:
# Training rows are segmented by their TRUE class label.
train_labels = scaled_df_train[clf_target]
scaled_df_train_0 = scaled_df_train[train_labels == 0]
scaled_df_train_1 = scaled_df_train[train_labels == 1]

# Test rows are segmented by the classifier's PREDICTED label, which is
# stored on the test frame as a new 'clf_pred' column.
clf_pred_test = clf.predict(scaled_df_test[linear_features])
scaled_df_test['clf_pred'] = clf_pred_test

scaled_df_test_0, scaled_df_test_1 = (
    scaled_df_test[scaled_df_test['clf_pred'] == label]
    for label in (0, 1)
)

In [30]:
# Display the (rows, columns) shape of each training segment.
scaled_df_train_0.shape, scaled_df_train_1.shape

((663, 10), (233, 10))

In [31]:
# Display the (rows, columns) shape of each test segment.
scaled_df_test_0.shape, scaled_df_test_1.shape

((344, 11), (98, 11))

### lr for class 0

In [32]:
# Expert regression for segment 0: fitted on training rows whose class is 0,
# then applied to the class-0 training rows and to the test rows predicted as 0.
lr_0 = LinearRegression()
lr_0.fit(X=scaled_df_train_0[linear_features], y=scaled_df_train_0[target])
pred_train_0, pred_test_0 = (
    lr_0.predict(segment[linear_features])
    for segment in (scaled_df_train_0, scaled_df_test_0)
)

### lr for class 1

In [33]:
# Expert regression for segment 1: fitted on training rows whose class is 1,
# then applied to the class-1 training rows and to the test rows predicted as 1.
lr_1 = LinearRegression()
lr_1.fit(X=scaled_df_train_1[linear_features], y=scaled_df_train_1[target])
pred_train_1, pred_test_1 = (
    lr_1.predict(segment[linear_features])
    for segment in (scaled_df_train_1, scaled_df_test_1)
)

### collect all

In [34]:
# Stitch the two segments back together (segment 0 first, then segment 1, for
# both truths and predictions) and score the combined training predictions.
segm_train_true = pd.concat([scaled_df_train_0[target], scaled_df_train_1[target]])
segm_train_pred = np.concatenate([pred_train_0, pred_train_1])
result_metrics = get_metrics(
    segm_train_true,
    segm_train_pred,
    algorithm='segm_expert_regression',
    dataset_type='train',
    res_df=result_metrics,
)

segm_expert_regression, -- Type Dataset - train
R2 = 0.9085717069936483
MSE = 13243109.805573937
MAE = 1867.6645236179825
MAPE = 0.13168786401989083


In [35]:
# Stitch the two segments back together (segment 0 first, then segment 1, for
# both truths and predictions) and score the combined test predictions.
segm_test_true = pd.concat([scaled_df_test_0[target], scaled_df_test_1[target]])
segm_test_pred = np.concatenate([pred_test_0, pred_test_1])
result_metrics = get_metrics(
    segm_test_true,
    segm_test_pred,
    algorithm='segm_expert_regression',
    dataset_type='test',
    res_df=result_metrics,
)

segm_expert_regression, -- Type Dataset - test
R2 = 0.8103224674955269
MSE = 28390425.683407467
MAE = 2661.119615146242
MAPE = 0.16378599227672605


In [36]:
# Display the metrics table with exact duplicate rows removed.
# The result is only shown, not assigned back to result_metrics.
result_metrics.drop_duplicates()

Unnamed: 0,algorithm,dataset_type,R2,MSE,MAE,MAPE
0,simple_regression,train,0.75968,34809580.0,4042.614403,0.397542
0,simple_regression,test,0.730928,40273910.0,4365.496447,0.41614
0,segm_expert_regression,train,0.908572,13243110.0,1867.664524,0.131688
0,segm_expert_regression,test,0.810322,28390430.0,2661.119615,0.163786


# Catboost

In [37]:
# CatBoost inputs: numeric columns plus the raw (un-encoded) categorical columns.
catboost_features = [*num_features, *cat_features]

In [38]:
# Display the feature list passed to CatBoost.
catboost_features

['age', 'bmi', 'children', 'sex', 'smoker', 'region']

In [40]:
# Gradient-boosted regressor trained on the unscaled training split; the
# categorical columns are declared via cat_features instead of being one-hot encoded.
cat_model = cat.CatBoostRegressor(
    cat_features=cat_features,
    random_seed=2025,
    verbose=False,
)
cat_model.fit(df_train[catboost_features], df_train[target])

<catboost.core.CatBoostRegressor at 0x141a559d0>

In [41]:
# CatBoost predictions for the training and test splits.
y_train_pred, y_test_pred = (
    cat_model.predict(split[catboost_features])
    for split in (df_train, df_test)
)

In [42]:
# Record CatBoost performance on the training split.
# Note: each run of this cell appends another row to result_metrics.
result_metrics = get_metrics(
    y_true=df_train[target],
    y_pred=y_train_pred,
    algorithm='catboost_regression',
    dataset_type='train',
    res_df=result_metrics,
)

catboost_regression, -- Type Dataset - train
R2 = 0.9272280238678487
MSE = 10540799.122430652
MAE = 1773.7608811388407
MAPE = 0.21184293095322945


In [43]:
# Record CatBoost performance on the held-out test split.
# Note: each run of this cell appends another row to result_metrics.
result_metrics = get_metrics(
    y_true=df_test[target],
    y_pred=y_test_pred,
    algorithm='catboost_regression',
    dataset_type='test',
    res_df=result_metrics,
)

catboost_regression, -- Type Dataset - test
R2 = 0.8437469040628056
MSE = 23387545.427409224
MAE = 2644.622447903881
MAPE = 0.2698564811376259


In [44]:
# Display the full comparison table across all three approaches.
result_metrics 

Unnamed: 0,algorithm,dataset_type,R2,MSE,MAE,MAPE
0,simple_regression,train,0.75968,34809580.0,4042.614403,0.397542
0,simple_regression,test,0.730928,40273910.0,4365.496447,0.41614
0,segm_expert_regression,train,0.908572,13243110.0,1867.664524,0.131688
0,segm_expert_regression,test,0.810322,28390430.0,2661.119615,0.163786
0,catboost_regression,train,0.927228,10540800.0,1773.760881,0.211843
0,catboost_regression,test,0.843747,23387550.0,2644.622448,0.269856
