## Libraries installation

In [1]:
#!pip install catboost
#!pip install ipywidgets
#!pip install shap
#!pip install sklearn
#!jupyter nbextension enable --py widgetsnbextension

In [3]:
#pip install notebook==5.7.10


In [5]:
#!jupyter nbextension enable --py widgetsnbextension

In [108]:
import os
import pandas as pd
import numpy as np

import catboost
print(catboost.__version__)

1.2.7


## Reading the data

In [111]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [113]:
df.shape

(1338, 7)

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Exploring the data

In [118]:
y = df['charges']
X = df.drop('charges', axis=1)

In [120]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


Categorical features declaration

In [123]:
cat_features = [1,3,4,5]
print(cat_features)

[1, 3, 4, 5]


# Training the first model

In [165]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=1000)

model.fit(X, y, cat_features=cat_features, verbose=100)

Learning rate set to 0.042871
0:	learn: 11709.2816041	total: 18.9ms	remaining: 18.9s
100:	learn: 4295.6700280	total: 1.57s	remaining: 14s
200:	learn: 4113.1903316	total: 3.14s	remaining: 12.5s
300:	learn: 3959.1778653	total: 5.05s	remaining: 11.7s
400:	learn: 3831.9363912	total: 6.69s	remaining: 9.99s
500:	learn: 3724.8842689	total: 8.35s	remaining: 8.32s
600:	learn: 3628.5299335	total: 10.3s	remaining: 6.84s
700:	learn: 3536.5601629	total: 12.3s	remaining: 5.26s
800:	learn: 3445.5483997	total: 14.3s	remaining: 3.55s
900:	learn: 3354.0648055	total: 16s	remaining: 1.76s
999:	learn: 3260.4124513	total: 17.6s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x24caacc7920>

In [167]:
model.predict(X)

array([17889.07616493,  4470.39528501,  5408.04012112, ...,
        2141.43028434,  2629.03072533, 28891.10084048])

# Working with dataset

There are several ways of passing dataset to training - using X,y (the initial matrix) or using Pool class.
Pool class is the class for storing the dataset. In the next few blocks we'll explore the ways to create a Pool object.

You can use Pool class if the dataset has more than just X and y (for example, it has sample weights or groups) or if the dataset is large and it takes long time to read it into python.

In [169]:
from catboost import Pool
pool = Pool(data=X, label=y, cat_features=cat_features)

## Split your data into train and validation

In [171]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features
)

validation_pool = Pool(
    data=X_test,
    label=y_test,
    cat_features=cat_features
)

## Stdout of the training

In [177]:
model = CatBoostRegressor(
    iterations=15,
    verbose=5,
)
model.fit(train_pool, eval_set=validation_pool);

Learning rate set to 0.5
0:	learn: 7608.2906506	test: 7667.5765651	best: 7667.5765651 (0)	total: 9.46ms	remaining: 132ms
5:	learn: 4604.5635186	test: 4106.7789446	best: 4106.7789446 (5)	total: 109ms	remaining: 164ms
10:	learn: 4429.2010329	test: 4065.7452484	best: 4039.2627286 (8)	total: 167ms	remaining: 60.7ms
14:	learn: 4259.1387065	test: 4064.9212176	best: 4039.2627286 (8)	total: 224ms	remaining: 0us

bestTest = 4039.262729
bestIteration = 8

Shrink model to first 9 iterations.


## Cross-validation

In [146]:
from catboost import cv

params = {
    'loss_function': 'RMSE',
    'iterations': 5,
    # 'custom_loss': 'MAE',
    'learning_rate': 0.5,
}

cv_data = cv(
    params = params,
    pool = train_pool,
    fold_count=5,
    shuffle=True,
    plot=True,
    stratified=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 11020.4170379	test: 10229.5747867	best: 10229.5747867 (0)	total: 18.2ms	remaining: 72.8ms
1:	learn: 7564.1037916	test: 7145.8261298	best: 7145.8261298 (1)	total: 39.5ms	remaining: 59.2ms
2:	learn: 5706.4232752	test: 5673.1239325	best: 5673.1239325 (2)	total: 50ms	remaining: 33.3ms
3:	learn: 4959.6268650	test: 5203.0959648	best: 5203.0959648 (3)	total: 57.2ms	remaining: 14.3ms
4:	learn: 4702.2309638	test: 5082.9447107	best: 5082.9447107 (4)	total: 68.5ms	remaining: 0us

bestTest = 5082.944711
bestIteration = 4

Training on fold [1/5]
0:	learn: 10816.7648860	test: 11472.5918550	best: 11472.5918550 (0)	total: 9ms	remaining: 36ms
1:	learn: 7553.0616211	test: 8002.3406266	best: 8002.3406266 (1)	total: 21.8ms	remaining: 32.7ms
2:	learn: 5718.3298783	test: 5962.5792864	best: 5962.5792864 (2)	total: 30.4ms	remaining: 20.3ms
3:	learn: 5046.0593431	test: 5171.2657168	best: 5171.2657168 (3)	total: 45.3ms	remaining: 11.3ms
4:	learn: 4779.3942409	test: 4823.3890861	

In [148]:
cv_data

Unnamed: 0,iterations,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,10910.552582,1079.964769,10858.930456,177.928727
1,1,7688.131595,759.830751,7532.286916,103.499961
2,2,5878.931108,475.681733,5702.507158,79.94988
3,3,5197.244618,331.709921,5009.393445,76.115483
4,4,4949.841276,291.660952,4760.460059,72.409281


In [150]:
best_value = np.min(cv_data['test-RMSE-mean'])
best_iter = np.argmin(cv_data['test-RMSE-mean'])

print('Best validation RMSE score: {:.4f}±{:.4f} on iteration {}'.format(
    best_value,
    cv_data['test-RMSE-mean'][best_iter],
    best_iter)
)

Best validation RMSE score: 4949.8413±4949.8413 on iteration 4


## Hyperparameter Tuning

In [152]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "learning_rate": [0.001, 0.01, 0.5],
}

clf = CatBoostRegressor(
    iterations=20,
    cat_features=cat_features,
    verbose=20
)
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3)
results = grid_search.fit(X_train, y_train)
results.best_estimator_.get_params()

0:	learn: 12101.1885964	total: 13.8ms	remaining: 263ms
19:	learn: 11945.5933585	total: 280ms	remaining: 0us
0:	learn: 11968.0006121	total: 11ms	remaining: 210ms
19:	learn: 11808.9334478	total: 234ms	remaining: 0us
0:	learn: 11816.0675780	total: 11.7ms	remaining: 222ms
19:	learn: 11653.8141827	total: 374ms	remaining: 0us
0:	learn: 12021.0930219	total: 9.89ms	remaining: 188ms
19:	learn: 10563.0000735	total: 255ms	remaining: 0us
0:	learn: 11893.2730390	total: 12.4ms	remaining: 236ms
19:	learn: 10441.0750438	total: 255ms	remaining: 0us
0:	learn: 11738.7280820	total: 10.9ms	remaining: 206ms
19:	learn: 10253.6015209	total: 271ms	remaining: 0us
0:	learn: 7978.5386760	total: 10.4ms	remaining: 197ms
19:	learn: 4298.3894185	total: 314ms	remaining: 0us
0:	learn: 8058.9321400	total: 13.9ms	remaining: 264ms
19:	learn: 4061.8040490	total: 303ms	remaining: 0us
0:	learn: 7780.7621946	total: 13.8ms	remaining: 262ms
19:	learn: 3880.3716319	total: 299ms	remaining: 0us
0:	learn: 7608.2906506	total: 8.02ms

{'iterations': 20,
 'loss_function': 'RMSE',
 'verbose': 20,
 'cat_features': [1, 3, 4, 5],
 'learning_rate': 0.5}