# Training Gradient Boosting Models with CatBoost

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/events/2020_06_04_catboost_tutorial/catboost_features.ipynb)

## Libraries installation

In [None]:
# # For Google Colaboratory:
# !pip install catboost scikit-learn shap

# # For your machine:
# !pip install --user -U ipywidgets catboost scikit-learn shap
# !jupyter nbextension enable --py widgetsnbextension

In [None]:
import os
import pandas as pd
import numpy as np
# Print NumPy floats with 4 digits of precision to keep cell output compact.
np.set_printoptions(precision=4)

import catboost
# Record the CatBoost version the notebook is run with.
print(catboost.__version__)

## Reading the data

In [None]:
from catboost.datasets import msrank_10k

# Workaround for "URLError: SSL: CERTIFICATE_VERIFY_FAILED": uncomment the
# two lines below to skip HTTPS certificate verification for the download.
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# The loader returns the train and test parts as a pair.
train_df, test_df = msrank_10k()

In [None]:
# Show the first rows of the training frame (displayed as the cell output).
train_df.head()

## Preparing the data

Label values extraction

In [None]:
# Feature matrix: everything except columns 0 and 1.
X = train_df.drop(columns=[0, 1])
# Target: column 0 holds the label values.
y = train_df[0]

There are several ways to create a `Pool` object. If you have a big dataset, it is faster to load the data from a file than from a pandas DataFrame.

In [None]:
# Directory that will hold the CSV copies of the dataset.
dataset_dir = './msrank_10k'
# exist_ok=True makes this idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dataset_dir, exist_ok=True)

# Dump both parts as comma-separated files with a header row and no index
# column, so they can be loaded straight from disk later.
train_df.to_csv(
    os.path.join(dataset_dir, 'train.csv'),
    index=False, sep=',', header=True
)
test_df.to_csv(
    os.path.join(dataset_dir, 'test.csv'),
    index=False, sep=',', header=True
)

In [None]:
# Shell escape: print the header and the first data row of the written CSV.
!head -2 msrank_10k/train.csv

In [None]:
from catboost.utils import create_cd

# One generated name per feature; two columns of the frame (label and
# auxiliary) are not features, hence the "- 2".
feature_names = {
    i: 'Feature ' + str(i)
    for i in range(train_df.shape[1] - 2)
}

# Write the column description file: column 0 is the label and column 1 is
# declared auxiliary.
create_cd(
    output_path=os.path.join(dataset_dir, 'train.cd'),
    label=0,
    auxiliary_columns=[1],
    feature_names=feature_names,
)

In [None]:
# Shell escape: show the first lines of the generated column description file.
!head msrank_10k/train.cd

In [None]:
from catboost import Pool

# Way 1: build the pool from the in-memory features and labels.
pool1 = Pool(label=y, data=X)

# Way 2: build the pool from the CSV on disk, guided by the column
# description file written earlier.
pool2 = Pool(
    data=os.path.join(dataset_dir, 'train.csv'),
    column_description=os.path.join(dataset_dir, 'train.cd'),
    has_header=True,
    delimiter=',',
)

print('Dataset shape: {}\n'.format(pool1.shape))

## Split your data into train and validation

In [None]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training; the fixed seed keeps the split reproducible.
data = train_test_split(X, y, random_state=0, train_size=0.8)
(X_train, X_validation, y_train, y_validation) = data

# Wrap each part in its own Pool.
train_pool = Pool(label=y_train, data=X_train)
validation_pool = Pool(label=y_validation, data=X_validation)

## Dataset Quantization

Feature quantization. It is more efficient to quantize the features once before running several trainings.

In [None]:
# Quantize the training pool with border_count=254.
train_pool.quantize(
    border_count=254,
    # per_float_feature_quantization=['0:border_count=1024']
)

# Save the borders computed on the training pool to a file...
train_pool.save_quantization_borders('borders.tsv')

# ...and quantize the validation pool with those same borders.
validation_pool.quantize(input_borders='borders.tsv')

In [None]:
from catboost.utils import quantize

# Build a quantized pool directly from the CSV on disk; this rebinds pool2.
pool2 = quantize(
    data_path=os.path.join(dataset_dir, 'train.csv'),
    column_description=os.path.join(dataset_dir, 'train.cd'),
    has_header=True,
    delimiter=',',
)

## Training

In [None]:
from catboost import CatBoostRegressor

# A deliberately tiny model: 5 boosting iterations.
model = CatBoostRegressor(learning_rate=0.1, iterations=5)
model.fit(train_pool, verbose=False, eval_set=validation_pool)

# Confirm the fit and show the explicitly-set parameters.
print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:\n{}'.format(model.get_params()))

## Stdout of the training

In [None]:
# No verbose argument is passed, so training uses the default logging.
# Experiment: add verbose=5 to the constructor to change the logging period.
model = CatBoostRegressor(iterations=15)
model.fit(train_pool, eval_set=validation_pool);

## Metrics calculation and graph plotting

In [None]:
# Track MAE and R2 in addition to the optimized loss.
model = CatBoostRegressor(
    custom_metric=['MAE', 'R2'],
    learning_rate=0.2,
    iterations=200,
)

# plot=True draws the interactive training chart; text logging is off.
model.fit(train_pool, eval_set=validation_pool, plot=True, verbose=False);

## Best iteration

In [None]:
# MAE is the evaluation metric here.
# Experiment: add use_best_model=False to the constructor and compare the
# tree count printed in the next cell.
model = CatBoostRegressor(
    learning_rate=0.5,
    eval_metric='MAE',
    iterations=100,
)
model.fit(train_pool, eval_set=validation_pool, plot=True, verbose=False);

In [None]:
# Number of trees in the fitted model.
print('Tree count: ' + str(model.tree_count_))

## Grid Search

In [None]:
# Candidate values: every combination of learning rate and depth is tried.
grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'depth': [4, 5, 6],
}

pool = Pool(label=y_train, data=X_train)
model = CatBoostRegressor(eval_metric='MAE', iterations=10)
result = model.grid_search(grid, pool)

In [None]:
msg = 'Mean MAE value on validation set per each iteration:\n{}'

# Best parameter combination found by the search.
print('Best parameters: {}\n'.format(result['params']))
# Mean test MAE from the search's cv_results, rounded to 4 decimals.
print(msg.format(np.round(result['cv_results']['test-MAE-mean'], decimals=4)))

In [None]:
# Display the model's parameters after the grid search.
model.get_params()

In [None]:
# Display the model's predictions for the validation pool.
model.predict(validation_pool)

In [None]:
# Same grid, more iterations; progress is shown as a plot instead of text.
model = CatBoostRegressor(eval_metric='MAE', iterations=100)
model.grid_search(grid, pool, verbose=False, plot=True);

You can find more about parameter tuning in this [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/hyperparameters_tuning/hyperparameters_tuning.ipynb).

## Feature importances

### Prediction values change

In [None]:
# Default feature importance as a table; show the first rows.
model.get_feature_importance(prettified=True).head()

### Loss function change

In [None]:
# LossFunctionChange importance is computed on the validation pool.
model.get_feature_importance(
    data=validation_pool,
    type='LossFunctionChange',
    prettified=True,
).head()

### Shap values

In [None]:
model = CatBoostRegressor(learning_rate=0.1, iterations=1000)
model.fit(train_pool, eval_set=validation_pool, plot=True, verbose=False);

# One row per training object.
shap_values = model.get_feature_importance(data=train_pool, type='ShapValues')

# The last column is taken as the expected value; the remaining columns
# are kept as the SHAP values.
expected_value = shap_values[0, -1]
shap_values = shap_values[:, :-1]

print(shap_values.shape)

In [None]:
import shap

# Load the JS visualization code into the notebook before plotting.
shap.initjs()

# Force plot for the training object at row 1.
shap.force_plot(
    expected_value,
    shap_values[1],
    feature_names=train_pool.get_feature_names(),
)

In [None]:
# Force plot for the training object at row 7.
shap.force_plot(
    expected_value,
    shap_values[7],
    feature_names=train_pool.get_feature_names(),
)

In [None]:
# Summary plot of the SHAP values against the training features.
shap.summary_plot(shap_values, X_train)

You can find more information about SHAP value usage in this [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/model_analysis/shap_values_tutorial.ipynb).

## Tree Visualization

In [None]:
# Tiny hand-made dataset: three objects with three numeric features each.
features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
labels = [1, 0, 1]

# Two depth-1 trees, without boosting from the average label value.
model = CatBoostRegressor(depth=1, iterations=2, boost_from_average=False)
model.fit(features, labels, verbose=False);

In [None]:
# This cell requires the graphviz package.
# Download it from https://graphviz.gitlab.io/download/
# Installation can take a while, so you may prefer to do it later.

from IPython.display import display

# Render both trees of the two-iteration model.
display(model.plot_tree(0))
display(model.plot_tree(1))

In [None]:
# A single hand-made object with three feature values.
x = [0, 7, 2]

# predict() is given a list containing that one object.
raw_pred = model.predict([x])
print(raw_pred)

## Grow Policy

In [None]:
# Depthwise grow policy with depth 4.
model = CatBoostRegressor(grow_policy='Depthwise', depth=4, iterations=2)
model.fit(features, labels, verbose=False);

# Show the first tree.
display(model.plot_tree(0))

In [None]:
# Lossguide grow policy, limited to 16 leaves.
model = CatBoostRegressor(grow_policy='Lossguide', max_leaves=16, iterations=2)
model.fit(features, labels, verbose=False);

# Show the first tree.
display(model.plot_tree(0))

## Snapshotting

In [None]:
# Uncomment to delete a snapshot left over from a previous run:
# !rm 'catboost_info/snapshot.bkp'

# Save training progress to snapshot.bkp with snapshot_interval=1.
model = CatBoostRegressor(
    iterations=200,
    snapshot_file='snapshot.bkp',
    snapshot_interval=1,
    save_snapshot=True,
)

model.fit(train_pool, verbose=10, eval_set=validation_pool);

## Saving the model

In [None]:
model = CatBoostRegressor(iterations=10)
model.fit(train_pool, verbose=False, eval_set=validation_pool)

# Write the model twice: in the default format and as JSON.
model.save_model('catboost_model.bin')
model.save_model('catboost_model.json', format='json')

In [None]:
# Load the saved model back into the existing model object.
model.load_model('catboost_model.bin')
# Inspect the parameters and the learning rate reported after loading.
print(model.get_params())
print(model.learning_rate_)