# Solving classification problems with CatBoost

In this tutorial we will use the dataset from the Amazon Employee Access Challenge, a [Kaggle](https://www.kaggle.com) competition, for our experiments. The data can be downloaded [here](https://www.kaggle.com/c/amazon-employee-access-challenge/data).

In [None]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)

import catboost
print(catboost.__version__)

## Reading the data

In [None]:
from catboost.datasets import amazon

# Load the train and test parts of the Amazon dataset.
train_df, test_df = amazon()

# Preview the beginning of the training data.
train_df.head()

## Preparing the data

Label values extraction

In [None]:
# The ACTION column is the target; every remaining column is a feature.
y = train_df['ACTION']
X = train_df.drop(columns=['ACTION'])

Categorical features declaration

In [None]:
# Declare every column of X as categorical, using its positional index.
cat_features = [*range(X.shape[1])]
print(cat_features)

Looking at the label balance in the dataset

In [None]:
# Show the distinct label values, then how many rows carry each label.
print('Labels: {}'.format(set(y)))

# The count arithmetic relies on the labels being 0/1, so the sum of y is
# the number of ones. Use the vectorised Series.sum() once instead of
# calling the Python builtin sum() over the Series twice.
one_count = y.sum()
print('Zero count = {}, One count = {}'.format(len(y) - one_count, one_count))

Split the data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Hold out part of the labelled data for validation: train_size=0.8 keeps
# 80% of the rows for training, and random_state=0 fixes the split.
data = train_test_split(X, y, train_size=0.8, random_state=0)
X_train, X_validation, y_train, y_validation = data

# Wrap each split in a Pool together with its labels and the indices of the
# categorical features, so later cells can pass the pools around directly.
train_pool = Pool(
    data=X_train, 
    label=y_train, 
    cat_features=cat_features
)

validation_pool = Pool(
    data=X_validation, 
    label=y_validation, 
    cat_features=cat_features
)

print('Train dataset shape: {}\n'.format(train_pool.shape))

## Training

In [None]:
from catboost import CatBoostClassifier

# Train a deliberately small model (5 iterations) on the training pool,
# passing the validation pool as eval_set and silencing the training log.
model = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
)
model.fit(train_pool, eval_set=validation_pool, verbose=False)

# Confirm that fitting completed and show the parameters the model reports.
print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:\n{}'.format(model.get_params()))

## Stdout of the training

In [None]:
# No verbose argument is passed to fit() here, so the library's default
# training output is what this cell shows.
model = CatBoostClassifier(
    iterations=15,
    # Uncomment the next line to experiment with the logging setting.
#     verbose=5,
)
model.fit(train_pool, eval_set=validation_pool);

## Metrics calculation and graph plotting

In [None]:
# Request two additional metrics (AUC and Accuracy) via custom_loss.
model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy']
)

# Text logging is off; plot=True asks for the training chart instead.
# The trailing semicolon suppresses the cell's return-value output.
model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

## Model comparison

In [None]:
# Two models that differ only in learning rate. Each is given its own
# train_dir so the two runs are kept apart and can be compared afterwards.
# NOTE(review): "learing" is misspelled, but the directory names must stay
# identical to the ones passed to MetricVisualizer in the next cell.
model1 = CatBoostClassifier(
    learning_rate=0.7,
    iterations=100,
    train_dir='learing_rate_0.7'
)

model2 = CatBoostClassifier(
    learning_rate=0.01,
    iterations=100,
    train_dir='learing_rate_0.01'
)

model1.fit(train_pool, eval_set=validation_pool, verbose=False)
model2.fit(train_pool, eval_set=validation_pool, verbose=False);

In [None]:
from catboost import MetricVisualizer
# Visualize both runs together; the names must match the train_dir values
# given to model1 and model2.
MetricVisualizer(['learing_rate_0.7', 'learing_rate_0.01']).start()

## Best iteration

In [None]:
# Train with an eval_set so the effect of best-iteration selection can be
# seen in the tree count printed by the next cell.
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.5,
    # Uncomment the next line to pass use_best_model=False explicitly and
    # compare the resulting tree count.
#     use_best_model=False
)
model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

In [None]:
# Report how many trees the fitted model contains.
print('Tree count: {}'.format(model.tree_count_))

## Overfitting Detector

In [None]:
# Same setup as before plus early_stopping_rounds=20, the overfitting
# detector setting this section demonstrates.
# NOTE(review): presumably training stops once the validation metric has not
# improved for 20 iterations -- confirm against the CatBoost docs.
model_with_early_stop = CatBoostClassifier(
    iterations=200,
    learning_rate=0.5,
    early_stopping_rounds=20
)

model_with_early_stop.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

In [None]:
# Tree count of the fitted model, to compare with the 200 iterations requested.
print(model_with_early_stop.tree_count_)

### Overfitting Detector with eval metric

In [None]:
# Repeat the early-stopping run, this time with eval_metric='AUC' set
# explicitly on the classifier.
model_with_early_stop = CatBoostClassifier(
    eval_metric='AUC',
    iterations=200,
    learning_rate=0.5,
    early_stopping_rounds=20
)
model_with_early_stop.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

In [None]:
# Tree count of the model trained with eval_metric='AUC'.
print(model_with_early_stop.tree_count_)

## Cross-validation

In [None]:
from catboost import cv

# Training parameters shared by every fold: Logloss as the loss function,
# with AUC requested as an additional metric.
params = {
    'loss_function': 'Logloss',
    'iterations': 80,
    'custom_loss': 'AUC',
    'learning_rate': 0.5,
}

# 5-fold stratified cross-validation on the training pool. Shuffling is
# enabled and partition_random_seed=0 fixes how rows are assigned to folds.
cv_data = cv(
    params = params,
    pool = train_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=True,
    verbose=False
)

In [None]:
# Show the first 10 rows of the cross-validation results.
cv_data.head(10)

In [None]:
# Find the step with the lowest mean validation Logloss across folds.
best_value = cv_data['test-Logloss-mean'].min()
best_iter = cv_data['test-Logloss-mean'].values.argmin()

# best_iter is a *position* returned by NumPy's argmin, so look the matching
# standard deviation up positionally with .iloc instead of relying on the
# index labels happening to coincide with positions.
print('Best validation Logloss score: {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-Logloss-std'].iloc[best_iter],
    best_iter)
)

## Grid Search

In [None]:
# Search over every combination of the listed learning rates and depths,
# using AUC as the evaluation metric.
model = CatBoostClassifier(iterations=10, eval_metric='AUC')
grid = {'learning_rate': [0.001, 0.01, 0.1, 1.0, 10.0], 'depth': [4, 5, 6]}
result = model.grid_search(grid, train_pool)

In [None]:
# The result holds the chosen parameters under 'params' and the
# per-iteration cross-validation metrics under 'cv_results'.
print('Best parameters: {}\n'.format(result['params']))

msg = 'Mean AUC value on validation set per each iteration:\n{}'
print(msg.format(np.round(result['cv_results']['test-AUC-mean'], 4)))

In [None]:
# Inspect the parameters of the model object after the grid search.
model.get_params()

In [None]:
# Predict on the validation pool with the model object used for the search.
model.predict(validation_pool)

In [None]:
# Run the same search again with plotting on and text logging off.
model.grid_search(grid, train_pool, plot=True, verbose=False);

You can find more about parameter tuning in this [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/hyperparameters_tuning/hyperparameters_tuning.ipynb).

## Model predictions

In [None]:
# Train the model used in the prediction examples below.
# Note that no eval_set is passed to fit() in this cell.
model = CatBoostClassifier(iterations=200, learning_rate=0.03)

model.fit(
    train_pool,
    verbose=False,
    plot=True
);

In [None]:
# Predicted probabilities for the validation features.
print(model.predict_proba(X_validation))

In [None]:
# Ask for the raw formula values instead of class labels; the next cell
# passes them through a sigmoid to obtain probabilities.
raw_pred = model.predict(
    X_validation,
    prediction_type='RawFormulaVal'
)
 
print(raw_pred)

In [None]:
def sigmoid(x):
    """Map raw scores to probabilities with the logistic function.

    Computes ``1 / (1 + exp(-x))`` element-wise, but only ever exponentiates
    a non-positive number, so inputs of large magnitude cannot overflow.

    Args:
        x: A scalar or array-like of raw scores.

    Returns:
        Values in [0, 1] with the same shape as ``x``: a NumPy scalar for
        scalar input, an ndarray otherwise.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in [0, 1], so this exponential never overflows.
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the algebraically equal
    # form e^x / (1 + e^x). Both branches are expressed through `decay`.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # arrays of higher dimension unchanged.
    return result[()]
 
# Convert the raw formula values into probabilities with the sigmoid.
probabilities = sigmoid(raw_pred)
 
print(probabilities)

In [None]:
# predict() with its default prediction type.
print(model.predict(X_validation))

## Select decision boundary

![](https://habrastorage.org/webt/y4/1q/yq/y41qyqfm9mcerp2ziys48phpjia.png)

In [None]:
import matplotlib.pyplot as plt
from catboost.utils import get_roc_curve
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve

# ROC curve of the model on the validation pool, unpacked into its parts.
curve = get_roc_curve(model, validation_pool)
(fpr, tpr, thresholds) = curve

# Derive FPR and FNR as functions of the threshold from the same curve.
# Note: these assignments rebind `thresholds` and `fpr` from the unpacking
# above; the plotting cell uses the rebound values.
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)

In [None]:
# Plot FPR and FNR against the decision threshold on one figure.
plt.figure(figsize=(16, 8))
# Line style shared by both curves.
style = {'alpha':0.5, 'lw':2}

plt.plot(thresholds, fpr, color='blue', label='FPR', **style)
plt.plot(thresholds, fnr, color='green', label='FNR', **style)

# Axis limits, tick sizes, grid, labels, title and legend.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16);

In [None]:
from catboost.utils import select_threshold

# Thresholds selected for a target FNR of 0.01 and for a target FPR of 0.01.
print(select_threshold(model, validation_pool, FNR=0.01))
print(select_threshold(model, validation_pool, FPR=0.01))

## Metric evaluation on a new dataset

In [None]:
# Evaluate Logloss and AUC for the already trained model on the validation
# pool, with plotting enabled.
metrics = model.eval_metrics(
    data=validation_pool,
    metrics=['Logloss','AUC'],
    plot=True
)

In [None]:
# The result is keyed by metric name; print the AUC values as an array.
print('AUC values:\n{}'.format(np.array(metrics['AUC'])))

## Feature importances

### Prediction values change

In [None]:
# Feature importances with the default importance type, in prettified form.
model.get_feature_importance(prettified=True)

### Loss function change

In [None]:
# 'LossFunctionChange' importances, computed on the validation pool.
model.get_feature_importance(
    validation_pool, 
    'LossFunctionChange', 
    prettified=True
)

### Shap values

In [None]:
# Request 'ShapValues' for every object in the training pool.
shap_values = model.get_feature_importance(
    train_pool, 
    'ShapValues'
)

# The last column is split off as the expected value (taken from the first
# row); the remaining columns are kept as the per-feature SHAP values.
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]

print(shap_values.shape)

In [None]:
import shap

# Initialise shap's JavaScript support, then draw a force plot for the
# object at row index 2.
shap.initjs()
shap.force_plot(
    expected_value,
    shap_values[2,:],
    feature_names=train_pool.get_feature_names()
)

In [None]:
# Force plot for the object at row index 0.
shap.force_plot(
    expected_value,
    shap_values[0,:],
    feature_names=train_pool.get_feature_names()
)

In [None]:
# Summary plot of the SHAP values against the training features.
shap.summary_plot(shap_values, X_train)

You can find more information about SHAP value usage in this [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/model_analysis/shap_values_tutorial.ipynb).

## Tree Visualization

In [None]:
# A tiny model (2 iterations, depth 1) fitted on a 3-row toy dataset, so
# that each tree is small enough to visualise.
model = CatBoostClassifier(iterations=2, depth=1)

features = [
    [1, 2, 3], 
    [4, 5, 6],
    [7, 8, 9]
]
labels = [1, 0, 1]

model.fit(features, labels, verbose=False);

In [None]:
from IPython.display import display

# Render both trees of the toy model (indices 0 and 1).
display(model.plot_tree(0))
display(model.plot_tree(1))

In [None]:
# Raw formula value of the toy model for a single hand-written object,
# to compare against the trees drawn above.
x = [0, 3, 2]

raw_pred = model.predict(x, prediction_type='RawFormulaVal')
print(raw_pred)

## Snapshotting

In [None]:
# Uncomment to delete an existing snapshot file before training.
# !rm 'catboost_info/snapshot.bkp'

# Enable snapshot saving to 'snapshot.bkp' with snapshot_interval=1.
# NOTE(review): presumably this lets an interrupted run resume from the
# snapshot, and the interval is in seconds -- confirm against the CatBoost docs.
model = CatBoostClassifier(
    iterations=200,
    save_snapshot=True,
    snapshot_file='snapshot.bkp',
    snapshot_interval=1
)

model.fit(train_pool, eval_set=validation_pool, verbose=10);

## Saving the model

In [None]:
# Train a small model and save it twice: once with the default format and
# once as JSON.
model = CatBoostClassifier(iterations=10)
model.fit(train_pool, eval_set=validation_pool, verbose=False)
model.save_model('catboost_model.bin')
model.save_model('catboost_model.json', format='json')

In [None]:
# Load the saved binary model back into the existing `model` object, then
# inspect its parameters and learning rate.
model.load_model('catboost_model.bin')
print(model.get_params())
print(model.learning_rate_)