# Solving classification problems with CatBoost

In this tutorial we will use the Amazon Employee Access Challenge dataset from a [Kaggle](https://www.kaggle.com) competition for our experiments. The data can be downloaded [here](https://www.kaggle.com/c/amazon-employee-access-challenge/data).

## Libraries installation

In [None]:
#!pip install --user --upgrade catboost
#!pip install --user --upgrade ipywidgets
#!pip install shap
#!pip install scikit-learn
#!pip install --upgrade numpy
#!jupyter nbextension enable --py widgetsnbextension

In [None]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)

import catboost
print(catboost.__version__)

## Reading the data

In [None]:
from catboost.datasets import amazon

(train_df, test_df) = amazon()

In [None]:
train_df.head()

## Preparing the data

Label values extraction

In [None]:
# Separate the target column from the feature columns.
y = train_df['ACTION']
X = train_df.drop(columns='ACTION')

Categorical features declaration

In [None]:
# Declare every column of X as categorical by listing all column indices.
n_columns = X.shape[1]
cat_features = list(range(n_columns))
print(cat_features)

Looking at the label balance in the dataset

In [None]:
# Show which label values occur and how many rows carry each one.
# The zero/one arithmetic below assumes a 0/1 target, as the printed message
# already does. Series.sum() is vectorized and computed once, whereas the
# builtin sum() walks the Series element by element (and was called twice).
one_count = int(y.sum())
print('Labels: {}'.format(set(y)))
print('Zero count = {}, One count = {}'.format(len(y) - one_count, one_count))

Ways to create Pool class

In [None]:
# Dump both frames to CSV files under dataset_dir (header row, no index
# column) so that the data can also be loaded from disk.
dataset_dir = './amazon'
# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dataset_dir, exist_ok=True)

train_df.to_csv(
    os.path.join(dataset_dir, 'train.csv'),
    index=False, sep=',', header=True
)
test_df.to_csv(
    os.path.join(dataset_dir, 'test.csv'),
    index=False, sep=',', header=True
)

In [None]:
!head -3 amazon/train.csv

In [None]:
from catboost.utils import create_cd

# Map each feature position to its column name. The first column of
# train_df is skipped because it is declared as the label (label=0) below.
feature_names = {
    position: column_name
    for position, column_name in enumerate(train_df.columns[1:])
}

# Write the column-description file next to the CSV dumps; every column
# after the label is declared categorical.
create_cd(
    label=0,
    cat_features=list(range(1, train_df.shape[1])),
    feature_names=feature_names,
    output_path=os.path.join(dataset_dir, 'train.cd')
)

In [None]:
!cat amazon/train.cd

In [None]:
from catboost import Pool

pool1 = Pool(data=X, label=y, cat_features=cat_features)

pool2 = Pool(
    data=os.path.join(dataset_dir, 'train.csv'), 
    delimiter=',', 
    column_description=os.path.join(dataset_dir, 'train.cd'),
    has_header=True
)

print('Dataset shape: {}\n'.format(pool1.shape))
print('Column names: {}'.format(pool1.get_feature_names()))

In [None]:
from catboost import CatBoostClassifier

CatBoostClassifier(iterations=3).fit(pool1)
CatBoostClassifier(iterations=3).fit(pool2)
CatBoostClassifier(iterations=3).fit(X, y, cat_features=cat_features);

## Split your data into train and validation

In [None]:
from sklearn.model_selection import train_test_split

data = train_test_split(X, y, train_size=0.8, random_state=0)
X_train, X_validation, y_train, y_validation = data

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
validation_pool = Pool(data=X_validation, label=y_validation, cat_features=cat_features)

## Selecting the objective function

Possible options for binary classification:

`Logloss`

`CrossEntropy` for probabilities in target

In [None]:
model = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
    # loss_function='CrossEntropy'
)
model.fit(train_pool, eval_set=validation_pool, verbose=False)

print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:\n{}'.format(model.get_params()))

## Stdout of the training

In [None]:
model = CatBoostClassifier(
    iterations=15,
#     verbose=5,
)
model.fit(train_pool, eval_set=validation_pool);

## Metrics calculation and graph plotting

In [None]:
model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy']
)

model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

## Model comparison

In [None]:
model1 = CatBoostClassifier(
    learning_rate=0.7,
    iterations=100,
    train_dir='learing_rate_0.7'
)

model2 = CatBoostClassifier(
    learning_rate=0.01,
    iterations=100,
    train_dir='learing_rate_0.01'
)

model1.fit(train_pool, eval_set=validation_pool, verbose=False)
model2.fit(train_pool, eval_set=validation_pool, verbose=False);

In [None]:
from catboost import MetricVisualizer
MetricVisualizer(['learing_rate_0.01', 'learing_rate_0.7']).start()

## Best iteration

In [None]:
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.5,
#     use_best_model=False
)
model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

In [None]:
print('Tree count: ' + str(model.tree_count_))

## Cross-validation

In [None]:
from catboost import cv

params = {
    'loss_function': 'Logloss',
    'iterations': 80,
    'custom_loss': 'AUC',
    'learning_rate': 0.5,
}

cv_data = cv(
    params = params,
    pool = train_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=False,
    verbose=False
)

In [None]:
cv_data.head()

In [None]:
# Locate the iteration with the lowest mean test Logloss across the folds.
best_value = cv_data['test-Logloss-mean'].min()
best_iter = cv_data['test-Logloss-mean'].values.argmin()

# argmin() on the underlying array yields a *position*, so the matching std
# is read positionally with .iloc instead of a label-based Series lookup.
print('Best validation Logloss score, not stratified: {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-Logloss-std'].iloc[best_iter],
    best_iter)
)

In [None]:
cv_data = cv(
    params = params,
    pool = train_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=True,
    verbose=False
)

# Locate the iteration with the lowest mean test Logloss across the folds.
best_value = cv_data['test-Logloss-mean'].min()
best_iter = cv_data['test-Logloss-mean'].values.argmin()

# argmin() on the underlying array yields a *position*, so the matching std
# is read positionally with .iloc instead of a label-based Series lookup.
print('Best validation Logloss score, stratified: {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-Logloss-std'].iloc[best_iter],
    best_iter)
)

## Overfitting Detector

In [None]:
model_with_early_stop = CatBoostClassifier(
    iterations=200,
    learning_rate=0.5,
    early_stopping_rounds=20
)

model_with_early_stop.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

In [None]:
print(model_with_early_stop.tree_count_)

### Overfitting Detector with eval metric

In [None]:
model_with_early_stop = CatBoostClassifier(
    eval_metric='AUC',
    iterations=200,
    learning_rate=0.5,
    early_stopping_rounds=20
)
model_with_early_stop.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

In [None]:
print(model_with_early_stop.tree_count_)

## Sum Model

In [None]:
splitted_data = train_test_split(X_train, y_train, train_size=0.5, random_state=1234)
X_train_first, X_train_second, y_train_first, y_train_second = splitted_data

common_params = {
    'cat_features': cat_features,
    'eval_set': (X_validation, y_validation),
    'verbose': False,
}

model1 = CatBoostClassifier(iterations=200)
model2 = CatBoostClassifier(iterations=200)

model1.fit(X_train_first, y_train_first, **common_params)
model2.fit(X_train_second, y_train_second, **common_params);

In [None]:
from sklearn.metrics import roc_auc_score

preds1 = model1.predict(X_validation, prediction_type='Probability')[:, 1]
preds2 = model2.predict(X_validation, prediction_type='Probability')[:, 1]

print('preds1: {}'.format(roc_auc_score(y_validation, preds1)))
print('preds2: {}'.format(roc_auc_score(y_validation, preds2)))

In [None]:
from catboost import sum_models

model3 = sum_models((model1, model2))
preds3 = model3.predict(X_validation, prediction_type='Probability')[:, 1]
print('preds3: {}'.format(roc_auc_score(y_validation, preds3)))

## Select decision boundary

In [None]:
model = CatBoostClassifier(iterations=200, learning_rate=0.03)

model.fit(
    X_train, y_train,
    cat_features=cat_features,
    verbose=False,
    plot=True
);

![](https://habrastorage.org/webt/y4/1q/yq/y41qyqfm9mcerp2ziys48phpjia.png)

In [None]:
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics

curve = get_roc_curve(model, validation_pool)
(fpr, tpr, thresholds) = curve
roc_auc = sklearn.metrics.auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(16, 8))
style = {'alpha':0.5, 'lw':2}

title = 'ROC curve (area = {:.2f})'.format(roc_auc)
plt.plot(fpr, tpr, color='darkorange', label=title, **style)

plt.plot([0, 1], [0, 1], color='navy', linestyle='--', **style)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=20)
plt.legend(loc="lower right", fontsize=16);

In [None]:
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve

(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)

In [None]:
plt.figure(figsize=(16, 8))
style = {'alpha':0.5, 'lw':2}

plt.plot(thresholds, fpr, color='blue', label='FPR', **style)
plt.plot(thresholds, fnr, color='green', label='FNR', **style)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16);

In [None]:
from catboost.utils import select_threshold

print(select_threshold(model, validation_pool, FNR=0.01))
print(select_threshold(model, validation_pool, FPR=0.01))

## Snapshotting

In [None]:
# !rm 'catboost_info/snapshot.bkp'

model = CatBoostClassifier(
    iterations=100,
    save_snapshot=True,
    snapshot_file='snapshot.bkp',
    snapshot_interval=1
)

model.fit(train_pool, eval_set=validation_pool, verbose=10);

## Model predictions

In [None]:
print(model.predict_proba(X_validation))

In [None]:
print(model.predict(X_validation))

In [None]:
raw_pred = model.predict(
    X_validation,
    prediction_type='RawFormulaVal'
)

print(raw_pred)

In [None]:
from numpy import exp


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of *x*.

    Uses numpy's ``exp``, so *x* may be a scalar or a numpy array (the
    computation is then applied element-wise).
    """
    return 1 / (1 + exp(-x))

probabilities = sigmoid(raw_pred)

print(probabilities)

In [None]:
from catboost import FeaturesData

X_prepared = X_validation.values.astype(str).astype(object)
# For the FeaturesData class, categorical features must have type str

fast_predictions = model.predict_proba(
    FeaturesData(
        cat_feature_data=X_prepared,
        cat_feature_names=list(X_validation)
    )
)
print(fast_predictions)

## Staged prediction

In [None]:
# Request class probabilities stage by stage for the tree range [0, 3).
predictions_gen = model.staged_predict_proba(
    X_validation,
    ntree_start=0,
    ntree_end=3
)

# Iteration stays best-effort (a failure must not abort the notebook), but
# the error is now reported instead of being swallowed silently.
try:
    for iteration, predictions in enumerate(predictions_gen):
        print('Iteration {}, predictions:'.format(iteration))
        print(predictions)
except Exception as error:
    print('Staged prediction stopped early: {!r}'.format(error))

## Solving MultiClassification problem

In [None]:
model = CatBoostClassifier(loss_function='MultiClass', iterations=50)

model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
    plot=True
);

## Metric evaluation on a new dataset

In [None]:
model = CatBoostClassifier(iterations=200, learning_rate=0.03)

model.fit(train_pool, verbose=50);

In [None]:
metrics = model.eval_metrics(
    data=validation_pool,
    metrics=['Logloss','AUC'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
print('AUC values:\n{}'.format(np.array(metrics['AUC'])))


## Feature importances

In [None]:
model.get_feature_importance(prettified=True)

## Shap values

In [None]:
shap_values = model.get_feature_importance(pool1, type='ShapValues')
print(shap_values.shape)

In [None]:
import shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(Pool(X, y, cat_features=cat_features))

shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[10,:], X.iloc[10,:])

In [None]:
import shap
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[91,:], X.iloc[91,:])

In [None]:
shap.summary_plot(shap_values, X)

## Feature evaluation

In [None]:
# Import the one name this cell needs explicitly; a wildcard import hides
# where CatboostEvaluation comes from and can shadow notebook variables.
from catboost.eval.catboost_evaluation import CatboostEvaluation

# Training configuration used for every evaluation run.
params = {
    'iterations': 20, # 2000
    'learning_rate': 0.5, # we set big learning_rate,
                          # because we have small
                          # #iterations
    'verbose': False,
    'loss_function' : 'Logloss',
    'boosting_type': 'Plain',
}

# The evaluator reads the CSV dump and its column-description file.
evaluator = CatboostEvaluation(
    'amazon/train.csv',
    fold_size=10000, # <= 50% of dataset
    fold_count=20,
    column_description='amazon/train.cd',
    partition_random_seed=0,
    delimiter=',',
    has_header=True,
    #working_dir=... 
)

# Evaluate the features with indices 6, 7 and 8 on Logloss and Accuracy.
result = evaluator.eval_features(
    learn_config=params,
    eval_metrics=['Logloss', 'Accuracy'],
    features_to_eval=[6, 7, 8]
)

In [None]:
logloss_result = result.get_metric_results('Logloss')
logloss_result.get_baseline_comparison()

## Saving the model

In [None]:
model = CatBoostClassifier(iterations=10)
model.fit(train_pool, eval_set=validation_pool, verbose=False)
model.save_model('catboost_model.bin')
model.save_model('catboost_model.json', format='json')

In [None]:
model.load_model('catboost_model.bin')
print(model.get_params())
print(model.learning_rate_)

## Hyperparameter tuning

### Training speed

In [None]:
from catboost import CatBoost
fast_model = CatBoostClassifier(
    iterations=150,
    learning_rate=0.01,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    subsample=0.5,
    one_hot_max_size=20,
    rsm=0.5,
    leaf_estimation_iterations=5,
    max_ctr_complexity=1)

fast_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    verbose=False,
    plot=True
);

### Accuracy

In [None]:
tunned_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    l2_leaf_reg=3,
    bagging_temperature=1,
    random_strength=1,
    one_hot_max_size=2,
    leaf_estimation_method='Newton'
)
tunned_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    verbose=False,
    eval_set=(X_validation, y_validation),
    plot=True
);

## Training the model after parameter tuning

In [None]:
best_model = CatBoostClassifier(iterations=int(tunned_model.tree_count_ * 1.2))
best_model.fit(
    X, y,
    cat_features=cat_features,
    verbose=100
);

## Prepare the submission

In [None]:
# Predict class probabilities for the competition test set with the final
# model. The 'id' column is dropped so that only feature columns go into
# the Pool.
X_test = test_df.drop('id', axis=1)
test_pool = Pool(data=X_test, cat_features=cat_features)
contest_predictions = best_model.predict_proba(test_pool)
print('Predictions:')  # fixed typo in the printed label ("Predictoins")
print(contest_predictions)

## Calculate predictions for the contest

In [None]:
# Write the submission file: a header row followed by one "Id,Action" row
# per test example, where Action is column 1 of the predicted probabilities.
# The with-block closes the file even if a write fails part-way through.
with open('submit.csv', 'w') as f:
    f.write('Id,Action\n')
    # Pair ids and predictions positionally instead of indexing by range(len()).
    for row_id, row_probabilities in zip(test_df['id'], contest_predictions):
        f.write(str(row_id) + ',' + str(row_probabilities[1]) + '\n')

Submit your solution [here](https://www.kaggle.com/c/amazon-employee-access-challenge/submit).
Good luck!!!