In [None]:
import pandas as pd
import numpy as np
import catboost as cb
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Notebook configuration: modelling switches first, then the input file layout.
LABEL = "label"                 # name of the target column
TASK = "CPU"                    # passed to CatBoost as `task_type`
TYPE = "classification"         # [classification, regression, multiclass]
SPLIT = 0.3                     # fraction of rows held out as the test set

# Input CSV and the columns to load from it.
file_name = "data.csv"
colnames = ["col1", "col2", "col3", "col4", "label"]
# Columns to treat as categorical.
# NOTE(review): 'col2' appears twice here — looks like a placeholder typo; confirm
# the intended second column.
category_cols = ["col2", "col2"]

In [None]:
# Load only the configured columns, report the frame's dimensions, then preview it.
data = pd.read_csv(filepath_or_buffer=file_name, usecols=colnames)
data_shape = data.shape
print(data_shape)
data.head()

In [None]:

# Encode categorical columns as integer codes, split into train/test sets,
# compute class weights (classification only) and wrap both splits in CatBoost pools.

# Seed for the train/test split so a Restart & Run All reproduces the same partition.
SPLIT_SEED = 42

# Positions (within `data`) of every categorical column except the last one listed.
# NOTE(review): nothing visible in this notebook reads `cat_dims`; confirm that the
# `[:-1]` is intentional before relying on it.
cat_dims = [data.columns.get_loc(i) for i in category_cols[:-1]]

# convert categorical columns to integers (missing values become code -1)
for header in dict.fromkeys(category_cols):  # dict.fromkeys: skip duplicate names
    data[header] = data[header].astype('category').cat.codes

X = data.drop(LABEL, axis=1)
Y = data[LABEL]

if TYPE == 'regression':
    train_x, test_x, train_y, test_y = train_test_split(
        X, Y, test_size=SPLIT, random_state=SPLIT_SEED)
    class_weights = None
else:
    # Stratify so both splits keep the label distribution.
    train_x, test_x, train_y, test_y = train_test_split(
        X, Y, test_size=SPLIT, stratify=Y, random_state=SPLIT_SEED)
    # `classes` and `y` must be passed by keyword: recent scikit-learn releases
    # reject the positional form. The result is ordered like np.unique(train_y)
    # and converted to a plain list before it is handed to CatBoost.
    class_weights = compute_class_weight(
        'balanced', classes=np.unique(train_y), y=train_y).tolist()

# Categorical feature names for the pools: de-duplicated, and without the label
# column, which is not part of X.
feature_cat_cols = [col for col in dict.fromkeys(category_cols) if col != LABEL]

train_pool = cb.Pool(train_x, label=train_y, cat_features=feature_cat_cols)
test_pool = cb.Pool(test_x, label=test_y, cat_features=feature_cat_cols)

In [None]:
# Pick the loss function, the metrics to report on the test set, and the metric
# used for model selection, based on the configured problem TYPE.
# NOTE: `eval` shadows the Python builtin of the same name; the name is kept
# because the evaluation cell further down reads it.
if TYPE == 'regression':
    loss = 'MAE'
    eval = ['RMSE', 'MAPE', 'MAE', 'R2', 'MedianAbsoluteError']
    metric = 'RMSE'
elif TYPE == 'classification':
    loss = 'Logloss'
    eval = ['Logloss', 'CrossEntropy', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC']
    metric = 'Accuracy'
elif TYPE == 'multiclass':
    loss = 'MultiClassOneVsAll'
    eval = ['MultiClass', 'MultiClassOneVsAll', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC']
    metric = 'Accuracy'
else:
    # Fail fast: otherwise `loss`/`eval`/`metric` would be undefined, or silently
    # keep stale values from an earlier run of this cell.
    raise ValueError(
        f"Unsupported TYPE {TYPE!r}; expected 'regression', 'classification' or 'multiclass'"
    )

# Candidate hyper-parameter values sampled by the randomized search.
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[50,100,250,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,50],
          'border_count':[32,5,10,20,50,100,200],
          }

In [None]:
# Build the base model used for hyper-parameter search.
search_params = {
    'thread_count': -1,
    'task_type': TASK,
    'loss_function': loss,
    'eval_metric': metric,
}
# Only pass class weights when they exist: for regression `class_weights` is None,
# and an explicit None should not be forwarded as a training parameter.
if class_weights is not None:
    search_params['class_weights'] = class_weights

model = cb.CatBoost(search_params)

# Sample 100 combinations from `params`, scoring each with 3-fold cross-validation
# on the training pool.
search_result = model.randomized_search(params, train_pool, n_iter=100, cv=3, plot=True)

In [None]:
# Display the cross-validation results stored under 'cv_results' by the search.
search_result['cv_results']

In [None]:
# Keep the parameter set stored under 'params' in the search result (applied to
# the final model in a later cell) and display it.
best = search_result['params']
best

In [None]:
# Build the final estimator for the configured problem TYPE, apply the best
# hyper-parameters found by the search, and fit it on the training pool.

# Settings shared by both estimator kinds; `task_type` is included so the final
# model trains on the same device as the search model.
final_params = {
    'thread_count': -1,
    'task_type': TASK,
    'loss_function': loss,
    'eval_metric': metric,
}

# The notebook only imports `catboost as cb`, so the estimator classes must be
# referenced through that namespace.
if TYPE == 'regression':
    model = cb.CatBoostRegressor(**final_params)
elif (TYPE == 'classification') or (TYPE == 'multiclass'):
    model = cb.CatBoostClassifier(**final_params, class_weights=class_weights)
else:
    # Without this, an unknown TYPE would silently refit the search model.
    raise ValueError(
        f"Unsupported TYPE {TYPE!r}; expected 'regression', 'classification' or 'multiclass'"
    )

model.set_params(**best)
model.fit(train_pool, plot=True)

In [None]:
# Compute every metric in `eval` on the test pool and print each metric's last
# recorded value.
results = model.eval_metrics(test_pool, eval)
for metric_name, history in results.items():
    print(metric_name, history[-1])

In [None]:
# Write the fitted model to the file 'model.cbm' (path relative to the working directory).
model.save_model('model.cbm')

In [None]:
# Feature importances as a table indexed by feature id.
importance_table = model.get_feature_importance(type='FeatureImportance', prettified=True)
importance_table.set_index('Feature Id')

In [None]:
# Bar chart of the feature importances, one bar per feature id.
importance_by_feature = (
    model
    .get_feature_importance(type='FeatureImportance', prettified=True)
    .set_index('Feature Id')
)
importance_by_feature.plot(kind='bar', figsize=(15, 8))

In [None]:
# Predict on the held-out pool, then report either absolute-error statistics with
# a histogram (regression) or a classification report with an annotated
# confusion-matrix heatmap (classification / multiclass).
predictions = model.predict(test_pool)
if TYPE == 'regression':
    # Clamp negative predictions to zero and round to 2 decimals. np.maximum is the
    # element-wise clamp; np.max(p, 0) would treat 0 as the axis and never clamp.
    # `preds` is kept for inspection; the error statistics below use the raw
    # predictions, as before.
    preds = np.round(np.maximum(predictions, 0), 2)
    diff = np.abs(predictions - test_y)

    mean = np.mean(diff)
    stdev = np.std(diff)
    median = np.median(diff)
    max_ = np.max(diff)
    min_ = np.min(diff)

    print('Error:')
    print('mean', mean)
    print('st dev', stdev)
    print('median', median)
    print('max', max_)
    print('min', min_)

    # Errors above this threshold are reported as a percentage and left out of the histogram.
    outlier = 10
    print(f'more than {outlier} days', np.round(len(diff[diff > outlier])/len(diff)*100, 3), '%')
    bin_counts, _, _ = plt.hist(diff[diff < outlier], bins=outlier)
    # Draw the mean/median markers as tall as the tallest bar. Looking up
    # diff.value_counts()[0] instead would raise KeyError whenever no error is exactly 0.
    height = bin_counts.max()
    plt.vlines(mean, 0, height, label='mean', color='r')
    plt.vlines(median, 0, height, label='median', color='g')
    plt.legend()
    plt.title('Error Distribution')
    plt.show()

elif (TYPE == 'classification') or (TYPE == 'multiclass'):
    # Class predictions may come back as a column vector (one column per row of
    # input) for multiclass models; flatten so they line up with test_y.
    predicted_labels = np.ravel(predictions)

    print("Classification Report")
    print(classification_report(test_y, predicted_labels))

    # Use every class seen in either the truth or the predictions so the matrix is
    # always square, even when some class is never predicted.
    class_labels = np.union1d(test_y, predicted_labels)
    cf_matrix = confusion_matrix(test_y, predicted_labels, labels=class_labels)
    cf_df = pd.DataFrame(
        cf_matrix,
        index=pd.Index(class_labels, name='Actual'),
        columns=pd.Index(class_labels, name='Predicted'),
    )

    group_counts = ["{0:0.0f}".format(value) for value in
                    cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in
                        cf_matrix.flatten()/np.sum(cf_matrix)]
    if cf_matrix.shape == (2, 2):
        # Binary case: name each cell (assumes the first class is the negative one).
        group_names = ['True Neg','False Pos','False Neg','True Pos']
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
                zip(group_names,group_counts,group_percentages)]
    else:
        # Multiclass: the TN/FP/FN/TP names do not apply; show count and share only.
        labels = [f"{v1}\n{v2}" for v1, v2 in
                zip(group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(cf_matrix.shape)
    sns.heatmap(cf_df, annot=labels, fmt='')
    plt.title('Confusion Matrix')
    plt.show()

