In [None]:
import pandas
import numpy as np
import catboost as cb

from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV

In [None]:
# Notebook configuration: run settings first, then the input file and its columns.
TYPE = 'classification' # [classification, regression, multiclass]
TASK = 'CPU'            # passed to CatBoost as `task_type`
SPLIT = 0.3             # passed to train_test_split as `test_size`
LABEL = 'label'         # target column; dropped from the feature frame

file_name = 'data.csv'
colnames = [
    'col1',
    'col2',
    'col3',
    'col4',
    'label',
]
# NOTE(review): 'col2' is listed twice -- looks like a placeholder typo; confirm the
# intended second categorical column.
category_cols = [
    'col2',
    'col2',
]

In [None]:
# Load only the configured columns from the CSV, then preview the first rows.
data = pandas.read_csv(filepath_or_buffer=file_name, usecols=colnames)
data.head(5)

In [None]:

# Encode the categorical columns as integer codes, split into train/test sets,
# and wrap both sets in CatBoost pools.

# Drop repeated names (order preserved) so each column is encoded once and
# appears once in the pools' cat_features.
cat_features = list(dict.fromkeys(category_cols))

# NOTE(review): cat_dims is not read by any later cell visible in this notebook, and
# the [:-1] slice skips the last configured column -- confirm whether it is still needed.
cat_dims = [data.columns.get_loc(i) for i in category_cols[:-1]]

for header in cat_features:
    # .cat.codes replaces each distinct value with an integer; missing values become -1
    data[header] = data[header].astype('category').cat.codes

X = data.drop(LABEL, axis=1)
Y = data[LABEL]

# Fixed seed so the split (and everything trained on it) is reproducible
# after Restart Kernel -> Run All.
SPLIT_SEED = 42
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=SPLIT, random_state=SPLIT_SEED
)

train_pool = cb.Pool(train_x, label=train_y, cat_features=cat_features)
test_pool = cb.Pool(test_x, label=test_y, cat_features=cat_features)

In [None]:
# Choose the training loss and the list of metrics to report for the configured TYPE.
# NOTE: `eval` shadows the Python builtin of the same name; the name is kept because
# later cells read it.
if TYPE == 'regression':
    loss = 'MAE'
    eval = ['RMSE', 'MAPE', 'MAE', 'R2']
elif TYPE == 'classification':
    loss = 'Logloss'
    eval = ['Logloss', 'CrossEntropy', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC']
elif TYPE == 'multiclass':
    loss = 'MultiClass'
    eval = ['MultiClass', 'MultiClassOneVsAll', 'Precision', 'Recall', 'F1', 'Accuracy', 'AUC']
else:
    # Fail here with a clear message instead of a NameError on `loss`/`eval` in a later cell.
    raise ValueError(
        "Unsupported TYPE {!r}; expected 'regression', 'classification' or 'multiclass'".format(TYPE)
    )

# Candidate values sampled by the randomized hyper-parameter search.
# NOTE(review): confirm that 'ctr_border_count' is accepted by the installed CatBoost version.
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[50,100,250,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,50],
          'border_count':[32,5,10,20,50,100,200],
          'ctr_border_count':[50,5,10,20,100,200]
          }

In [None]:
# Build the base model and run a randomized hyper-parameter search over `params`.
#
# CatBoost's `eval_metric` takes a single metric, so the first entry of `eval` is used
# there; the remaining entries are passed through `custom_metric`, which accepts a list
# of additional metrics to compute.
model = cb.CatBoost({
            'thread_count': -1,
            'task_type': TASK,
            'loss_function': loss,
            'eval_metric': eval[0],
            'custom_metric': eval[1:],
})

# Samples 100 parameter combinations from `params`, using 3-fold cross-validation.
search_result = model.randomized_search(params, train_pool, n_iter=100, cv=3, plot=True)

In [None]:
# Display the 'cv_results' entry of the value returned by model.randomized_search.
search_result['cv_results']

In [None]:
# Keep the 'params' entry of the search result; a later cell passes it to model.set_params.
best = search_result['params']
# Bare expression so the notebook displays the selected parameters.
best

In [None]:
# Apply the parameters selected by the search, then train on the training pool.
model.set_params(**best)
model.fit(X=train_pool, plot=True)

In [None]:
# Score the trained model on the held-out pool for every metric listed in `eval`.
# eval_metrics returns a mapping of metric name -> list of values (one per evaluated
# iteration), so iterate its items rather than zipping against `eval`.
results = model.eval_metrics(test_pool, eval)
for metric_name, values in results.items():
    # Report the value at the last evaluated iteration, i.e. for the full model.
    print(metric_name, values[-1])