In [1]:
# Put ".." first on the import path before importing `config`
# (presumably config.py lives one directory up — TODO confirm).
# NOTE: ".." is resolved against the kernel's working directory.
import sys
sys.path.insert(0, "..")
import config as cfg

In [2]:
import os
import re

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from pycaret.classification import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from metrics import compute_single_col_score, get_tresholds
from helper import make_prediction

In [3]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu
import hyperopt

In [4]:
# Read the prepared train and test frames from the pickle paths configured in `cfg`.
train, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (cfg.PREPARED_TRAIN_DATA_PATH, cfg.PREPARED_TEST_DATA_PATH)
)

In [5]:
# Separate the target columns (cfg.TARGETS) from the remaining feature columns.
Y_train = train[cfg.TARGETS]
X_train = train.drop(columns=cfg.TARGETS)

In [6]:
# Zero-filled float frames, one column per target, that will receive the
# out-of-fold (train-indexed) and test-indexed predicted probabilities.
pred_proba_oof = pd.DataFrame(0.0, index=train.index, columns=cfg.TARGETS)
pred_proba_test = pd.DataFrame(0.0, index=test.index, columns=cfg.TARGETS)
metrics = dict()

In [7]:
# Experiment settings used by the training and saving cells below.
EXPERIMENT_FAMILY_NAME = 'catboost'  # sub-directory joined onto the cfg model/submission/prediction paths
EXPERIMENT_NAME = 'baseline'  # file-name stem for the saved models and prediction files
RANDOM_STATE = 77  # seed passed to the cross-validation splitter
N_SPLITS = 5  # number of CV folds; also the divisor when averaging test predictions

In [8]:
# Cross-validated CatBoost multi-label training.
#
# For every fold a classifier is fitted on the training part, saved to disk,
# used to fill the validation rows of `pred_proba_oof`, and used to predict the
# test set. Test probabilities are averaged over the folds into `pred_proba_test`.
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# The test pool does not change between folds, so build it once.
test_pool = cb.Pool(
    data=test,
    cat_features=cfg.CAT_UNORDERED_COLS)

# Create the model directory up front so saving does not depend on it
# already existing (e.g. on a fresh checkout).
model_dir = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME)
os.makedirs(model_dir, exist_ok=True)

# Accumulate test probabilities in a local array instead of `pred_proba_test +=`,
# so re-running this cell does not stack onto values left by a previous run.
test_proba_sum = np.zeros(pred_proba_test.shape)

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, Y_train)):

    train_pool = cb.Pool(
        data=X_train.iloc[train_idx],
        label=Y_train.iloc[train_idx],
        cat_features=cfg.CAT_UNORDERED_COLS)

    val_pool = cb.Pool(
        data=X_train.iloc[val_idx],
        label=Y_train.iloc[val_idx],
        cat_features=cfg.CAT_UNORDERED_COLS)

    clf = cb.CatBoostClassifier(
        loss_function='MultiLogloss',
        custom_metric=['Recall', 'F1'],
        iterations=500,
    )

    # The validation pool is passed as eval_set, so it drives the
    # "test"/"best" columns of the training log.
    clf.fit(train_pool, eval_set=val_pool, metric_period=10, plot=False, verbose=50)

    model_path = os.path.join(model_dir, f'{EXPERIMENT_NAME}_fold_{fold}.cbm')
    clf.save_model(model_path)

    # Each row is written by the one fold whose validation indices contain it.
    pred_proba_oof.iloc[val_idx, :] = clf.predict_proba(val_pool)
    test_proba_sum += clf.predict_proba(test_pool)

# Mean of the per-fold test probabilities, written into the existing frame
# so its index and columns are kept.
pred_proba_test.iloc[:, :] = test_proba_sum / N_SPLITS

        

Learning rate set to 0.040085
0:	learn: 0.6607723	test: 0.6607476	best: 0.6607476 (0)	total: 75ms	remaining: 37.4s
50:	learn: 0.3241376	test: 0.3418658	best: 0.3418658 (50)	total: 1.48s	remaining: 13s
100:	learn: 0.2801015	test: 0.3348544	best: 0.3347007 (90)	total: 3.83s	remaining: 15.1s
150:	learn: 0.2509167	test: 0.3358292	best: 0.3347007 (90)	total: 6.23s	remaining: 14.4s
200:	learn: 0.2138397	test: 0.3371404	best: 0.3347007 (90)	total: 8.87s	remaining: 13.2s
250:	learn: 0.1809710	test: 0.3384046	best: 0.3347007 (90)	total: 11.5s	remaining: 11.4s
300:	learn: 0.1569166	test: 0.3399848	best: 0.3347007 (90)	total: 14.1s	remaining: 9.33s
350:	learn: 0.1381803	test: 0.3447844	best: 0.3347007 (90)	total: 16.7s	remaining: 7.07s
400:	learn: 0.1226959	test: 0.3490493	best: 0.3347007 (90)	total: 19.2s	remaining: 4.74s
450:	learn: 0.1084380	test: 0.3528194	best: 0.3347007 (90)	total: 21.8s	remaining: 2.37s
499:	learn: 0.0973348	test: 0.3573515	best: 0.3347007 (90)	total: 24.3s	remaining: 0us


In [9]:
# Load the sample submission indexed by its 'ID' column, derive thresholds from
# the true targets and the out-of-fold probabilities, then build the submission
# from the averaged test probabilities.
# NOTE(review): the number shown in this cell's output is presumably printed by
# get_tresholds — confirm in metrics.py.
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH, index_col='ID')
tresholds = get_tresholds(train[cfg.TARGETS], pred_proba_oof)
submission = make_prediction(
    pred_proba_test,
    tresholds,
    sample_submission,
)

0.6834085155351219


In [None]:
# Score printed by the threshold cell above for the catboost baseline: 0.6834085155351219

In [10]:
# Persist the submission CSV plus the out-of-fold and test probability frames,
# each under <cfg path>/<EXPERIMENT_FAMILY_NAME>/<EXPERIMENT_NAME>.csv.
name = f'{EXPERIMENT_NAME}.csv'

submission_dir = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
oof_dir = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
test_pred_dir = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)

# Create the output directories so the writes below do not depend on them
# already existing.
for out_dir in (submission_dir, oof_dir, test_pred_dir):
    os.makedirs(out_dir, exist_ok=True)

submission.to_csv(os.path.join(submission_dir, name))

# NOTE(review): these two files are pickles written under a `.csv` name. The
# name is kept unchanged because other code may load them by this exact path;
# read them back with pd.read_pickle, not pd.read_csv.
pred_proba_oof.to_pickle(os.path.join(oof_dir, name))
pred_proba_test.to_pickle(os.path.join(test_pred_dir, name))