In [1]:
import sys
sys.path.insert(0, "..")
import config as cfg

In [2]:
import os
import re

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# NOTE(review): the wildcard import is kept for compatibility; it is deliberately
# left ahead of the explicit imports below so that they keep taking precedence.
from pycaret.classification import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from metrics import compute_single_col_score, get_tresholds
from helper import make_prediction

In [3]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu
import hyperopt

In [4]:
# Load the prepared train / test frames from the pickle paths declared in the config.
# NOTE(review): pickles execute code on load - only read files produced by this project.
train, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (cfg.PREPARED_TRAIN_DATA_PATH, cfg.PREPARED_TEST_DATA_PATH)
)

# FEATURE GENERATION

In [5]:
def generate_cat_col_pairs(cols=None):
    """Yield every unordered pair of categorical columns plus a combined name.

    Args:
        cols: Iterable of column names to pair up. When omitted, the columns
            listed in ``cfg.CAT_UNORDERED_COLS`` are used, which matches the
            behaviour of the original zero-argument call.

    Yields:
        ``(c1, c2, f'{c1}_{c2}')`` for every ``c1`` that precedes ``c2`` in
        ``cols``. Fewer than two columns yields nothing.
    """
    if cols is None:
        cols = cfg.CAT_UNORDERED_COLS
    # Materialise once so that any iterable (not only an indexable list) works.
    cols = list(cols)
    for pos, c1 in enumerate(cols):
        # Only pair with later columns, so each unordered pair appears once.
        for c2 in cols[pos + 1:]:
            yield c1, c2, f'{c1}_{c2}'

In [6]:
def add_paired_categories(data: pd.DataFrame) -> pd.DataFrame:
    """Add one categorical column per pair yielded by ``generate_cat_col_pairs``.

    Each new column contains the two source values cast to strings and joined
    with ``' | '``. ``data`` is modified in place and is also returned.
    """
    for left, right, combined in generate_cat_col_pairs():
        joined = data[left].astype('str') + ' | ' + data[right].astype('str')
        data[combined] = joined.astype('category')
    return data

In [7]:
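# Paired categorical features are switched off for this run: the score noted
# further down was obtained "without generated cat features". Uncomment to enable.
# train = add_paired_categories(train)
# test = add_paired_categories(test)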

In [8]:
# BI_UNORDERED_CAT_COLS = [paired_col for c1, c2, paired_col in generate_cat_col_pairs()]

In [9]:
# Separate the target columns from the features of the training frame.
Y_train = train[cfg.TARGETS]
X_train = train.drop(columns=cfg.TARGETS)

In [10]:
# Zero-filled float frames (one column per target) that the CV loop fills in:
# out-of-fold probabilities for the train rows and accumulated probabilities
# for the test rows.
pred_proba_oof = pd.DataFrame(0.0, index=train.index, columns=cfg.TARGETS)
pred_proba_test = pd.DataFrame(0.0, index=test.index, columns=cfg.TARGETS)
# Not populated anywhere in the visible cells.
metrics = dict()

In [11]:
# Sub-directory used under the model / submission / prediction output paths.
EXPERIMENT_FAMILY_NAME = 'catboost'
# Base name of the files written for this run (models, submission, predictions).
EXPERIMENT_NAME = 'baseline'
# Seed passed to the cross-validation splitter.
RANDOM_STATE = 77
# Number of CV folds; also the divisor used to average the test predictions.
N_SPLITS = 5

In [12]:
# Cross-validated training: fit one CatBoost multi-label model per fold, store
# its out-of-fold probabilities and accumulate its test probabilities.
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

test_pool = cb.Pool(
        data=test,
        cat_features=cfg.CAT_UNORDERED_COLS)

# Create the model directory up front so save_model does not fail on a fresh checkout.
model_dir = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
os.makedirs(model_dir, exist_ok=True)

# Reset the accumulator: the loop adds to it with `+=`, so without this a second
# run of the cell would add onto the previous (already averaged) predictions.
pred_proba_test.iloc[:, :] = 0.0

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, Y_train)):

    train_pool = cb.Pool(
        data=X_train.iloc[train_idx],
        label=Y_train.iloc[train_idx],
        cat_features=cfg.CAT_UNORDERED_COLS)

    val_pool = cb.Pool(
        data=X_train.iloc[val_idx],
        label=Y_train.iloc[val_idx],
        cat_features=cfg.CAT_UNORDERED_COLS)

    clf = cb.CatBoostClassifier(
        loss_function='MultiLogloss',
        custom_metric=['Recall', 'F1'],
        iterations=1000,
        silent=True,
        depth=6,
        l2_leaf_reg=2.0,
        learning_rate=0.01,
        early_stopping_rounds=100
    )

    # NOTE(review): the validation fold drives early stopping and is also the
    # fold the OOF predictions are made on, so OOF scores may be optimistic.
    clf.fit(train_pool, eval_set=val_pool, metric_period=10, plot=False)

    model_path = os.path.join(model_dir, f'{EXPERIMENT_NAME}_fold_{fold}.cbm')
    clf.save_model(model_path)

    # Each train row belongs to exactly one validation fold, so this fills every OOF row once.
    pred_proba_oof.iloc[val_idx, :] = clf.predict_proba(val_pool)
    pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)

# Average the per-fold test predictions.
pred_proba_test /= N_SPLITS



In [13]:
# Derive thresholds from the out-of-fold probabilities, then use them to turn
# the averaged test probabilities into a submission indexed like the sample file.
y_true = train[cfg.TARGETS]
tresholds = get_tresholds(y_true, pred_proba_oof)

sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH)
sample_submission = sample_submission.set_index('ID')

submission = make_prediction(pred_proba_test, tresholds, sample_submission)

0.6855817004516904


In [14]:
# 0.6855817004516904 - without generated cat features

# iterations=1000,
# silent=True,
# depth=6,
# l2_leaf_reg=2.0,
# learning_rate=0.01,
# early_stopping_rounds=100

In [15]:
# Persist the submission plus the raw out-of-fold and test probabilities.
name = f'{EXPERIMENT_NAME}.csv'

submission_dir = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
oof_pred_dir = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
test_pred_dir = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)

# The writers below do not create missing parent directories, so do it here.
for out_dir in (submission_dir, oof_pred_dir, test_pred_dir):
    os.makedirs(out_dir, exist_ok=True)

submission.to_csv(os.path.join(submission_dir, name))
# NOTE(review): these two files are pickles but are written under a '.csv' name;
# the name is kept unchanged so existing readers of these paths keep working.
pred_proba_oof.to_pickle(os.path.join(oof_pred_dir, name))
pred_proba_test.to_pickle(os.path.join(test_pred_dir, name))