# IMPORTS

In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from typing import Optional, Any, Union, Callable

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds, compute_weird_pred_proba_score, compute_weird_pred_score
from sklearn.metrics import recall_score
from helper import make_prediction, check_path, get_prediction, evaluate, seed_everything, save_submission, save_pred_proba_oof, save_pred_proba_test
from collections import defaultdict

In [3]:
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [4]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

In [5]:
RANDOM_STATE = 77
seed_everything(RANDOM_STATE)

# MODEL TRAINING

In [6]:
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [7]:
X_train, Y_train = train.drop(cfg.TARGETS, axis=1), train[cfg.TARGETS]

In [8]:
EXPERIMENT_FAMILY_NAME = 'catboost'
EXPERIMENT_NAME = 'extcv'
N_SPLITS = 7

In [9]:
CAT_COLS = cfg.CAT_UNORDERED_COLS

In [10]:
def get_cat_pool(data: pd.DataFrame, labels: Optional[pd.DataFrame]=None, shuffle: bool=True, batch_size: int=32):
  return cb.Pool(
    data=data, 
    label=labels,
    cat_features=CAT_COLS)

In [11]:
def get_cat_model(train, iterations: int = None):
    return cb.CatBoostClassifier(
            loss_function='MultiLogloss',
            # custom_metric=['Recall', 'F1'],
            # eval_metric='F:beta=2',
            # grow_policy='Lossguide',
            iterations=2000 if iterations is None else iterations,
            early_stopping_rounds=100 if iterations is None else None,
            silent=True,
            depth=3,
            l2_leaf_reg=2.0,
            learning_rate=0.001,
            random_seed=RANDOM_STATE
        )

In [12]:
def fit_cat_model(train, val, iterations: int = None):
    model = get_cat_model(train, iterations)
    model.fit(train, eval_set=val, plot=False)
    best_iter = model.get_best_iteration()
    return model, best_iter

In [13]:
def save_cat_model(model: Any, experiment_name: str, experiment_family_name: str, fold: int, suffix='') -> None:
    if suffix:
        suffix = '_' + suffix

    model_name = f'{experiment_name}_fold_{fold}_rs_{RANDOM_STATE}{suffix}.cbm'
    model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
    check_path(model_path)
    model.save_model(os.path.join(model_path, model_name))

In [14]:
def predict_with_cat_model(model, data) -> np.ndarray:
    return model.predict_proba(data)

In [15]:
weird_scores = []

cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

pred_int = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.int32)
pred_proba = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.float32)
best_iters_global = []
for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

    pred_template = Y_train.iloc[val_idx].copy()
    pred_template.iloc[:, :] = 0
    
    prediction, pred_proba_oof, pred_proba_test, best_iters = get_prediction(
        train_data=X_train.iloc[train_idx],
        train_labels=Y_train.iloc[train_idx],
        test_data=X_train.iloc[val_idx],
        pred_template=pred_template,
        process_input=get_cat_pool,
        save_model=save_cat_model,
        fit_model=fit_cat_model,
        predict=predict_with_cat_model,
        n_splits=3,
        random_state=RANDOM_STATE,
        experiment_name=EXPERIMENT_NAME,
        experiment_family_name=EXPERIMENT_FAMILY_NAME,
        suffix='eval',
        rename_cols=False,
        )
    
    pred_int.iloc[val_idx] = prediction
    pred_proba.iloc[val_idx] = pred_proba_test
    best_iters_global.extend(best_iters)

metrics = evaluate(
    test_labels=Y_train,
    prediction=pred_int.astype(np.int32), 
    pred_proba_test=pred_proba
    )


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[0.6777405885471899, 0.6839275766016712, 0.6138113920371985, 0.6926290822842547, 0.7280549017724194]
0.6792327082485468 0.037060184140580384


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6932867338920901, 0.6844438622135194, 0.6079539270814438, 0.7020408163265306, 0.7246179366182002]
0.6824686552263568 0.03958031932956167


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6821470665269775, 0.6900834492350487, 0.598022855069835, 0.6686224489795918, 0.7263276403094834]
0.6730406920241873 0.04209531380190474


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6452576364298243, 0.6690890125173853, 0.5936409710235411, 0.6677295918367346, 0.7223992715684042]
0.6596232966751779 0.04160811054140829


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6884444591728527, 0.690974930362117, 0.6163280316263675, 0.6660463419084108, 0.7184603006868726]
0.6760508127513242 0.03418127103307966


  0%|          | 0/3 [00:00<?, ?it/s]

[0.7118095773902109, 0.6848539638386648, 0.6297478686740432, 0.6682397959183674, 0.7194070710580229]
0.6828116553758619 0.032287395376966364


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6883947507953341, 0.6921993454575583, 0.6049658529497239, 0.6906081273088945, 0.7197692012104329]
0.6791874555443889 0.03883359288432362
[0.6653630675203726, 0.6625614507476083, 0.5967899483501485, 0.6593237978331643, 0.7139779925467151]
0.6596032513996016 0.03726695059760448
TEST METRICS
defaultdict(<class 'list'>, {'weird_score': 0.6596032513996016, 'oof_auc': 0.6875309046672902, 'oof_logloss': 1.0399101685479049})


In [16]:
print('OVERALL METRICS')
print(metrics)
print(best_iters_global)

OVERALL METRICS
defaultdict(<class 'list'>, {'weird_score': 0.6596032513996016, 'oof_auc': 0.6875309046672902, 'oof_logloss': 1.0399101685479049})
[1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1998, 1999, 1999, 1999, 1999]


In [17]:
N_ITERS = int(np.mean(best_iters_global))

In [18]:
# [0.632011421731218, 0.6490860706862861, 0.661385903220677, 0.6271449441761077, 0.6818099910552275]
# 0.6502876661739032 0.01993929966464236

In [19]:
from functools import partial


sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')

prediction, pred_proba_oof, pred_proba_test, _ = get_prediction(
    train_data=X_train,
    train_labels=Y_train,
    test_data=test,
    pred_template=sample_submission,
    process_input=get_cat_pool,
    save_model=save_cat_model,
    fit_model=partial(fit_cat_model, iterations=N_ITERS),
    predict=predict_with_cat_model,
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
    experiment_name=EXPERIMENT_NAME,
    experiment_family_name=EXPERIMENT_FAMILY_NAME,
    suffix=''
    )

  0%|          | 0/7 [00:00<?, ?it/s]

[0.685456684128832, 0.6883146686249312, 0.6084178553268927, 0.6825265517425415, 0.718023998519915]
0.6765479516686226 0.0363745440954669


In [20]:
save_submission(prediction, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_oof(pred_proba_oof, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_test(pred_proba_test, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)