# IMPORTS

In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from typing import Optional, Any, Union, Callable

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds, compute_weird_pred_proba_score, compute_weird_pred_score
from sklearn.metrics import recall_score
from helper import make_prediction, check_path, get_prediction, evaluate, seed_everything, save_submission, save_pred_proba_oof, save_pred_proba_test
from collections import defaultdict

In [3]:
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [4]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

In [5]:
# Single seed reused below by the CV splitter, the CatBoost models and the saved model file names.
RANDOM_STATE = 77
# seed_everything comes from the project's `helper` module; presumably it seeds the global RNGs — confirm there.
seed_everything(RANDOM_STATE)

# MODEL TRAINING

In [6]:
# Load the prepared train/test frames from the paths configured in `config`.
# NOTE(review): unpickling can execute arbitrary code — only read pickles produced by a trusted pipeline.
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [7]:
# Split the prepared training frame: every non-target column is a feature,
# the columns listed in cfg.TARGETS are the labels.
X_train = train.drop(columns=cfg.TARGETS)
Y_train = train[cfg.TARGETS]

In [8]:
# Identify this experiment; both names are passed to get_prediction and to the save_* helpers below.
EXPERIMENT_FAMILY_NAME = 'catboost'
EXPERIMENT_NAME = 'extcv'
# Fold count for the outer MultilabelStratifiedKFold and for the final get_prediction run on the test set.
N_SPLITS = 8

In [9]:
# Columns declared to CatBoost as categorical features (see `cat_features` in get_cat_pool).
CAT_COLS = cfg.CAT_UNORDERED_COLS

In [10]:
def get_cat_pool(data: pd.DataFrame, labels: Optional[pd.DataFrame]=None, shuffle: bool=True, batch_size: int=32):
    """Wrap a feature frame (and optional labels) in a CatBoost ``Pool``.

    Args:
        data: Feature frame handed to the pool.
        labels: Target frame, or ``None`` when building a pool for inference.
        shuffle: Accepted but not used by this implementation.
        batch_size: Accepted but not used by this implementation.

    Returns:
        A ``cb.Pool`` whose categorical features are the module-level ``CAT_COLS``.
    """
    pool = cb.Pool(data=data, label=labels, cat_features=CAT_COLS)
    return pool

In [11]:
def get_cat_model(train):
    """Build an unfitted CatBoost classifier with this experiment's settings.

    Args:
        train: Accepted for interface compatibility; not used when building the model.

    Returns:
        A new ``cb.CatBoostClassifier`` using the ``MultiLogloss`` loss and
        seeded with the module-level ``RANDOM_STATE``.
    """
    # Options that were tried and are currently disabled:
    #   custom_metric=['Recall', 'F1']
    #   eval_metric='F:beta=2'
    #   grow_policy='Lossguide'
    params = {
        'loss_function': 'MultiLogloss',
        'iterations': 2000,
        'silent': True,
        'depth': 9,
        'l2_leaf_reg': 2.0,
        'learning_rate': 0.001,
        'early_stopping_rounds': 100,
        'random_seed': RANDOM_STATE,
    }
    return cb.CatBoostClassifier(**params)

In [12]:
def fit_cat_model(train, val):
    """Create a fresh classifier and fit it on ``train``, evaluating on ``val``.

    Args:
        train: Training data passed to ``fit`` (and to ``get_cat_model``).
        val: Data passed to ``fit`` as ``eval_set``.

    Returns:
        The fitted classifier.
    """
    classifier = get_cat_model(train)
    # plot=False keeps CatBoost from rendering its training plot in the notebook.
    classifier.fit(train, eval_set=val, plot=False)
    return classifier

In [13]:
def save_cat_model(model: Any, experiment_name: str, experiment_family_name: str, fold: int, suffix='') -> None:
    """Save a fitted model under ``<MODELS_PATH>/<family>/<experiment>/``.

    The file is named ``<experiment>_fold_<fold>_rs_<RANDOM_STATE>[_<suffix>].cbm``.

    Args:
        model: Object exposing ``save_model(path)``.
        experiment_name: Experiment name; used for the sub-directory and the file name.
        experiment_family_name: Experiment family; used for the parent directory.
        fold: Fold number embedded in the file name.
        suffix: Optional tag appended to the file name (prefixed with ``_``).
    """
    tail = f'_{suffix}' if suffix else ''
    model_name = f'{experiment_name}_fold_{fold}_rs_{RANDOM_STATE}{tail}.cbm'

    # Build the directory from the arguments rather than the notebook globals,
    # so the directory always matches the experiment named in the file.
    model_dir = os.path.join(cfg.MODELS_PATH, experiment_family_name, experiment_name)
    check_path(model_dir)
    model.save_model(os.path.join(model_dir, model_name))

In [14]:
def predict_with_cat_model(model, data) -> np.ndarray:
    """Return ``model.predict_proba(data)`` unchanged.

    Args:
        model: Fitted model exposing ``predict_proba``.
        data: Input accepted by the model's ``predict_proba``.

    Returns:
        Whatever ``predict_proba`` returns for ``data``.
    """
    probabilities = model.predict_proba(data)
    return probabilities

In [15]:
# NOTE(review): `weird_scores` is never read in the visible cells — candidate for removal.
weird_scores = []

# Outer cross-validation splitter, stratified on the multi-column target frame.
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# Holders for the held-out predictions, aligned with Y_train's index and columns;
# each outer fold fills in the rows of its validation part.
pred_int = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.int32)
pred_proba = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.float32)
for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

    # Zero-filled copy of the held-out labels, passed to get_prediction as the prediction template.
    pred_template = Y_train.iloc[val_idx].copy()
    pred_template.iloc[:, :] = 0
    
    # The held-out part of the outer fold is passed as `test_data`, the rest as training data.
    # NOTE(review): n_splits=3 is hard-coded here (the final run below uses N_SPLITS);
    # presumably it is the inner fold count used by get_prediction — confirm in helper.py.
    # NOTE(review): the returned `pred_proba_oof` is not used inside this loop.
    prediction, pred_proba_oof, pred_proba_test = get_prediction(
        train_data=X_train.iloc[train_idx],
        train_labels=Y_train.iloc[train_idx],
        test_data=X_train.iloc[val_idx],
        pred_template=pred_template,
        process_input=get_cat_pool,
        save_model=save_cat_model,
        fit_model=fit_cat_model,
        predict=predict_with_cat_model,
        n_splits=3,
        random_state=RANDOM_STATE,
        experiment_name=EXPERIMENT_NAME,
        experiment_family_name=EXPERIMENT_FAMILY_NAME,
        suffix='eval',
        rename_cols=False,
        )
    
    # Store this fold's held-out predictions in the matching rows.
    pred_int.iloc[val_idx] = prediction
    pred_proba.iloc[val_idx] = pred_proba_test

# Score the assembled held-out predictions against the true labels; `metrics` is printed in a later cell.
metrics = evaluate(
    test_labels=Y_train,
    prediction=pred_int.astype(np.int32), 
    pred_proba_test=pred_proba
    )


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[0.6533371327254306, 0.6579312924079714, 0.5960665790626369, 0.6789548778312824, 0.7246521789122686]
0.662188412187918 0.04159663822078797


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6976253883710608, 0.6960183564114807, 0.6127631578947368, 0.6563412599082186, 0.7254825698645924]
0.6776461464900179 0.039214109721195406


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6895964111343605, 0.6690875531658775, 0.6266035642465848, 0.6403142817410652, 0.7269086718524921]
0.670502096428076 0.03573984001396853


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6375546889861138, 0.6784300350426664, 0.6040350877192983, 0.6539598108747045, 0.7110198789974072]
0.6569999003240381 0.036249244218387516


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6928508105369808, 0.7026232836458834, 0.5903460359176522, 0.67125, 0.7323744935199288]
0.677888924724089 0.04798365648306646


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6798062310030395, 0.6734327606988172, 0.6000000000000001, 0.6544444444444444, 0.7155263082272479]
0.6646419488747097 0.037892011038313994


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6787297365754813, 0.6740663567879468, 0.6144809461235217, 0.6441666666666667, 0.7254455559388295]
0.6673778524184891 0.037100677611936826


  0%|          | 0/3 [00:00<?, ?it/s]

[0.6966818642350557, 0.7009231910356428, 0.6161629434954008, 0.6768055555555557, 0.7168046452799817]
0.6814756399203274 0.03505747601942794
[0, 0, 0, 0, 0]
0.0 0.0
TEST METRICS
defaultdict(<class 'list'>, {'weird_score': 0.0, 'oof_auc': 0.6898779035706086, 'oof_logloss': 1.0376437878403386})


In [21]:
# Show the metrics returned by evaluate() for the outer cross-validation.
# NOTE(review): in the saved notebook this cell has execution count 21 (out of sequence) and the
# weird_score shown here differs from the one in the evaluate() cell's own output — re-check
# with Restart & Run All.
print('OVERALL METRICS', metrics, sep='\n')

OVERALL METRICS
defaultdict(<class 'list'>, {'weird_score': 0.6587770960259309, 'oof_auc': 0.6898779035706086, 'oof_logloss': 1.0376437878403386})


In [17]:
# [0.632011421731218, 0.6490860706862861, 0.661385903220677, 0.6271449441761077, 0.6818099910552275]
# 0.6502876661739032 0.01993929966464236

In [18]:
# The sample submission, indexed by 'ID', serves as the template for the test-set prediction.
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')

# Final run: train on the full training set and predict the real test set.
# Uses N_SPLITS folds and an empty suffix, unlike the evaluation loop above (3 folds, suffix 'eval').
# NOTE(review): this rebinds `prediction`, `pred_proba_oof` and `pred_proba_test` from the evaluation loop.
prediction, pred_proba_oof, pred_proba_test = get_prediction(
    train_data=X_train,
    train_labels=Y_train,
    test_data=test,
    pred_template=sample_submission,
    process_input=get_cat_pool,
    save_model=save_cat_model,
    fit_model=fit_cat_model,
    predict=predict_with_cat_model,
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
    experiment_name=EXPERIMENT_NAME,
    experiment_family_name=EXPERIMENT_FAMILY_NAME,
    suffix=''
    )

  0%|          | 0/8 [00:00<?, ?it/s]

[0.6823644256887854, 0.676993452053118, 0.6079963604249738, 0.6884773442920424, 0.7266424097192243]
0.6764947984356288 0.038432021836152276


In [19]:
# Persist the outputs of the final run via the project's helper functions:
# the submission, the out-of-fold probabilities and the test probabilities.
save_submission(prediction, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_oof(pred_proba_oof, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_test(pred_proba_test, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)