# IMPORTS

In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds, compute_weird_pred_proba_score
from sklearn.metrics import recall_score
from helper import make_prediction, check_path
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
import optuna
from optuna.integration import CatBoostPruningCallback

In [4]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

# MODEL TRAINING

In [5]:
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [6]:
X_train, Y_train = train.drop(cfg.TARGETS, axis=1), train[cfg.TARGETS]

In [7]:
pred_proba_oof = pd.DataFrame(data=np.zeros(shape=(len(train), len(cfg.TARGETS))), index=train.index, columns=cfg.TARGETS)
pred_proba_test = pd.DataFrame(data=np.zeros(shape=(len(test), len(cfg.TARGETS))), index=test.index, columns=cfg.TARGETS)
metrics = {}

In [8]:
EXPERIMENT_FAMILY_NAME = 'catboost'
EXPERIMENT_NAME = 'depthwise_optuna'
RANDOM_STATE = 77
N_SPLITS = 5
N_RANDOM_SEEDS = 7

In [9]:
CAT_COLS = cfg.CAT_UNORDERED_COLS

In [10]:
train_x, valid_x, train_y, valid_y = train_test_split(X_train, Y_train, test_size=0.35, random_state=RANDOM_STATE, shuffle=True)

In [11]:
def objective(trial: optuna.Trial) -> float:

    param = {
        "objective": trial.suggest_categorical("objective", ["MultiLogloss", "MultiCrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.5, 10.0, log=False),
        "depth": trial.suggest_int("depth", 1, 12),
        # "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "cat_features": CAT_COLS,
        # "bootstrap_type": trial.suggest_categorical(
            # "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        # ),
        # "used_ram_limit": "3gb",
        "grow_policy": "Depthwise",
        "eval_metric": "MultiLogloss",
    }

    # if param["bootstrap_type"] == "Bayesian":
    #     param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    # elif param["bootstrap_type"] == "Bernoulli":
    #     param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=False)

    gbm = cb.CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    preds = gbm.predict_proba(valid_x)
    
    score = compute_weird_pred_proba_score(valid_y, preds, sub_std=True)
    return score

In [12]:
study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=25), direction="maximize"
    )
study.optimize(objective, n_trials=500, timeout=600)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-07-03 20:13:31,794][0m A new study created in memory with name: no-name-e2cf0a66-349d-492b-a97a-ff673cc811d8[0m


: 

: 

In [None]:
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# CAT_COLS = cfg.CAT_UNORDERED_COLS

test_pool = cb.Pool(
        data=test,
        cat_features=CAT_COLS)


fold = 0
for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

    train_pool = cb.Pool(
        data=X_train.iloc[train_idx], 
        label=Y_train.iloc[train_idx],
        cat_features=CAT_COLS)

    val_pool = cb.Pool(
        data=X_train.iloc[val_idx], 
        label=Y_train.iloc[val_idx],
        cat_features=CAT_COLS)
        
    for random_seed in tqdm(range(N_RANDOM_SEEDS), total=N_RANDOM_SEEDS):

        clf = cb.CatBoostClassifier(
            **trial.params,
            n_estimators=2000,
            random_seed=random_seed,
            early_stopping_rounds=100,
            silent=True
        )

        clf.fit(train_pool, eval_set=val_pool, plot=False)
        
        model_name = f'{EXPERIMENT_NAME}_fold_{fold}_rs_{random_seed}.cbm'
        model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
        check_path(model_path)
        clf.save_model(os.path.join(model_path, model_name))
        
        pred_proba_oof.iloc[val_idx, :] += clf.predict_proba(val_pool)
        pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)
        del clf; gc.collect()
    del train_pool,val_pool; gc.collect() 
        
    fold += 1
pred_proba_oof /= N_RANDOM_SEEDS
pred_proba_test /= (N_SPLITS * N_RANDOM_SEEDS)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

# PREDICT AND SAVE PREDICTIONS

In [None]:
tresholds = get_tresholds(train[cfg.TARGETS], pred_proba_oof)
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')
submission = make_prediction(pred_proba_test, tresholds, sample_submission)

[0.7108556461001164, 0.720034473614426, 0.6076083174993978, 0.6554944761701447, 0.7346176887769036]
0.6857221204321977 0.047360922675579765


In [None]:
## BEST PARAMS
# 0.6880135426726085
# 0.6857221204321977

In [None]:
submission_path = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
check_path(submission_path)
submission.to_csv(os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv'))

pred_proba_oof_path = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_oof_path)
pred_proba_oof.to_pickle(os.path.join(pred_proba_oof_path, f'{EXPERIMENT_NAME}.pkl'))

pred_proba_test_path = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_test_path)
pred_proba_test.to_pickle(os.path.join(pred_proba_test_path, f'{EXPERIMENT_NAME}.pkl'))