# IMPORTS

In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds, compute_weird_pred_proba_score
from sklearn.metrics import recall_score
from helper import make_prediction, check_path
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
import optuna
from optuna.integration import CatBoostPruningCallback

In [4]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

# MODEL TRAINING

In [5]:
# Load the prepared train and test frames from the pickle paths declared in the project config.
# NOTE(review): unpickling can execute arbitrary code - only point these paths at trusted files.
train, test = (
    pd.read_pickle(path)
    for path in (cfg.PREPARED_TRAIN_DATA_PATH, cfg.PREPARED_TEST_DATA_PATH)
)

In [6]:
# Separate the training frame into features (everything except cfg.TARGETS) and the target columns.
X_train = train.drop(columns=cfg.TARGETS)
Y_train = train[cfg.TARGETS]

In [7]:
# Zero-filled float accumulators for predicted probabilities: one row per sample of
# train / test (same index as the source frame) and one column per entry of cfg.TARGETS.
pred_proba_oof = pd.DataFrame(0.0, index=train.index, columns=cfg.TARGETS)
pred_proba_test = pd.DataFrame(0.0, index=test.index, columns=cfg.TARGETS)
# Empty dict; it is not populated anywhere in the cells visible here.
metrics = dict()

In [8]:
# Sub-directory name used under the models / submissions / OOF / test-prediction roots.
EXPERIMENT_FAMILY_NAME = 'catboost'
# Experiment identifier: names the model sub-directory, the saved model files and the output files.
EXPERIMENT_NAME = 'baseline_optuna'
# Seed for the hold-out split and for the stratified K-fold splitter.
RANDOM_STATE = 77
# Number of cross-validation folds.
N_SPLITS = 5
# Number of CatBoost random seeds trained per fold (seeds 0 .. N_RANDOM_SEEDS - 1).
N_RANDOM_SEEDS = 7

In [9]:
# Columns handed to CatBoost as `cat_features`: the config's unordered-categorical
# columns followed by its binary columns.
CAT_COLS = cfg.CAT_UNORDERED_COLS + cfg.BINARY_COLS

In [10]:
# Shuffled hold-out split used by the Optuna objective: 65% of the rows to fit, 35% to validate.
train_x, valid_x, train_y, valid_y = train_test_split(
    X_train,
    Y_train,
    test_size=0.35,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [11]:
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: fit one CatBoost model on the hold-out split and score it.

    Samples a CatBoost configuration from ``trial``, trains it on
    ``train_x`` / ``train_y`` with early stopping against ``valid_x`` / ``valid_y``,
    and scores the predicted validation probabilities.

    Args:
        trial: Optuna trial used to sample hyperparameters and, through
            ``CatBoostPruningCallback``, to report intermediate values.

    Returns:
        The value of ``compute_weird_pred_proba_score(valid_y, proba, sub_std=False)``.

    Raises:
        optuna.TrialPruned: from ``check_pruned()`` when the pruner stopped the trial.
    """
    # Search space. The suggest_* calls are kept in their original order.
    params = {
        "objective": trial.suggest_categorical("objective", ["MultiLogloss", "MultiCrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.5, 10.0, log=False),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        # Disabled: "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
        "cat_features": CAT_COLS,
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        # Disabled: "used_ram_limit": "3gb",
        "eval_metric": "MultiLogloss",
    }

    # Extra knob that only exists for some bootstrap types ("MVS" adds nothing).
    bootstrap = params["bootstrap_type"]
    if bootstrap == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    model = cb.CatBoostClassifier(**params)

    # NOTE(review): this callback reports the validation "MultiLogloss" (lower is better),
    # while the study in this notebook is created with direction="maximize" on a different
    # score. Confirm that the pruner compares the reported values the way you intend.
    pruner_hook = CatBoostPruningCallback(trial, "MultiLogloss")
    model.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        early_stopping_rounds=100,
        callbacks=[pruner_hook],
        verbose=0,
    )

    # The callback only stops training; pruning has to be triggered explicitly afterwards.
    pruner_hook.check_pruned()

    valid_proba = model.predict_proba(valid_x)
    return compute_weird_pred_proba_score(valid_y, valid_proba, sub_std=False)

In [12]:
# Hyperparameter search. The sampler is seeded so the sequence of suggested
# configurations is repeatable; TPESampler is the sampler Optuna uses by default for
# single-objective studies, so only the seed is new here.
# The search stops at 500 trials or after 600 seconds, whichever comes first, so the
# number of completed trials can still differ between runs.
# NOTE(review): the objective's pruning callback reports MultiLogloss (lower is better)
# while this study maximises the returned score - confirm the MedianPruner behaves as intended.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=25),
    direction="maximize",
)
study.optimize(objective, n_trials=500, timeout=600)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
# `trial` stays bound at notebook level: the training cell reads `trial.params`.
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2022-07-05 22:30:30,978][0m A new study created in memory with name: no-name-9efaf213-19ad-4cdb-875a-47016788860e[0m
  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:45,486][0m Trial 0 finished with value: 0.699271641685009 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.0310012036639174, 'l2_leaf_reg': 6.279216967487989, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.3664815640526435}. Best is trial 0 with value: 0.699271641685009.[0m


[0.7186438879456706, 0.7510429892980228, 0.5926064227035102, 0.6994791666666667, 0.7345857418111754]
0.699271641685009 0.05600027505925528


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:57,441][0m Trial 1 finished with value: 0.7040503979233936 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.05141636576278381, 'l2_leaf_reg': 3.918983203905896, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2272536675785576}. Best is trial 1 with value: 0.7040503979233936.[0m


[0.7386990662139219, 0.7575730092508617, 0.5776699029126213, 0.7192708333333333, 0.72703917790623]
0.7040503979233936 0.06449399245814065


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:31:13,805][0m Trial 2 finished with value: 0.7088401592713843 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.040270597113539995, 'l2_leaf_reg': 8.933759613466608, 'depth': 2, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 2 with value: 0.7088401592713843.[0m


[0.7397071307300509, 0.7375294757845093, 0.5779810804082649, 0.7432291666666666, 0.7457539427674302]
0.7088401592713843 0.06549072693374268


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:31:28,477][0m Trial 3 finished with value: 0.7172674453252093 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.06387018504495565, 'l2_leaf_reg': 3.774589645137141, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7268675721561969, 0.7428351169961909, 0.5992656211102813, 0.7864583333333334, 0.7309105830300435]
0.7172674453252093 0.06267984201907784


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:31:50,385][0m Trial 4 finished with value: 0.7008705866837251 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.014760073807796035, 'l2_leaf_reg': 6.961532017126388, 'depth': 2, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.3588257096554078}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7176358234295416, 0.7392073281335027, 0.5664675130694548, 0.7348958333333333, 0.7461464354527938]
0.7008705866837251 0.06785659142105444


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:32:18,033][0m Trial 5 finished with value: 0.707254218816135 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.023338869055540997, 'l2_leaf_reg': 5.736980369995456, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7495755517826825, 0.7271902775258481, 0.6006970375902415, 0.7234375, 0.7353707271819026]
0.707254218816135 0.05402853165561458


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:32:30,848][0m Trial 6 finished with value: 0.7130873431438298 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.019402822868103905, 'l2_leaf_reg': 8.335899635249765, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7416702037351444, 0.7461001269726102, 0.6346776201145133, 0.7161458333333334, 0.7268429315635481]
0.7130873431438298 0.040628755964844415


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:32:42,275][0m Trial 7 finished with value: 0.6978902221529747 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.021589682254820497, 'l2_leaf_reg': 1.889292808382245, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7801758722098311}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7153544142614601, 0.7408851804824959, 0.5700771720189196, 0.7328125, 0.7303218440019981]
0.6978902221529747 0.06443774641949035


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:33:09,877][0m Trial 8 finished with value: 0.7041520785067 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.023842986479175265, 'l2_leaf_reg': 2.2029888271191087, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.22067815519518846}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7275042444821731, 0.7475965898784691, 0.5810306198655713, 0.7348958333333333, 0.7297331049739528]
0.7041520785067 0.061953547401186686


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:34:00,643][0m Trial 9 finished with value: 0.7083227116420219 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.04198968427998903, 'l2_leaf_reg': 9.670735672817248, 'depth': 1, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7265492359932089, 0.7478686740431707, 0.5913617127209361, 0.7296875, 0.7461464354527938]
0.7083227116420219 0.059098312776712125


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:35:08,760][0m Trial 10 finished with value: 0.6968209826800127 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.09165574707426973, 'l2_leaf_reg': 3.844277067316892, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.9094796201978186}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7469227504244482, 0.6986667875929622, 0.6107791884490914, 0.7109375, 0.7167986869335616]
0.6968209826800127 0.04585884693242515


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:35:46,020][0m Trial 11 finished with value: 0.6944183213041416 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.010124766582498508, 'l2_leaf_reg': 7.818684522422149, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7172674453252093.[0m


[0.7360993208828523, 0.7372573916198077, 0.5636046801095345, 0.7067708333333333, 0.7283593805751802]
0.6944183213041416 0.06631633351121573


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:36:15,418][0m Trial 12 finished with value: 0.718427276361126 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.07741827729746069, 'l2_leaf_reg': 4.34762224033762, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 12 with value: 0.718427276361126.[0m


[0.7597623089983021, 0.7289588245964085, 0.6245954692556634, 0.7494791666666667, 0.7293406122885893]
0.718427276361126 0.04838848468104415


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:37:00,834][0m Trial 13 finished with value: 0.6874565357412079 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.08968670930618243, 'l2_leaf_reg': 4.123863141676485, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.183938741256625}. Best is trial 12 with value: 0.718427276361126.[0m


[0.7176358234295416, 0.7157173952475966, 0.6109658949464775, 0.6838541666666667, 0.7091093984157568]
0.6874565357412079 0.04010512994507215


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:39:03,233][0m Trial 14 finished with value: 0.7141237905048345 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.06500541343140498, 'l2_leaf_reg': 4.763191184213861, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 12 with value: 0.718427276361126.[0m


[0.7288306451612903, 0.7360330128786505, 0.6440751804829474, 0.7421875, 0.7194926140012845]
0.7141237905048345 0.03583157663646669


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:39:32,005][0m Trial 15 finished with value: 0.7067770182288264 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.0648931812599479, 'l2_leaf_reg': 2.9490274178923794, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 12 with value: 0.718427276361126.[0m


[0.7288306451612903, 0.7424723381099221, 0.6251555887478217, 0.7140625, 0.7233640191250981]
0.7067770182288264 0.04183487281248427


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:39:56,189][0m Trial 16 finished with value: 0.6927351616759173 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.07100705192097594, 'l2_leaf_reg': 1.0620414239888065, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 12 with value: 0.718427276361126.[0m


[0.7018781833616299, 0.7357609287139488, 0.6045556385362211, 0.7119791666666666, 0.7095018911011204]
0.6927351616759173 0.04552338897666242


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:40:37,836][0m Trial 17 finished with value: 0.6979581527955535 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.05035480333064966, 'l2_leaf_reg': 5.174948797562265, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.418775871307282}. Best is trial 12 with value: 0.718427276361126.[0m


[0.7466044142614601, 0.7005260293850898, 0.6007592730893702, 0.7135416666666667, 0.7283593805751802]
0.6979581527955535 0.05096617607002798
Number of finished trials: 18
Best trial:
  Value: 0.718427276361126
  Params: 
    objective: MultiLogloss
    colsample_bylevel: 0.07741827729746069
    l2_leaf_reg: 4.34762224033762
    depth: 8
    boosting_type: Plain
    bootstrap_type: MVS


In [13]:
# K-fold x multi-seed training with the best Optuna parameters.
# For every fold, N_RANDOM_SEEDS models are fitted (early stopping on the fold's
# validation part), saved to disk, and their probabilities are summed into
# `pred_proba_oof` (validation rows of that fold) and `pred_proba_test` (all test rows).
# The sums are turned into averages after the loop.
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# Reset the accumulators in place so that re-running this cell starts from zero instead
# of adding to the (already averaged) values left by a previous run.
pred_proba_oof.iloc[:, :] = 0.0
pred_proba_test.iloc[:, :] = 0.0

test_pool = cb.Pool(data=test, cat_features=CAT_COLS)

# The output directory is the same for every model, so resolve it once.
model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
check_path(model_path)

for fold, (train_idx, val_idx) in enumerate(
    tqdm(cv.split(X_train, Y_train), total=N_SPLITS)
):
    train_pool = cb.Pool(
        data=X_train.iloc[train_idx],
        label=Y_train.iloc[train_idx],
        cat_features=CAT_COLS,
    )
    val_pool = cb.Pool(
        data=X_train.iloc[val_idx],
        label=Y_train.iloc[val_idx],
        cat_features=CAT_COLS,
    )

    for random_seed in tqdm(range(N_RANDOM_SEEDS), total=N_RANDOM_SEEDS):
        # `trial` is the best Optuna trial selected in the tuning cell.
        clf = cb.CatBoostClassifier(
            **trial.params,
            n_estimators=2000,
            random_seed=random_seed,
            early_stopping_rounds=100,
            silent=True,
        )
        clf.fit(train_pool, eval_set=val_pool, plot=False)

        model_name = f'{EXPERIMENT_NAME}_fold_{fold}_rs_{random_seed}.cbm'
        clf.save_model(os.path.join(model_path, model_name))

        # Accumulate sums; they are normalised once all folds are done.
        pred_proba_oof.iloc[val_idx, :] += clf.predict_proba(val_pool)
        pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)

        # Free the model before fitting the next one.
        del clf
        gc.collect()

    # Free the fold's pools before building the next fold.
    del train_pool, val_pool
    gc.collect()

# Each training row is expected to fall into exactly one validation fold, so it has
# received N_RANDOM_SEEDS predictions; every test row has received one per model.
pred_proba_oof /= N_RANDOM_SEEDS
pred_proba_test /= (N_SPLITS * N_RANDOM_SEEDS)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

# PREDICT AND SAVE PREDICTIONS

In [14]:
# Derive thresholds from the true training targets and the out-of-fold probabilities,
# then combine them with the averaged test probabilities and the sample submission
# (indexed by its 'ID' column) to build the submission frame.
# NOTE(review): get_tresholds and make_prediction are project helpers not shown here;
# presumably the thresholds are per target and one of the helpers prints the scores
# seen in this cell's output - confirm against helper.py / metrics.py.
tresholds = get_tresholds(train[cfg.TARGETS], pred_proba_oof)
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')
submission = make_prediction(pred_proba_test, tresholds, sample_submission)

[0.7088062669771051, 0.7201466658507232, 0.6145061150212755, 0.6634733415167849, 0.737991930013127]
0.6889848638758032 0.04463980344631226


In [15]:
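## BEST PARAMS - reference result copied from the output of the thresholds cell above
# Five scores (presumably one per target column):
# [0.7088062669771051, 0.7201466658507232, 0.6145061150212755, 0.6634733415167849, 0.737991930013127]
# Their mean, followed by a second value that is presumably their standard deviation:
# 0.6889848638758032 0.04463980344631226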

In [16]:
def _experiment_dir(root):
    """Return ``<root>/<EXPERIMENT_FAMILY_NAME>`` after passing it to ``check_path``.

    ``check_path`` is a project helper; presumably it creates the directory when missing.
    """
    target_dir = os.path.join(root, EXPERIMENT_FAMILY_NAME)
    check_path(target_dir)
    return target_dir


# Submission file (CSV).
submission_path = _experiment_dir(cfg.SUBMISSION_PATH)
submission.to_csv(os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv'))

# Out-of-fold probabilities (pickle).
pred_proba_oof_path = _experiment_dir(cfg.OOF_PRED_PATH)
pred_proba_oof.to_pickle(os.path.join(pred_proba_oof_path, f'{EXPERIMENT_NAME}.pkl'))

# Averaged test probabilities (pickle).
pred_proba_test_path = _experiment_dir(cfg.TEST_PRED_PATH)
pred_proba_test.to_pickle(os.path.join(pred_proba_test_path, f'{EXPERIMENT_NAME}.pkl'))