# IMPORTS

In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds, compute_weird_pred_proba_score
from sklearn.metrics import recall_score
from helper import make_prediction, check_path
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
import optuna
from optuna.integration import CatBoostPruningCallback

In [4]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

# MODEL TRAINING

In [5]:
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [6]:
X_train, Y_train = train.drop(cfg.TARGETS, axis=1), train[cfg.TARGETS]

In [7]:
pred_proba_oof = pd.DataFrame(data=np.zeros(shape=(len(train), len(cfg.TARGETS))), index=train.index, columns=cfg.TARGETS)
pred_proba_test = pd.DataFrame(data=np.zeros(shape=(len(test), len(cfg.TARGETS))), index=test.index, columns=cfg.TARGETS)
metrics = {}

In [8]:
EXPERIMENT_FAMILY_NAME = 'catboost'
EXPERIMENT_NAME = 'lossguide_optuna'
RANDOM_STATE = 77
N_SPLITS = 5
N_RANDOM_SEEDS = 7

In [9]:
CAT_COLS = cfg.CAT_UNORDERED_COLS + cfg.BINARY_COLS

In [10]:
train_x, valid_x, train_y, valid_y = train_test_split(X_train, Y_train, test_size=0.35, random_state=RANDOM_STATE, shuffle=True)

In [11]:
def objective(trial: optuna.Trial) -> float:

    param = {
        "objective": trial.suggest_categorical("objective", ["MultiLogloss", "MultiCrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.5, 10.0, log=False),
        "depth": trial.suggest_int("depth", 1, 12),
        # "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        # "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
        "cat_features": CAT_COLS,
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        # "used_ram_limit": "3gb",
        "eval_metric": "MultiLogloss",
        "grow_policy": "Lossguide"
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    preds = gbm.predict_proba(valid_x)
    
    score = compute_weird_pred_proba_score(valid_y, preds, sub_std=False)
    return score

In [12]:
study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=25), direction="maximize"
    )
study.optimize(objective, n_trials=500, timeout=600)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-07-05 22:28:24,607][0m A new study created in memory with name: no-name-0f27287d-9961-4e43-a434-afa1854b090d[0m
  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:28:28,982][0m Trial 0 finished with value: 0.7249635229769813 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.09549767223063807, 'l2_leaf_reg': 7.025981383133219, 'depth': 9, 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7561544991511036, 0.7392073281335027, 0.6409634055265123, 0.76875, 0.7197423820737886]
0.7249635229769813 0.04511385284841571


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:28:34,570][0m Trial 1 finished with value: 0.7118437537719771 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.04099679454536311, 'l2_leaf_reg': 2.3292097517164416, 'depth': 4, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.337954507584557}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7285123089983021, 0.7423816433883548, 0.5842668658202639, 0.7619791666666667, 0.7420787839862985]
0.7118437537719771 0.06467454283283498


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:28:38,902][0m Trial 2 finished with value: 0.7134131073697946 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.09803018946416092, 'l2_leaf_reg': 8.321012689209024, 'depth': 1, 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7509019524617997, 0.7493651369490295, 0.5832710978342046, 0.74375, 0.7397773496039391]
0.7134131073697946 0.06519249175510536


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:28:43,720][0m Trial 3 finished with value: 0.7158962928501271 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.026489292663501918, 'l2_leaf_reg': 6.9603213222384595, 'depth': 12, 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7393887945670627, 0.7628786504625431, 0.5926064227035102, 0.7479166666666667, 0.7366909298508528]
0.7158962928501271 0.06231749820372079


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:28:46,885][0m Trial 4 finished with value: 0.7170118931630738 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.06485585796212047, 'l2_leaf_reg': 2.498917744625331, 'depth': 9, 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7383807300509337, 0.7542173045528751, 0.6168782673637043, 0.7510416666666666, 0.7245414971811889]
0.7170118931630738 0.05115140909833846


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:28:52,536][0m Trial 5 finished with value: 0.6935230085953378 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.012566213280057078, 'l2_leaf_reg': 4.1564170762267825, 'depth': 5, 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7288306451612903, 0.7340830763649555, 0.5934154841921833, 0.6838541666666667, 0.7274316705915935]
0.6935230085953378 0.05321005281353419


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:28:57,500][0m Trial 6 finished with value: 0.7221503878085885 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.04563911079994494, 'l2_leaf_reg': 6.3952146577920885, 'depth': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.29824522923347874}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7301570458404074, 0.7458280428079086, 0.6377271595718198, 0.7609375, 0.7361021908228074]
0.7221503878085885 0.04347668861474979


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:03,175][0m Trial 7 finished with value: 0.6960525374190609 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.01089155722123796, 'l2_leaf_reg': 2.0054630587449154, 'depth': 12, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.396811267208736}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.70776740237691, 0.7559858516234355, 0.560555140652228, 0.7208333333333333, 0.7351209591093985]
0.6960525374190609 0.06961208826433571


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:08,392][0m Trial 8 finished with value: 0.6987148085419441 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.02205339220685345, 'l2_leaf_reg': 5.902230825237261, 'depth': 1, 'bootstrap_type': 'Bernoulli', 'subsample': 0.35635745572142946}. Best is trial 0 with value: 0.7249635229769813.[0m


[0.7301570458404074, 0.7323145292943951, 0.5765496639283048, 0.7005208333333334, 0.7540319703132805]
0.6987148085419441 0.06341155760810531


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:13,029][0m Trial 9 finished with value: 0.7280874770072142 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.04589118600175105, 'l2_leaf_reg': 0.5241052154683326, 'depth': 12, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7610887096774194, 0.746009432251043, 0.6081652974856858, 0.7958333333333334, 0.7293406122885893]
0.7280874770072142 0.06384775387764415


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:19,919][0m Trial 10 finished with value: 0.7029426381149542 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.018262457341102333, 'l2_leaf_reg': 0.5661234261836077, 'depth': 9, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1246863627380521}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7285123089983021, 0.7543986939960095, 0.5644759770973363, 0.7192708333333333, 0.7480553771497895]
0.7029426381149542 0.07039363487077445


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:24,320][0m Trial 11 finished with value: 0.7138877678137978 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.09326339746315775, 'l2_leaf_reg': 9.860383182548404, 'depth': 9, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7597623089983021, 0.7277344458552513, 0.6043066965397063, 0.7484375, 0.7291978876757297]
0.7138877678137978 0.05609331326918566


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:28,563][0m Trial 12 finished with value: 0.7172872991913672 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.05711219836122009, 'l2_leaf_reg': 4.131945760050709, 'depth': 10, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7380623938879457, 0.7322238345728278, 0.5857605177993528, 0.7994791666666667, 0.7309105830300435]
0.7172872991913672 0.07056216523580114


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:33,007][0m Trial 13 finished with value: 0.7140412553202601 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.06833989815513815, 'l2_leaf_reg': 7.956938868804092, 'depth': 11, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.740025466893039, 0.7441501904589154, 0.5972740851381628, 0.7682291666666667, 0.7205273674445158]
0.7140412553202601 0.06032362496593188


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:38,395][0m Trial 14 finished with value: 0.7053810250198851 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.03514192384202295, 'l2_leaf_reg': 4.460208881736191, 'depth': 7, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7318017826825127, 0.7375294757845093, 0.5815907393577296, 0.7494791666666667, 0.7265039606080068]
0.7053810250198851 0.06236415420882475


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:43,618][0m Trial 15 finished with value: 0.7132300777305287 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.05480910023566885, 'l2_leaf_reg': 9.723073742706159, 'depth': 7, 'bootstrap_type': 'Bernoulli', 'subsample': 0.10483673023015085}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7173174872665535, 0.7424723381099221, 0.597460791635549, 0.7791666666666667, 0.7297331049739528]
0.7132300777305287 0.06147247836305792


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:48,199][0m Trial 16 finished with value: 0.7196381381168225 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.07557077213831007, 'l2_leaf_reg': 8.025064641727537, 'depth': 10, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7446413412563667, 0.7310901505532379, 0.6116504854368932, 0.7791666666666667, 0.7316420466709483]
0.7196381381168225 0.05675452654949666


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:52,202][0m Trial 17 finished with value: 0.7064958060484412 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.03112056478516732, 'l2_leaf_reg': 0.7829744598817547, 'depth': 8, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7199702886247878, 0.7629693451841103, 0.5640403286034354, 0.7505208333333333, 0.734978234496539]
0.7064958060484412 0.07268264870659195


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:29:56,622][0m Trial 18 finished with value: 0.7159034458543346 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.045057001710935976, 'l2_leaf_reg': 5.301143716992057, 'depth': 11, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.3600724233762156}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7413518675721562, 0.7579357881371305, 0.5982698531242221, 0.7494791666666667, 0.7324805537714979]
0.7159034458543346 0.05942075670885575


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:00,303][0m Trial 19 finished with value: 0.705361316549935 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.07864558673539358, 'l2_leaf_reg': 3.181989825151764, 'depth': 3, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9764159389934741}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7314834465195246, 0.746009432251043, 0.5825242718446602, 0.7432291666666666, 0.72356026546778]
0.705361316549935 0.06194957683520052


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:07,740][0m Trial 20 finished with value: 0.6948157314884853 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.01654622543591899, 'l2_leaf_reg': 7.102720871721182, 'depth': 11, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7271859083191851, 0.7355795392708144, 0.5822753298481453, 0.6973958333333333, 0.7316420466709483]
0.6948157314884853 0.057857916985979335


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:13,152][0m Trial 21 finished with value: 0.7105081410348278 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.0477504715421272, 'l2_leaf_reg': 6.430891998640108, 'depth': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.27914674464620526}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7298387096774194, 0.7321331398512607, 0.6087876524769729, 0.7333333333333334, 0.7484478698351531]
0.7105081410348278 0.05128114989976602


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:20,049][0m Trial 22 finished with value: 0.7045237486093836 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.03577117303464779, 'l2_leaf_reg': 5.344380241483151, 'depth': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.29977434115411117}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7452780135823429, 0.7326773081806639, 0.5744336569579288, 0.7401041666666666, 0.7301255976593164]
0.7045237486093836 0.06526620227929658


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:32,063][0m Trial 23 finished with value: 0.7087394775594957 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.028773489278080253, 'l2_leaf_reg': 8.811749603650874, 'depth': 8, 'bootstrap_type': 'Bernoulli', 'subsample': 0.14535619989276857}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7222516977928692, 0.7490930527843279, 0.5642270351008215, 0.7619791666666667, 0.7461464354527938]
0.7087394775594957 0.07338744054603921


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:45,391][0m Trial 24 finished with value: 0.7157663971746062 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.05147915360075481, 'l2_leaf_reg': 6.855621444431592, 'depth': 5, 'bootstrap_type': 'Bernoulli', 'subsample': 0.572741956890634}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7222516977928692, 0.7476872846000363, 0.5906148867313916, 0.7729166666666667, 0.7453614500820667]
0.7157663971746062 0.06460019071793023


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:30:58,592][0m Trial 25 finished with value: 0.7120165389207787 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.04206779154099419, 'l2_leaf_reg': 6.195189088277855, 'depth': 8, 'bootstrap_type': 'Bernoulli', 'subsample': 0.18617641988793335}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7370543293718166, 0.7664157446036641, 0.5863206372915111, 0.7494791666666667, 0.7208128166702348]
0.7120165389207787 0.06460100184341731


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:31:14,181][0m Trial 26 finished with value: 0.7067695839806227 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.060115935672035964, 'l2_leaf_reg': 7.606383733904405, 'depth': 3, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7314834465195246, 0.7378015599492109, 0.6047423450336071, 0.728125, 0.7316955684007707]
0.7067695839806227 0.05110904912817753


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:31:28,484][0m Trial 27 finished with value: 0.7238013806282831 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.07930624153154599, 'l2_leaf_reg': 4.611263039702367, 'depth': 10, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7331281833616299, 0.7543079992744421, 0.6104057754543192, 0.7885416666666667, 0.7326232783843574]
0.7238013806282831 0.0602417410431817


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:31:41,095][0m Trial 28 finished with value: 0.7153019324969978 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.08186208232258402, 'l2_leaf_reg': 3.5157487161250787, 'depth': 12, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.733446519524618, 0.7559858516234355, 0.6136420214090117, 0.7338541666666667, 0.7395811032612574]
0.7153019324969978 0.051483605860493005


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:31:57,961][0m Trial 29 finished with value: 0.7095266222891249 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.08338618863789629, 'l2_leaf_reg': 1.3554984796123013, 'depth': 10, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.93309983041072}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7406621392190154, 0.7172138581534555, 0.5827109783420463, 0.7875000000000001, 0.7195461357311068]
0.7095266222891249 0.06825094847794339


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:32:13,419][0m Trial 30 finished with value: 0.7163769275102684 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.06875920950664231, 'l2_leaf_reg': 8.954671512096914, 'depth': 11, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7331281833616299, 0.7377108652276437, 0.5986432661189942, 0.7854166666666667, 0.7269856561764076]
0.7163769275102684 0.06241151688978101


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:32:27,611][0m Trial 31 finished with value: 0.7178779741327933 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.08981368133948063, 'l2_leaf_reg': 4.869466799611219, 'depth': 10, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7271859083191851, 0.7476872846000363, 0.6107791884490914, 0.7755208333333333, 0.7282166559623207]
0.7178779741327933 0.056347085253914735


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:32:35,740][0m Trial 32 finished with value: 0.707729444285324 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.04027163207021236, 'l2_leaf_reg': 5.77076585594307, 'depth': 5, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7311651103565365, 0.7338109922002539, 0.6120238984316655, 0.7291666666666666, 0.7324805537714979]
0.707729444285324 0.04787734540930732


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:32:55,288][0m Trial 33 finished with value: 0.7164519663269259 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.025368604549161917, 'l2_leaf_reg': 7.071364324570469, 'depth': 12, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.752546689303905, 0.7476872846000363, 0.585075927308937, 0.7635416666666667, 0.7334082637550845]
0.7164519663269259 0.06639743016566571


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:33:23,893][0m Trial 34 finished with value: 0.7100965916212919 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.03672392714448534, 'l2_leaf_reg': 3.204383331433324, 'depth': 9, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.740025466893039, 0.7477779793216035, 0.5929798356982823, 0.7328125, 0.7368871761935345]
0.7100965916212919 0.05876343588861214


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:34:01,446][0m Trial 35 finished with value: 0.7003513019788825 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.06509401969672893, 'l2_leaf_reg': 2.3162383234444683, 'depth': 8, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.0832663558477758}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7383807300509337, 0.7173952475965899, 0.5547672392332587, 0.759375, 0.7318382930136302]
0.7003513019788825 0.07403604573414059


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:34:39,232][0m Trial 36 finished with value: 0.7109643001438604 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.05169359568096364, 'l2_leaf_reg': 7.51191302102022, 'depth': 4, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7380623938879457, 0.7326773081806639, 0.5927931292008961, 0.7572916666666667, 0.73399700278313]
0.7109643001438604 0.05974431742753158


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:35:09,289][0m Trial 37 finished with value: 0.7063863282522791 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.07331790242919745, 'l2_leaf_reg': 1.511821726446521, 'depth': 7, 'bootstrap_type': 'Bernoulli', 'subsample': 0.5129346781722058}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7337648556876061, 0.746009432251043, 0.5954692556634305, 0.7265625, 0.7301255976593164]
0.7063863282522791 0.0558440721027347


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:35:37,479][0m Trial 38 finished with value: 0.719735849940038 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.09616671431415565, 'l2_leaf_reg': 6.570815671481412, 'depth': 12, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7413518675721562, 0.7476872846000363, 0.6087876524769729, 0.7682291666666667, 0.7326232783843574]
0.719735849940038 0.05670224866296587


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:36:24,132][0m Trial 39 finished with value: 0.7188638295127096 and parameters: {'objective': 'MultiCrossEntropy', 'colsample_bylevel': 0.05901863834857219, 'l2_leaf_reg': 5.833587650320574, 'depth': 11, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.878397702458066}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7459677419354839, 0.7272809722474152, 0.6041199900423201, 0.7744791666666666, 0.7424712766716621]
0.7188638295127096 0.059366651151897376


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:36:53,485][0m Trial 40 finished with value: 0.7213501085218377 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.09586040830060777, 'l2_leaf_reg': 4.682844880789226, 'depth': 9, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7564728353140917, 0.7474152004353347, 0.5942245456808564, 0.7854166666666667, 0.7232212945122386]
0.7213501085218377 0.06660619924744855


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:37:18,901][0m Trial 41 finished with value: 0.7187098234396967 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.09770102086353283, 'l2_leaf_reg': 3.82223309575038, 'depth': 9, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7498938879456706, 0.7477779793216035, 0.615260144386358, 0.7541666666666667, 0.7264504388781845]
0.7187098234396967 0.052604837269427676


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:37:46,910][0m Trial 42 finished with value: 0.7185485526419659 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.08512366891009664, 'l2_leaf_reg': 4.72943981371934, 'depth': 10, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7429966044142615, 0.7374387810629421, 0.603746577047548, 0.7791666666666667, 0.7293941340184115]
0.7185485526419659 0.059876913662852665


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:38:16,050][0m Trial 43 finished with value: 0.7182591126672893 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.07044092347761914, 'l2_leaf_reg': 2.6112646916948603, 'depth': 9, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7403438030560272, 0.7510429892980228, 0.563480209111277, 0.8088541666666667, 0.727574395204453]
0.7182591126672893 0.08223534933397461


  pruning_callback = CatBoostPruningCallback(trial, "MultiLogloss")
[32m[I 2022-07-05 22:38:44,236][0m Trial 44 finished with value: 0.7172610165944183 and parameters: {'objective': 'MultiLogloss', 'colsample_bylevel': 0.09998401599410277, 'l2_leaf_reg': 5.255122538833026, 'depth': 6, 'bootstrap_type': 'MVS'}. Best is trial 9 with value: 0.7280874770072142.[0m


[0.7255411714770799, 0.7374387810629421, 0.6057381130196664, 0.784375, 0.7332120174124028]
0.7172610165944183 0.05945045392800889
Number of finished trials: 45
Best trial:
  Value: 0.7280874770072142
  Params: 
    objective: MultiCrossEntropy
    colsample_bylevel: 0.04589118600175105
    l2_leaf_reg: 0.5241052154683326
    depth: 12
    bootstrap_type: MVS


In [13]:
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# CAT_COLS = cfg.CAT_UNORDERED_COLS

test_pool = cb.Pool(
        data=test,
        cat_features=CAT_COLS)


fold = 0
for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

    train_pool = cb.Pool(
        data=X_train.iloc[train_idx], 
        label=Y_train.iloc[train_idx],
        cat_features=CAT_COLS)

    val_pool = cb.Pool(
        data=X_train.iloc[val_idx], 
        label=Y_train.iloc[val_idx],
        cat_features=CAT_COLS)
        
    for random_seed in tqdm(range(N_RANDOM_SEEDS), total=N_RANDOM_SEEDS):

        clf = cb.CatBoostClassifier(
            **trial.params,
            n_estimators=2000,
            random_seed=random_seed,
            early_stopping_rounds=100,
            silent=True
        )

        clf.fit(train_pool, eval_set=val_pool, plot=False)
        
        model_name = f'{EXPERIMENT_NAME}_fold_{fold}_rs_{random_seed}.cbm'
        model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
        check_path(model_path)
        clf.save_model(os.path.join(model_path, model_name))
        
        pred_proba_oof.iloc[val_idx, :] += clf.predict_proba(val_pool)
        pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)
        del clf; gc.collect()
    del train_pool,val_pool; gc.collect() 
        
    fold += 1
pred_proba_oof /= N_RANDOM_SEEDS
pred_proba_test /= (N_SPLITS * N_RANDOM_SEEDS)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

# PREDICT AND SAVE PREDICTIONS

In [14]:
tresholds = get_tresholds(train[cfg.TARGETS], pred_proba_oof)
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')
submission = make_prediction(pred_proba_test, tresholds, sample_submission)

[0.7024641055490881, 0.7098759765824205, 0.6184400674391843, 0.6525991354005444, 0.7367100707445355]
0.6840178711431546 0.042598024305826765


In [15]:
## BEST PARAMS
# 0.6882056101133809 

In [16]:
submission_path = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
check_path(submission_path)
submission.to_csv(os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv'))

pred_proba_oof_path = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_oof_path)
pred_proba_oof.to_pickle(os.path.join(pred_proba_oof_path, f'{EXPERIMENT_NAME}.pkl'))

pred_proba_test_path = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_test_path)
pred_proba_test.to_pickle(os.path.join(pred_proba_test_path, f'{EXPERIMENT_NAME}.pkl'))