# IMPORTS

In [1]:
import gc
import os
import sys

sys.path.insert(0, "../..")
import config as cfg

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds
from helper import make_prediction, check_path
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import category_encoders as ce

In [3]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

# MODEL TRAINING

In [4]:
# Load the prepared train and test frames from the pickle paths set in the project config.
train, test = (
    pd.read_pickle(path)
    for path in (cfg.PREPARED_TRAIN_DATA_PATH, cfg.PREPARED_TEST_DATA_PATH)
)

In [5]:
# Separate the target columns (cfg.TARGETS) from everything else in the training frame.
Y_train = train[cfg.TARGETS]
X_train = train.drop(columns=cfg.TARGETS)

In [6]:
# Zero-filled accumulators with one column per target: out-of-fold probabilities
# aligned to the train index, and test probabilities aligned to the test index.
# The training loop adds model outputs into them and averages afterwards.
pred_proba_oof = pd.DataFrame(0.0, index=train.index, columns=cfg.TARGETS)
pred_proba_test = pd.DataFrame(0.0, index=test.index, columns=cfg.TARGETS)
metrics = {}

In [7]:
# Experiment identifiers: used to build the model, submission and prediction output paths.
EXPERIMENT_FAMILY_NAME = 'catboost'
EXPERIMENT_NAME = 'cat_encoders'
# Seed for the CV splitter and the categorical encoders (the CatBoost seeds are set separately).
RANDOM_STATE = 77
# Number of cross-validation folds.
N_SPLITS = 6
# Number of CatBoost seeds per fold; the training loop uses seeds 0 .. N_RANDOM_SEEDS - 1.
N_RANDOM_SEEDS = 1
# Number of tree depths per fold; the training loop uses depths 3 .. 3 + N_DEPTH - 1.
N_DEPTH = 1
# Categorical columns that are both target-encoded and passed to CatBoost as cat_features.
CAT_COLS = cfg.CAT_UNORDERED_COLS

In [8]:
def apply_cat_encoding(encoder, X_train, Y_train, X_val, encoder_name) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Encode categorical columns once per target and stack the results side by side.

    For every column of ``Y_train`` the encoder is re-fitted on ``X_train``
    against that single target, then applied to ``X_val``. Each produced column
    is renamed to ``<column>_<target>_<encoder_name>`` so the per-target
    encodings do not collide.

    Args:
        encoder: object exposing ``fit_transform(X, y)`` and ``transform(X)``
            that return DataFrames; it is left fitted on the last target.
        X_train: categorical features used to fit the encoder.
        Y_train: DataFrame of targets, one column per target.
        X_val: categorical features to encode with the fitted encoder.
        encoder_name: short tag appended to every generated column name.

    Returns:
        ``(train_encoded, val_encoded)``: the column-wise concatenation of all
        per-target encodings for ``X_train`` and ``X_val`` respectively.
    """
    def _tagged(frame: pd.DataFrame, suffix: str) -> pd.DataFrame:
        # Relabels the encoder output in place and hands the same frame back.
        frame.columns = [f'{name}_{suffix}' for name in frame.columns]
        return frame

    encoded_train_parts, encoded_val_parts = [], []
    for target_name in Y_train.columns:
        suffix = f'{target_name}_{encoder_name}'

        train_part = encoder.fit_transform(X_train, Y_train[target_name])
        encoded_train_parts.append(_tagged(train_part, suffix))

        # transform() must run right after the fit for the same target.
        val_part = encoder.transform(X_val)
        encoded_val_parts.append(_tagged(val_part, suffix))

    train_encoded = pd.concat(encoded_train_parts, axis=1)
    val_encoded = pd.concat(encoded_val_parts, axis=1)
    return train_encoded, val_encoded

In [9]:
# Categorical encoders to apply, keyed by the short name that apply_cat_encoding
# appends to every generated column (e.g. "<column>_<target>_WOE").
cat_encoders = {
    'WOE': ce.WOEEncoder(cols=CAT_COLS, random_state=RANDOM_STATE),
    # Target encoding is currently disabled:
    # 'TE': ce.TargetEncoder(cols=CAT_COLS)
}

In [10]:
# Cross-validated CatBoost training.
#
# For every fold, the categorical columns are re-encoded (encoders fitted on the
# fold's training part only), one model is trained per (random seed, depth) pair,
# each model is saved to disk, and its probabilities are accumulated into
# `pred_proba_oof` (validation rows) and `pred_proba_test` (all test rows).
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# Test-set encodings come from encoders fitted on the complete training set.
X_test_cats = []
for cat_encoder_name, cat_encoder in cat_encoders.items():
    _, X_test_cat = apply_cat_encoding(
        encoder=cat_encoder,
        X_train=X_train[CAT_COLS],
        Y_train=Y_train,
        X_val=test[CAT_COLS],
        encoder_name=cat_encoder_name
    )
    X_test_cats.append(X_test_cat)

test_pool = cb.Pool(
    data=pd.concat([test, pd.concat(X_test_cats, axis=1)], axis=1),
    cat_features=CAT_COLS
)

# Every model of this experiment goes to the same directory, so resolve and
# check it once instead of once per fitted model.
# NOTE(review): check_path is a project helper; presumably it creates the
# directory when missing - confirm in helper.py.
model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
check_path(model_path)

for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_train, Y_train), total=N_SPLITS)):

    # Fit the encoders on this fold's training rows only, then encode both parts.
    X_train_cats, X_val_cats = [], []
    for cat_encoder_name, cat_encoder in cat_encoders.items():
        X_train_cat, X_val_cat = apply_cat_encoding(
            encoder=cat_encoder,
            X_train=X_train[CAT_COLS].iloc[train_idx],
            Y_train=Y_train.iloc[train_idx],
            X_val=X_train[CAT_COLS].iloc[val_idx],
            encoder_name=cat_encoder_name
        )
        X_train_cats.append(X_train_cat)
        X_val_cats.append(X_val_cat)

    # The raw categorical columns stay in the pools (declared as cat_features)
    # next to their encoded counterparts.
    train_pool = cb.Pool(
        data=pd.concat([X_train.iloc[train_idx], pd.concat(X_train_cats, axis=1)], axis=1),
        label=Y_train.iloc[train_idx],
        cat_features=CAT_COLS
    )

    val_pool = cb.Pool(
        data=pd.concat([X_train.iloc[val_idx], pd.concat(X_val_cats, axis=1)], axis=1),
        label=Y_train.iloc[val_idx],
        cat_features=CAT_COLS
    )

    # per_float_feature_quantization addresses a feature by its positional index
    # in the pool; this feature gets a much larger border count than the rest.
    golden_feature_index = str(train_pool.get_feature_names().index('regular_medication_intake'))
    for random_seed in tqdm(range(N_RANDOM_SEEDS), total=N_RANDOM_SEEDS):
        # Depths tried are 3, 4, ..., 3 + N_DEPTH - 1.
        for depth in tqdm(range(3, 3 + N_DEPTH), total=N_DEPTH):
            clf = cb.CatBoostClassifier(
                loss_function='MultiLogloss',
                # custom_metric=['Recall', 'F1'],
                iterations=2000,
                silent=True,
                depth=depth,
                l2_leaf_reg=2.0,
                learning_rate=0.003,
                early_stopping_rounds=300,
                bagging_temperature=1,
                per_float_feature_quantization=f'{golden_feature_index}:border_count=2048',
                random_seed=random_seed
            )

            # NOTE(review): the validation fold is also the early-stopping set,
            # so the out-of-fold predictions below are not fully independent.
            clf.fit(train_pool, eval_set=val_pool, plot=False)

            model_name = f'{EXPERIMENT_NAME}_fold_{fold}_rs_{random_seed}_depth_{depth}.cbm'
            clf.save_model(os.path.join(model_path, model_name))

            pred_proba_oof.iloc[val_idx, :] += clf.predict_proba(val_pool)
            pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)
            gc.collect()

# A training row is predicted only by the models of the fold that held it out,
# whereas every model of every fold predicts each test row.
pred_proba_oof /= (N_RANDOM_SEEDS * N_DEPTH)
pred_proba_test /= (N_SPLITS * N_RANDOM_SEEDS * N_DEPTH)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

# PREDICT AND SAVE PREDICTIONS

In [11]:
# Feature importances of `clf`, i.e. only the last model fitted in the training
# loop (not an average over folds), sorted from most to least important.
feat_importance = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=clf.feature_names_,
)
feat_importance = feat_importance.sort_values(by='importance', ascending=False)
feat_importance.head(25)

Unnamed: 0,importance
regular_medication_intake,24.786012
sex,5.702885
education,5.260189
alcohol,4.645285
profession,4.31967
family,4.279059
sleep_time,4.196172
smoking_status,3.652763
wake_up_time,3.599756
passive_smoking_frequency,3.307591


In [12]:
# Submission template, indexed by the 'ID' column.
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH)
sample_submission = sample_submission.set_index('ID')

# Derive thresholds from the out-of-fold probabilities against the true targets,
# then pass them, the averaged test probabilities and the template to make_prediction.
# NOTE(review): get_tresholds / make_prediction are project helpers; the scores
# printed under this cell presumably come from one of them - confirm.
tresholds = get_tresholds(train[cfg.TARGETS], pred_proba_oof)
submission = make_prediction(pred_proba_test, tresholds, sample_submission)

[0.67534924330617, 0.681348550680293, 0.6110538710627024, 0.6929871377488392, 0.7245280026782489]
0.6770533610952507 0.03710798347376895


In [13]:
## BEST PARAMS
# WOE
# [0.6948668509895227, 0.6880086897986659, 0.6023630476088527, 0.6930805358381811, 0.7352167707718467]
# 0.6827071790014139 0.04357982755536629

# RANDOM_STATE = 77
# N_SPLITS = 5
# N_RANDOM_SEEDS = 7

# loss_function='MultiLogloss',
#             custom_metric=['Recall', 'F1'],
#             iterations=1000,
#             silent=True,
#             depth=6,
#             l2_leaf_reg=2.0,
#             learning_rate=0.01,
#             early_stopping_rounds=100,
#             random_seed=random_seed

In [14]:
# Persist the submission plus the raw out-of-fold and test probabilities.
# Each artifact goes to <base path>/<experiment family>/<experiment name>.<ext>.
submission_dir = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
oof_pred_dir = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
test_pred_dir = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)

# to_csv / to_pickle do not create missing parent directories, so make sure the
# per-family folders exist before writing.
for output_dir in (submission_dir, oof_pred_dir, test_pred_dir):
    os.makedirs(output_dir, exist_ok=True)

submission.to_csv(os.path.join(submission_dir, f'{EXPERIMENT_NAME}.csv'))
pred_proba_oof.to_pickle(os.path.join(oof_pred_dir, f'{EXPERIMENT_NAME}.pkl'))
pred_proba_test.to_pickle(os.path.join(test_pred_dir, f'{EXPERIMENT_NAME}.pkl'))