# IMPORTS

In [62]:
import gc
import os
import sys

# Extend the module search path before importing config, which is resolved via this entry.
sys.path.insert(0, "../..")
import config as cfg

In [63]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
from metrics import *
from helper import *
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [64]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu
import hyperopt

# MODEL TRAINING

In [65]:
# Load the prepared train / test frames from the pickle files configured in cfg.
# NOTE(review): pd.read_pickle executes arbitrary code from the file — only use trusted artefacts.
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [66]:
# Separate the target columns (labels) from the remaining columns (features).
Y_train = train[cfg.TARGETS]
X_train = train.drop(columns=cfg.TARGETS)

# Remember the original feature columns so they can be removed again
# once the base-model predictions have been joined in.
ORIGINAL_COLS = X_train.columns

In [67]:
# Base models and experiments whose saved predictions are combined by asemble_data;
# each name is used as a path component (<base_path>/<model_name>/<experiment>.pkl).
model_names = ['logreg', 'mlp', 'svm', 'gbt', 'rf'] 
experiments = ['baseline']

In [68]:
def asemble_data(base_path: str, models=None, runs=None) -> pd.DataFrame:
    """Collect saved base-model predictions into one wide feature frame.

    For every (model, experiment) pair the pickle stored at
    ``<base_path>/<model>/<experiment>.pkl`` is loaded, its columns are
    prefixed with ``<model>_<experiment>_`` and all frames are concatenated
    column-wise (aligned on their index).

    Args:
        base_path: Root directory that contains one sub-directory per model.
        models: Iterable of model names to load. Defaults to the module-level
            ``model_names`` list when omitted.
        runs: Iterable of experiment names to load. Defaults to the
            module-level ``experiments`` list when omitted.

    Returns:
        A DataFrame with one block of prefixed columns per (model, experiment)
        pair, ordered model-major.

    Raises:
        FileNotFoundError: If one of the expected pickle files is missing.
        ValueError: If there is nothing to concatenate (no models/experiments).
    """
    # Fall back to the notebook-level configuration so existing calls
    # (asemble_data(path)) behave exactly as before.
    if models is None:
        models = model_names
    if runs is None:
        runs = experiments

    frames = []
    for model_name in models:
        for experiment in runs:
            path = os.path.join(base_path, model_name, f'{experiment}.pkl')
            # NOTE: unpickling executes code from the file; only load trusted artefacts.
            tdf = pd.read_pickle(path)
            # Prefix columns so identical target names from different models do not collide.
            tdf.columns = [f'{model_name}_{experiment}_{col}' for col in tdf.columns]
            frames.append(tdf)
    return pd.concat(frames, axis=1)

In [69]:
# Append the base models' out-of-fold predictions to the training features ...
oof_features = asemble_data(cfg.OOF_PRED_PATH)
X_train = X_train.join(oof_features)

# ... and their test-set predictions to the test features (index-aligned joins).
test_features = asemble_data(cfg.TEST_PRED_PATH)
test = test.join(test_features)

In [70]:
# Sanity check: train and test must expose the same columns in the same order.
assert list(X_train.columns) == list(test.columns)

In [71]:
# Zero-initialised accumulators, one column per target:
# out-of-fold probabilities for the training rows and summed probabilities for the test rows.
pred_proba_oof = pd.DataFrame(0.0, index=train.index, columns=cfg.TARGETS)
pred_proba_test = pd.DataFrame(0.0, index=test.index, columns=cfg.TARGETS)

# Placeholder for metric values; not populated in the cells visible here.
metrics = {}

In [72]:
# Experiment identifiers; used below as directory / file-name components
# for the saved models, predictions and the submission file.
EXPERIMENT_FAMILY_NAME = 'stacking'
EXPERIMENT_NAME = 'baseline'
# Seed for the shuffled cross-validation split.
RANDOM_STATE = 77
# Number of cross-validation folds; also the divisor used to average the test predictions.
N_SPLITS = 5
# NOTE(review): not referenced in the visible cells — confirm whether seed averaging is still planned.
N_RANDOM_SEEDS = 7

In [73]:
# Keep only the base-model prediction columns: the original features are removed
# from both frames so the stacker trains on first-level predictions alone.
X_train, test = (
    X_train.drop(columns=ORIGINAL_COLS),
    test.drop(columns=ORIGINAL_COLS),
)

In [74]:
# Shuffled, seeded multilabel-stratified K-fold splitter.
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

CAT_COLS = cfg.CAT_UNORDERED_COLS

# The test pool does not depend on the fold, so it is built once up front.
# cat_features=CAT_COLS is left disabled here and in the per-fold pools.
test_pool = cb.Pool(data=test)

fold = 0
for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

    # Per-fold training / validation pools (positional row selection).
    train_pool = cb.Pool(data=X_train.iloc[train_idx], label=Y_train.iloc[train_idx])
    val_pool = cb.Pool(data=X_train.iloc[val_idx], label=Y_train.iloc[val_idx])

    # Shallow multilabel CatBoost model; the validation pool is the eval set
    # against which early stopping is evaluated.
    clf = cb.CatBoostClassifier(
        iterations=1200,
        learning_rate=0.0001,
        depth=2,
        l2_leaf_reg=5.0,
        loss_function='MultiLogloss',
        custom_metric=['Recall', 'F1'],
        early_stopping_rounds=100,
        silent=True,
    )
    clf.fit(train_pool, eval_set=val_pool, plot=False)

    # Persist the fold model under <MODELS_PATH>/<family>/<experiment>/.
    model_name = f'{EXPERIMENT_NAME}_fold_{fold}.pkl'
    model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
    check_path(model_path)
    clf.save_model(os.path.join(model_path, model_name))

    # Add this fold's validation predictions to the OOF frame and
    # accumulate the test predictions (averaged after the loop).
    val_proba = clf.predict_proba(val_pool)
    pred_proba_oof.iloc[val_idx, :] += val_proba
    test_proba = clf.predict_proba(test_pool)
    pred_proba_test.iloc[:, :] += test_proba
    gc.collect()

    fold += 1

# Turn the summed test probabilities into the mean over folds.
pred_proba_test /= N_SPLITS

  0%|          | 0/5 [00:00<?, ?it/s]

# PREDICT AND SAVE PREDICTIONS

In [75]:
# Derive decision thresholds from the true labels and the out-of-fold probabilities.
# get_tresholds comes from a star import; presumably it yields per-target thresholds and
# prints the score shown in the cell output — verify against the helper.
tresholds = get_tresholds(train[cfg.TARGETS], pred_proba_oof)
# Submission template, indexed by its 'ID' column.
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')
# Build the submission from the fold-averaged test probabilities and those thresholds
# (make_prediction is an external helper; its exact semantics are not visible here).
submission = make_prediction(pred_proba_test, tresholds, sample_submission)

0.6790305016420121


In [76]:
## BEST PARAMS
# Score: 0.6912250203448425 (configuration kept below, commented out, for reference)

# model_names = ['catboost', 'logreg', 'rf', 'mlp'] # 'gbt',  
# experiments = ['baseline']

# cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# CAT_COLS = cfg.CAT_UNORDERED_COLS

# test_pool = cb.Pool(
#         data=test,
#         cat_features=CAT_COLS
#         )

# fold = 0
# for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

#     train_pool = cb.Pool(
#         data=X_train.iloc[train_idx], 
#         label=Y_train.iloc[train_idx],
#         cat_features=CAT_COLS
#         )

#     val_pool = cb.Pool(
#         data=X_train.iloc[val_idx], 
#         label=Y_train.iloc[val_idx],
#         cat_features=CAT_COLS
#         )
        

#     clf = cb.CatBoostClassifier(
#         loss_function='MultiLogloss',
#         custom_metric=['Recall', 'F1'],
#         silent=True,
#         iterations=1200,
#         depth=2,
#         l2_leaf_reg=5.0,
#         learning_rate=0.0001,
#         early_stopping_rounds=100
#     )

#     clf.fit(train_pool, eval_set=val_pool, plot=False)
    
#     model_name = f'{EXPERIMENT_NAME}_fold_{fold}.pkl'
#     model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
#     check_path(model_path)
#     clf.save_model(os.path.join(model_path, model_name))
    
#     pred_proba_oof.iloc[val_idx, :] += clf.predict_proba(val_pool)
#     pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)
#     gc.collect()

#     fold += 1
    
# pred_proba_test /= N_SPLITS

In [77]:
# Submission CSV -> <SUBMISSION_PATH>/<family>/<experiment>.csv
submission_path = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
check_path(submission_path)
submission_file = os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv')
submission.to_csv(submission_file)

# Out-of-fold probabilities -> <OOF_PRED_PATH>/<family>/<experiment>.pkl
pred_proba_oof_path = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_oof_path)
oof_file = os.path.join(pred_proba_oof_path, f'{EXPERIMENT_NAME}.pkl')
pred_proba_oof.to_pickle(oof_file)

# Fold-averaged test probabilities -> <TEST_PRED_PATH>/<family>/<experiment>.pkl
pred_proba_test_path = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_test_path)
test_file = os.path.join(pred_proba_test_path, f'{EXPERIMENT_NAME}.pkl')
pred_proba_test.to_pickle(test_file)