In [32]:
import sys
sys.path.insert(0, "..")
import config as cfg
import gc
import os
from tqdm.notebook import tqdm
from helper import check_path, seed_everything
from collections import defaultdict

In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [34]:
# Identifiers for this run. The family name becomes the sub-directory created
# under the submission / OOF / test prediction paths, and the experiment name
# becomes the stem of the files written into those directories.
EXPERIMENT_FAMILY_NAME = "blending"
EXPERIMENT_NAME = "baseline"
# Not referenced by any of the visible cells of this notebook.
N_RANDOM_SEEDS = 1

In [35]:
# Fixed seed for the whole notebook. seed_everything is imported from the
# project's helper module; presumably it seeds every RNG the project uses
# (python / numpy / DL frameworks) -- TODO confirm in helper.py.
RANDOM_STATE = 77
seed_everything(RANDOM_STATE)

In [36]:
# Predictions that take part in the blend, grouped by model family.
# Each key is the sub-directory under cfg.OOF_PRED_PATH / cfg.TEST_PRED_PATH,
# and each list entry is the stem of a '<experiment>.pkl' file inside it.
#
# Alternatives that are currently disabled:
#   keras:    'xlm'
#   catboost: 'ext_text_process', 'deeppavlov', 'rubert_tiny'
model_names = dict(
    keras=['labse_emb'],
    catboost=['deeppavlov_optuna'],
)

In [37]:
# Load the preprocessed train and test frames from the project's data directory.
train = pd.read_pickle(
    os.path.join(cfg.PREPROCESSED_DATA_PATH, 'train.pkl')
)
test = pd.read_pickle(
    os.path.join(cfg.PREPROCESSED_DATA_PATH, 'test.pkl')
)

# Separate the target column from the remaining training columns.
y_train = train[cfg.TARGET]
X_train = train.drop(columns=cfg.TARGET)

# Sorted list of the distinct target values; it fixes the column order of
# every class-score frame built further down in the notebook.
CLASSES = np.sort(y_train.unique()).tolist()

In [38]:
from scipy.special import softmax
from sklearn.preprocessing import *

In [39]:
def scale(df):
    """Rescale a table of per-class scores and turn each row into a distribution.

    A freshly fitted ``MinMaxScaler`` (default settings) is applied to ``df``,
    i.e. the scaler is fitted on exactly the data it transforms. A softmax is
    then taken along ``axis=1`` so that every row sums to 1.

    Parameters
    ----------
    df : 2-D array-like
        One row per sample, one column per class.

    Returns
    -------
    The softmax output for the scaled values. This is whatever
    ``scipy.special.softmax`` returns for the scaler's output (normally a
    plain array), so the index and column labels of ``df`` are not carried
    over.
    """
    rescaled = MinMaxScaler().fit_transform(df)
    return softmax(rescaled, axis=1)

In [40]:
def _load_scaled_predictions(pred_path, reference):
    """Load one model's class-score pickle, align it to ``reference`` and rescale it.

    Parameters
    ----------
    pred_path : str
        Path of a pickled DataFrame with one column per class.
    reference : pandas.DataFrame
        Frame whose index defines the rows (and their order) of the result.

    Returns
    -------
    The output of ``scale`` for the aligned frame. It is added positionally to
    the accumulators below, which is why the columns are put into ``CLASSES``
    order first.
    """
    pred = pd.read_pickle(pred_path)
    # Left-join onto the (column-less) reference index: the result has exactly
    # the reference rows, and rows the model did not score are filled with the
    # model's own per-column mean.
    aligned = reference[[]].join(pred).fillna(pred.mean())
    # scale() drops the labels, so a pickle whose columns are the same classes
    # in another order would silently be blended into the wrong columns.
    # Reorder only when the labels are unique and are exactly the CLASSES
    # entries; otherwise keep the original positional behaviour.
    if aligned.columns.is_unique and set(aligned.columns) == set(CLASSES):
        aligned = aligned[CLASSES]
    return scale(aligned)


# Zero-initialised accumulators for the blended scores, one column per class.
pred_proba_oof = pd.DataFrame(data=np.zeros(shape=(len(train), len(CLASSES))), index=train.index, columns=CLASSES)
pred_proba_test = pd.DataFrame(data=np.zeros(shape=(len(test), len(CLASSES))), index=test.index, columns=CLASSES)
weights = []  # not used by this cell; every model currently has equal weight

n = 0  # number of blended experiments
for model_name, experiment_names in model_names.items():
    for experiment_name in experiment_names:
        file_name = f'{experiment_name}.pkl'
        pred_proba_oof += _load_scaled_predictions(
            os.path.join(cfg.OOF_PRED_PATH, model_name, file_name), train
        )
        pred_proba_test += _load_scaled_predictions(
            os.path.join(cfg.TEST_PRED_PATH, model_name, file_name), test
        )
        n += 1

# Without this guard an empty configuration gives 0/0 -> NaN -> 0 everywhere,
# and the failure only shows up later as an obscure metric error.
if n == 0:
    raise ValueError('model_names does not list any experiment; nothing to blend')

# Simple (unweighted) average over all blended experiments.
pred_proba_oof /= n
pred_proba_test /= n

pred_proba_oof = pred_proba_oof.fillna(0)
pred_proba_test = pred_proba_test.fillna(0)

In [41]:
# Known labels for test rows; they are used below as y_true for a second AUC
# estimate. NOTE(review): assumed to share its index with `test`, since the
# mask is later applied to pred_proba_test -- confirm.
leak_test = pd.read_pickle(
    os.path.join(cfg.DATA_PATH, 'test_leak.pkl')
)
# True for the entries that actually carry a label.
leak_mask = leak_test.notna()

In [42]:
# One-vs-one multiclass ROC AUC of the blended OOF scores against the
# training targets.
oof_auc_score = roc_auc_score(
    y_true=y_train,
    y_score=pred_proba_oof,
    multi_class='ovo',
    labels=CLASSES,
)
print('oof_auc_score', oof_auc_score)

oof_auc_score 0.8483946596685878


In [43]:
# 0.8518926404809248

In [44]:
# Same one-vs-one ROC AUC, restricted to the test rows that have a label in
# leak_test.
leak_test_auc_score = roc_auc_score(
    y_true=leak_test.loc[leak_mask],
    y_score=pred_proba_test.loc[leak_mask],
    multi_class='ovo',
    labels=CLASSES,
)
print('leak_test_auc_score', leak_test_auc_score)

leak_test_auc_score 0.8763701704512742


In [45]:
# minmax
# leak_test_auc_score 0.8865965873387782

In [46]:
# Build the submission: ids come from the sample file, and the predicted label
# is the class with the highest blended test score.
submission = pd.read_csv(cfg.SAMPLE_SUBMIT_PATH).set_index('id')
# The blend has to cover exactly the submission ids, in the same order.
assert submission.index.equals(pred_proba_test.index)
submission[cfg.TARGET] = pred_proba_test.idxmax(axis=1)

# check_path is a project helper; presumably it creates the directory when it
# is missing -- TODO confirm in helper.py.
submission_path = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
check_path(submission_path)
submission.to_csv(
    os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv')
)

# Store the blended OOF scores next to the individual models' OOF scores.
pred_proba_oof_path = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_oof_path)
pred_proba_oof.to_pickle(
    os.path.join(pred_proba_oof_path, f'{EXPERIMENT_NAME}.pkl')
)

# Same for the blended test scores.
pred_proba_test_path = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_test_path)
pred_proba_test.to_pickle(
    os.path.join(pred_proba_test_path, f'{EXPERIMENT_NAME}.pkl')
)