In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from tqdm.notebook import tqdm
from helper import check_path
from collections import defaultdict

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [3]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

In [4]:
# --- Ensemble / CV sizes -----------------------------------------------------
# Number of CatBoost random seeds trained (and averaged) per fold.
N_RANDOM_SEEDS = 1
# NOTE(review): the cross-validation cell in this notebook reads cfg.N_SPLITS,
# not this constant, so this value does not control the number of folds there.
N_SPLITS = 3

# --- Naming used to build model / submission / prediction output paths -------
EXPERIMENT_NAME = 'labse_emb'
EXPERIMENT_FAMILY_NAME = 'catboost'

In [5]:
def filter_rare_categories(df: pd.DataFrame, rare_categories=(12,), target_col=None) -> pd.DataFrame:
    """Return the rows of ``df`` whose target label is not a rare category.

    Parameters
    ----------
    df:
        Frame containing the target column.
    rare_categories:
        Iterable of target labels to drop. Defaults to ``(12,)``, the value
        that was previously hard-coded, so existing calls behave the same.
    target_col:
        Name of the target column. When ``None`` (default) ``cfg.TARGET`` is
        used, matching the previous behaviour.

    Returns
    -------
    pd.DataFrame
        The subset of ``df`` (selected with ``.loc``) without the rare labels;
        ``df`` itself is not modified.
    """
    if target_col is None:
        # Resolve lazily so callers that pass a column explicitly do not need cfg.
        target_col = cfg.TARGET
    is_rare = df[target_col].isin(list(rare_categories))
    return df.loc[~is_rare]

In [6]:

# Load only the columns this experiment uses and index both frames by the id
# column. The index column is taken from cfg.ID_COL (the same name passed to
# `usecols`) instead of a hard-coded 'id', so the two cannot drift apart.
train = pd.read_csv(
    cfg.ORIG_TRAIN_PATH,
    usecols=[cfg.TARGET, cfg.ID_COL, cfg.TEXT_COL],  # cfg.THEME_COL is currently not loaded
).set_index(cfg.ID_COL)
test = pd.read_csv(
    cfg.ORIG_TEST_PATH,
    usecols=[cfg.ID_COL, cfg.TEXT_COL],
).set_index(cfg.ID_COL)

# Sorted list of every target label present in the training data; used as the
# column set of the prediction frames and as `labels=` for the AUC computations.
CLASSES = np.sort(train[cfg.TARGET].unique()).tolist()

# Rare-class filtering is disabled. Note that CLASSES is computed above, i.e.
# before this filter, so enabling it would leave the rare labels in CLASSES.
# train = filter_rare_categories(train)

In [7]:
# Name of the sub-directory of cfg.DATA_PATH holding the pickled embeddings.
EMB_NAME = 'smaller_LaBSE_15lang'
_emb_dir = os.path.join(cfg.DATA_PATH, EMB_NAME)

# NOTE: pd.read_pickle executes arbitrary code on load; only use trusted files.
train_emb = pd.read_pickle(os.path.join(_emb_dir, 'train.pkl'))
test_emb = pd.read_pickle(os.path.join(_emb_dir, 'test.pkl'))

# Index-aligned join: append the embedding columns to the raw frames.
# NOTE(review): this rebinds `train` / `test`, so the cell is not idempotent —
# running it twice without reloading the data joins the embeddings again.
test = test.join(test_emb)
train = train.join(train_emb)

# Drop the standalone embedding frames and reclaim their memory.
del train_emb, test_emb
gc.collect()

4

In [8]:
# Separate the label column from the feature columns.
y_train = train[cfg.TARGET]
X_train = train.drop(columns=cfg.TARGET)

In [9]:
# Zero-initialised accumulators for class probabilities: one row per sample,
# one column per label in CLASSES. They are filled in by the CV cell.
pred_proba_oof = pd.DataFrame(0.0, index=train.index, columns=CLASSES)
pred_proba_test = pd.DataFrame(0.0, index=test.index, columns=CLASSES)

In [10]:
# Stratified K-fold training of CatBoost with seed averaging.
#
# For every fold and every random seed a model is trained, saved to disk, and
# its probabilities are accumulated into `pred_proba_oof` (validation rows only)
# and `pred_proba_test` (all test rows). After the loop the sums are turned into
# means: each OOF row receives N_RANDOM_SEEDS predictions, each test row
# receives cfg.N_SPLITS * N_RANDOM_SEEDS predictions.
cv = StratifiedKFold(n_splits=cfg.N_SPLITS, random_state=cfg.RANDOM_STATE, shuffle=True)

test_pool = cb.Pool(
    data=test,
    text_features=cfg.TEXT_COLS,
    # cat_features=cfg.CAT_COLS
)

# The accumulators are created in an earlier cell and updated with `+=` below.
# Reset them here so that re-running this cell alone does not add new
# predictions on top of the (already averaged) results of a previous run.
pred_proba_oof.iloc[:, :] = 0.0
pred_proba_test.iloc[:, :] = 0.0

# The output directory does not depend on fold or seed, so prepare it once.
# (check_path is a project helper; presumably it ensures the directory exists.)
model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
check_path(model_path)

metrics = defaultdict(list)
for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_train, y_train), total=cfg.N_SPLITS)):

    # Labels of this fold; independent of the random seed.
    y_train_oof = y_train.iloc[train_idx]
    y_val_oof = y_train.iloc[val_idx]

    train_pool = cb.Pool(
        data=X_train.iloc[train_idx],
        label=y_train_oof,
        text_features=cfg.TEXT_COLS,
        # cat_features=cfg.CAT_COLS
    )

    val_pool = cb.Pool(
        data=X_train.iloc[val_idx],
        label=y_val_oof,
        text_features=cfg.TEXT_COLS,
        # cat_features=cfg.CAT_COLS
    )

    for random_seed in tqdm(range(N_RANDOM_SEEDS), total=N_RANDOM_SEEDS):

        clf = cb.CatBoostClassifier(
            iterations=1000,
            silent=True,
            depth=7,
            eval_metric='AUC',
            classes_count=len(CLASSES),
            random_seed=random_seed,
            early_stopping_rounds=100,
            # task_type="GPU",
            # devices='0:1',
        )

        # NOTE(review): the validation fold doubles as the early-stopping
        # eval_set, so the validation AUC reported below is likely optimistic.
        clf.fit(train_pool, eval_set=val_pool, plot=True)

        model_name = f'{EXPERIMENT_NAME}_fold_{fold}_rs_{random_seed}.cbm'
        clf.save_model(os.path.join(model_path, model_name))

        pred_proba_oof_val = clf.predict_proba(val_pool)
        pred_proba_oof_train = clf.predict_proba(train_pool)

        # Validation rows are disjoint across folds, so each OOF row is only
        # touched once per seed; test rows are touched by every model.
        pred_proba_oof.iloc[val_idx, :] += pred_proba_oof_val
        pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)

        train_auc = roc_auc_score(y_train_oof, pred_proba_oof_train, multi_class='ovo', labels=CLASSES)
        val_auc = roc_auc_score(y_val_oof, pred_proba_oof_val, multi_class='ovo', labels=CLASSES)
        metrics['train_auc'].append(train_auc)
        metrics['val_auc'].append(val_auc)
        print('train auc', train_auc, 'val auc', val_auc)

        del clf; gc.collect()
    del train_pool, val_pool; gc.collect()

# Convert accumulated sums into per-row mean probabilities.
pred_proba_oof /= N_RANDOM_SEEDS
pred_proba_test /= (cfg.N_SPLITS * N_RANDOM_SEEDS)

  0%|          | 0/5 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

train auc 0.9982851533502672 val auc 0.7632710176111137


  0%|          | 0/1 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

train auc 0.9996490857524087 val auc 0.8192771961021995


  0%|          | 0/1 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

train auc 0.9998892785216827 val auc 0.8394567808329837


  0%|          | 0/1 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

train auc 0.9996949340519905 val auc 0.7789916591809949


  0%|          | 0/1 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Found only 16 unique classes in the data, but have defined 17 classes. Probably something is wrong with data.
Label(s) 12 are not present in the train set. Perhaps, something is wrong with the data.


train auc 0.9999869250969909 val auc 0.8132531429329773


In [11]:
# One-vs-one multiclass ROC AUC of the out-of-fold probabilities against the
# training labels, over the full label set in CLASSES.
oof_auc_score = roc_auc_score(
    y_true=y_train,
    y_score=pred_proba_oof,
    labels=CLASSES,
    multi_class='ovo',
)
print('oof_auc_score', oof_auc_score)

oof_auc_score 0.7513885107879898


In [12]:
# NOTE(review): presumably the OOF AUC of an earlier/reference run, kept for comparison — confirm: 0.7608781986622845

In [13]:
# Score the averaged test predictions on the test rows whose labels are
# available in 'test_leak.pkl'; rows with a missing label are excluded.
# NOTE: pd.read_pickle executes arbitrary code on load; only use trusted files.
leak_test = pd.read_pickle(os.path.join(cfg.DATA_PATH, 'test_leak.pkl'))
notnull = leak_test.notna()
leak_test_auc_score = roc_auc_score(
    y_true=leak_test.loc[notnull],
    y_score=pred_proba_test.loc[notnull],
    labels=CLASSES,
    multi_class='ovo',
)
print('leak_test_auc_score', leak_test_auc_score)


leak_test_auc_score 0.8054817277654729


In [14]:
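# NOTE(review): presumably the leak-test AUC of an earlier/reference run, kept for comparison — confirm: 0.8399045223216896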

In [15]:
# --- Build the submission ----------------------------------------------------
# Start from the sample submission so that row order and columns match it.
submission = pd.read_csv(cfg.SAMPLE_SUBMIT_PATH)
submission = submission.set_index('id')
# The predictions must cover exactly the same ids, in the same order.
assert submission.index.equals(pred_proba_test.index)
# Predicted label = column (class) with the highest averaged probability.
submission[cfg.TARGET] = pred_proba_test.idxmax(axis=1)

# --- Resolve output directories (one sub-folder per experiment family) -------
submission_path = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
pred_proba_oof_path = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
pred_proba_test_path = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)

# --- Persist: submission CSV, then OOF and test probability frames -----------
check_path(submission_path)
submission.to_csv(os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv'))

check_path(pred_proba_oof_path)
pred_proba_oof.to_pickle(os.path.join(pred_proba_oof_path, f'{EXPERIMENT_NAME}.pkl'))

check_path(pred_proba_test_path)
pred_proba_test.to_pickle(os.path.join(pred_proba_test_path, f'{EXPERIMENT_NAME}.pkl'))