In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from tqdm.notebook import tqdm
from helper import check_path, seed_everything
from collections import defaultdict

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [3]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

In [4]:
# Experiment identity: the family name is used as a sub-directory for saved models,
# submissions and predictions; the experiment name is used in the artifact file names.
EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME = 'catboost', 'ext_text_process'

# Number of differently seeded CatBoost models handled per CV fold.
N_RANDOM_SEEDS = 1

In [5]:
# Notebook-level seed handed to the project helper `seed_everything`.
# NOTE(review): the StratifiedKFold splitter further down is seeded from
# cfg.RANDOM_STATE, not from this value - confirm that this is intentional.
RANDOM_STATE = 77
seed_everything(RANDOM_STATE)

In [6]:
def filter_rare_categories(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows whose target class occurs too rarely.

    A class is considered rare when it has fewer than ``cfg.N_SPLITS + 3`` rows
    in ``df``. Rows whose target is missing are never counted as rare and are
    therefore kept.

    Args:
        df: Frame containing the ``cfg.TARGET`` column.

    Returns:
        The subset of ``df`` (original order and index) without rare classes.
    """
    min_rows_per_class = cfg.N_SPLITS + 3
    target = df[cfg.TARGET]
    class_sizes = target.value_counts()
    rare_labels = class_sizes[class_sizes < min_rows_per_class].index
    is_rare = target.isin(rare_labels)
    return df.loc[~is_rare]

In [7]:
# Load the preprocessed train/test frames from cfg.PREPROCESSED_DATA_PATH.
train, test = (
    pd.read_pickle(os.path.join(cfg.PREPROCESSED_DATA_PATH, f'{split}.pkl'))
    for split in ('train', 'test')
)

# Sorted list of the distinct target values found in train; used as the column
# order of the prediction frames and as `labels=` for the AUC computations.
CLASSES = np.sort(train[cfg.TARGET].unique()).tolist()

# Rare-class filtering is currently disabled:
# train = filter_rare_categories(train)

In [8]:
# Name of the sub-directory of cfg.DATA_PATH holding the precomputed embedding frames.
EMB_NAME = 'smaller_LaBSE_15lang'

# Index-aligned join of the embedding columns onto train and test. The embedding
# frames are passed straight into `join`, so no extra references are left to delete.
train = train.join(pd.read_pickle(os.path.join(cfg.DATA_PATH, EMB_NAME, 'train.pkl')))
test = test.join(pd.read_pickle(os.path.join(cfg.DATA_PATH, EMB_NAME, 'test.pkl')))

gc.collect()

0

In [9]:
# Separate the target column from the feature columns.
y_train = train[cfg.TARGET]
X_train = train.drop(columns=cfg.TARGET)

In [10]:
def _zero_proba_frame(index: pd.Index) -> pd.DataFrame:
    """Return an all-zero float frame with one row per `index` entry and one column per class."""
    return pd.DataFrame(
        data=np.zeros(shape=(len(index), len(CLASSES))),
        index=index,
        columns=CLASSES,
    )


# Accumulators for class probabilities: out-of-fold (train rows) and test rows.
pred_proba_oof = _zero_proba_frame(train.index)
pred_proba_test = _zero_proba_frame(test.index)

In [11]:
# Text-processing configuration (tokenizers / dictionaries / feature_processing).
# In this notebook it is only referenced from a commented-out `text_processing=`
# argument of the classifier.

# (token level, highest n-gram order): word 1..3-grams and letter 1..5-grams.
_NGRAM_LEVELS = (("Word", 3), ("Letter", 5))

_ngram_dictionaries = [
    {
        "dictionary_id": f"{order}-Gram{level}",
        "token_level_type": level,
        "gram_order": str(order),
    }
    for level, max_order in _NGRAM_LEVELS
    for order in range(1, max_order + 1)
]

text_processing_options = {
    "tokenizers": [{
        "tokenizer_id": "Sense",
        "lowercasing": "true",
        "separator_type": "BySense",
    }],

    "dictionaries": _ngram_dictionaries,

    "feature_processing": {
        "default": [{
            # Every dictionary defined above is applied, in definition order.
            "dictionaries_names": [spec["dictionary_id"] for spec in _ngram_dictionaries],
            "feature_calcers": ["BoW", "NaiveBayes", "BM25"],
            "tokenizers_names": ["Sense"],
        }]
    },
}

In [12]:
# Labels available for part of the test set, loaded from cfg.DATA_PATH.
leak_test = pd.read_pickle(os.path.join(cfg.DATA_PATH, 'test_leak.pkl'))

# True where a label is present; used to select the rows scored against test predictions.
leak_mask = ~leak_test.isnull()

In [13]:
# Cross-validated CatBoost training / inference.
#
# For every stratified fold and every random seed a CatBoost model is obtained
# (loaded from disk when a saved copy exists, trained and saved otherwise), then:
#   * its validation probabilities are added to `pred_proba_oof`,
#   * its test probabilities are added to `pred_proba_test`,
#   * train / validation / leak-test one-vs-one AUC are recorded in `metrics`.
# After the loop both accumulators are divided by the number of models that
# contributed to each row, turning the sums into averages.

# Fold assignment is seeded from cfg.RANDOM_STATE (not the notebook-level RANDOM_STATE).
cv = StratifiedKFold(n_splits=cfg.N_SPLITS, random_state=cfg.RANDOM_STATE, shuffle=True)

# The test pool does not depend on the fold, so it is built once.
test_pool = cb.Pool(
    data=test,
    text_features=cfg.TEXT_COLS,
    # cat_features=cfg.CAT_COLS
)

# Set to True to refit and overwrite the saved models. With False, a saved model
# is loaded when its file exists and only missing models are trained.
FORCE_RETRAIN = False

n_classes = len(CLASSES)

# All models of this experiment live in one directory; create it once up front.
model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
check_path(model_path)

# Reset the accumulators so that re-running this cell does not double-count.
pred_proba_oof.iloc[:, :] = 0.0
pred_proba_test.iloc[:, :] = 0.0

metrics = defaultdict(list)
for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_train, y_train), total=cfg.N_SPLITS)):

    train_pool = cb.Pool(
        data=X_train.iloc[train_idx],
        label=y_train.iloc[train_idx],
        text_features=cfg.TEXT_COLS,
        # cat_features=cfg.CAT_COLS
    )

    val_pool = cb.Pool(
        data=X_train.iloc[val_idx],
        label=y_train.iloc[val_idx],
        text_features=cfg.TEXT_COLS,
        # cat_features=cfg.CAT_COLS
    )

    y_train_oof = y_train.iloc[train_idx]
    y_val_oof = y_train.iloc[val_idx]

    for random_seed in tqdm(range(N_RANDOM_SEEDS), total=N_RANDOM_SEEDS):

        clf = cb.CatBoostClassifier(
            iterations=2000,
            silent=True,
            classes_count=n_classes,
            eval_metric='AUC',
            # text_processing=text_processing_options,
            random_seed=random_seed,
            early_stopping_rounds=150,
            # task_type="GPU",
            # devices='0:1',
        )

        model_name = f'{EXPERIMENT_NAME}_fold_{fold}_rs_{random_seed}.cbm'
        model_file = os.path.join(model_path, model_name)
        if FORCE_RETRAIN or not os.path.exists(model_file):
            clf.fit(train_pool, eval_set=val_pool, plot=False)
            clf.save_model(model_file)
        else:
            clf.load_model(model_file)

        # Keep only the first `n_classes` probability columns (as the original code did).
        pred_proba_oof_val = clf.predict_proba(val_pool)[:, :n_classes]
        pred_proba_oof_train = clf.predict_proba(train_pool)[:, :n_classes]

        pred_proba_oof.iloc[val_idx, :] += pred_proba_oof_val
        pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)[:, :n_classes]

        train_auc = roc_auc_score(y_train_oof, pred_proba_oof_train, multi_class='ovo', labels=CLASSES)
        val_auc = roc_auc_score(y_val_oof, pred_proba_oof_val, multi_class='ovo', labels=CLASSES)
        metrics['train_auc'].append(train_auc)
        metrics['val_auc'].append(val_auc)
        print('train auc', train_auc, 'val auc', val_auc)

        # Running average of the test predictions. The accumulator holds one
        # contribution per (fold, seed) model seen so far, so that count - not
        # just the number of folds - is the divisor. Multiclass roc_auc_score
        # requires every row of scores to sum to 1, which only holds with the
        # correct divisor when N_RANDOM_SEEDS > 1.
        n_models_so_far = fold * N_RANDOM_SEEDS + random_seed + 1
        leak_test_auc_score = roc_auc_score(
            leak_test.loc[leak_mask],
            pred_proba_test.loc[leak_mask] / n_models_so_far,
            multi_class='ovo',
            labels=CLASSES,
        )
        metrics['leak_test_auc_score'].append(leak_test_auc_score)
        print('leak_test_auc_score', leak_test_auc_score)

        del clf; gc.collect()
    del train_pool, val_pool; gc.collect()

# Each train row is a validation row in exactly one fold, so it received one
# prediction per seed; each test row received one per (fold, seed) model.
pred_proba_oof /= N_RANDOM_SEEDS
pred_proba_test /= (cfg.N_SPLITS * N_RANDOM_SEEDS)

  0%|          | 0/5 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

There are invalid params and some of them will be ignored.
Parameter {"train_final_model":true} is ignored, because it cannot be parsed.
Parameter {"num_features_to_select":500} is ignored, because it cannot be parsed.
Parameter {"features_for_select":"Текст Сообщения,smaller_LaBSE_15lang_0,smaller_LaBSE_15lang_1,smaller_LaBSE_15lang_2,smaller_LaBSE_15lang_3,smaller_LaBSE_15lang_4,smaller_LaBSE_15lang_5,smaller_LaBSE_15lang_6,smaller_LaBSE_15lang_7,smaller_LaBSE_15lang_8,smaller_LaBSE_15lang_9,smaller_LaBSE_15lang_10,smaller_LaBSE_15lang_11,smaller_LaBSE_15lang_12,smaller_LaBSE_15lang_13,smaller_LaBSE_15lang_14,smaller_LaBSE_15lang_15,smaller_LaBSE_15lang_16,smaller_LaBSE_15lang_17,smaller_LaBSE_15lang_18,smaller_LaBSE_15lang_19,smaller_LaBSE_15lang_20,smaller_LaBSE_15lang_21,smaller_LaBSE_15lang_22,smaller_LaBSE_15lang_23,smaller_LaBSE_15lang_24,smaller_LaBSE_15lang_25,smaller_LaBSE_15lang_26,smaller_LaBSE_15lang_27,smaller_LaBSE_15lang_28,smaller_LaBSE_15lang_29,smaller_LaBSE_15lang_

train auc 0.9999998234079165 val auc 0.8417047000047655
leak_test_auc_score 0.8268383029189277


  0%|          | 0/1 [00:00<?, ?it/s]

There are invalid params and some of them will be ignored.
Parameter {"train_final_model":true} is ignored, because it cannot be parsed.
Parameter {"num_features_to_select":500} is ignored, because it cannot be parsed.
Parameter {"features_for_select":"Текст Сообщения,smaller_LaBSE_15lang_0,smaller_LaBSE_15lang_1,smaller_LaBSE_15lang_2,smaller_LaBSE_15lang_3,smaller_LaBSE_15lang_4,smaller_LaBSE_15lang_5,smaller_LaBSE_15lang_6,smaller_LaBSE_15lang_7,smaller_LaBSE_15lang_8,smaller_LaBSE_15lang_9,smaller_LaBSE_15lang_10,smaller_LaBSE_15lang_11,smaller_LaBSE_15lang_12,smaller_LaBSE_15lang_13,smaller_LaBSE_15lang_14,smaller_LaBSE_15lang_15,smaller_LaBSE_15lang_16,smaller_LaBSE_15lang_17,smaller_LaBSE_15lang_18,smaller_LaBSE_15lang_19,smaller_LaBSE_15lang_20,smaller_LaBSE_15lang_21,smaller_LaBSE_15lang_22,smaller_LaBSE_15lang_23,smaller_LaBSE_15lang_24,smaller_LaBSE_15lang_25,smaller_LaBSE_15lang_26,smaller_LaBSE_15lang_27,smaller_LaBSE_15lang_28,smaller_LaBSE_15lang_29,smaller_LaBSE_15lang_

train auc 0.9999972583706844 val auc 0.9135079909833286
leak_test_auc_score 0.8338649320737411


  0%|          | 0/1 [00:00<?, ?it/s]

There are invalid params and some of them will be ignored.
Parameter {"train_final_model":true} is ignored, because it cannot be parsed.
Parameter {"num_features_to_select":500} is ignored, because it cannot be parsed.
Parameter {"features_for_select":"Текст Сообщения,smaller_LaBSE_15lang_0,smaller_LaBSE_15lang_1,smaller_LaBSE_15lang_2,smaller_LaBSE_15lang_3,smaller_LaBSE_15lang_4,smaller_LaBSE_15lang_5,smaller_LaBSE_15lang_6,smaller_LaBSE_15lang_7,smaller_LaBSE_15lang_8,smaller_LaBSE_15lang_9,smaller_LaBSE_15lang_10,smaller_LaBSE_15lang_11,smaller_LaBSE_15lang_12,smaller_LaBSE_15lang_13,smaller_LaBSE_15lang_14,smaller_LaBSE_15lang_15,smaller_LaBSE_15lang_16,smaller_LaBSE_15lang_17,smaller_LaBSE_15lang_18,smaller_LaBSE_15lang_19,smaller_LaBSE_15lang_20,smaller_LaBSE_15lang_21,smaller_LaBSE_15lang_22,smaller_LaBSE_15lang_23,smaller_LaBSE_15lang_24,smaller_LaBSE_15lang_25,smaller_LaBSE_15lang_26,smaller_LaBSE_15lang_27,smaller_LaBSE_15lang_28,smaller_LaBSE_15lang_29,smaller_LaBSE_15lang_

train auc 0.9975638198522243 val auc 0.8362607223161956
leak_test_auc_score 0.8274280228413495


  0%|          | 0/1 [00:00<?, ?it/s]

There are invalid params and some of them will be ignored.
Parameter {"train_final_model":true} is ignored, because it cannot be parsed.
Parameter {"num_features_to_select":500} is ignored, because it cannot be parsed.
Parameter {"features_for_select":"Текст Сообщения,smaller_LaBSE_15lang_0,smaller_LaBSE_15lang_1,smaller_LaBSE_15lang_2,smaller_LaBSE_15lang_3,smaller_LaBSE_15lang_4,smaller_LaBSE_15lang_5,smaller_LaBSE_15lang_6,smaller_LaBSE_15lang_7,smaller_LaBSE_15lang_8,smaller_LaBSE_15lang_9,smaller_LaBSE_15lang_10,smaller_LaBSE_15lang_11,smaller_LaBSE_15lang_12,smaller_LaBSE_15lang_13,smaller_LaBSE_15lang_14,smaller_LaBSE_15lang_15,smaller_LaBSE_15lang_16,smaller_LaBSE_15lang_17,smaller_LaBSE_15lang_18,smaller_LaBSE_15lang_19,smaller_LaBSE_15lang_20,smaller_LaBSE_15lang_21,smaller_LaBSE_15lang_22,smaller_LaBSE_15lang_23,smaller_LaBSE_15lang_24,smaller_LaBSE_15lang_25,smaller_LaBSE_15lang_26,smaller_LaBSE_15lang_27,smaller_LaBSE_15lang_28,smaller_LaBSE_15lang_29,smaller_LaBSE_15lang_

train auc 0.9996004319969446 val auc 0.802778276627054
leak_test_auc_score 0.8282549465979531


  0%|          | 0/1 [00:00<?, ?it/s]

train auc 0.8443144569995104 val auc 0.7540541619425549
leak_test_auc_score 0.83739775427416


In [14]:
# One-vs-one ROC AUC over the complete out-of-fold probability matrix.
oof_auc_score = roc_auc_score(
    y_train,
    pred_proba_oof,
    labels=CLASSES,
    multi_class='ovo',
)
print('oof_auc_score', oof_auc_score)

# Validation AUC values recorded by the training loop, plus their mean and spread.
val_auc_scores = metrics['val_auc']
print('scores', val_auc_scores)
print('mean', np.mean(val_auc_scores), 'std', np.std(val_auc_scores))

oof_auc_score 0.7734803598034716
scores [0.8417047000047655, 0.9135079909833286, 0.8362607223161956, 0.802778276627054, 0.7540541619425549]
mean 0.8296611703747796 std 0.052264710308696796


In [15]:
# oof_auc_score 0.8803946552408488
# scores [0.9021272334648198, 0.9123958224097881, 0.8907513325956924, 0.8669408784884531, 0.845440027286832] 
# 0.8835310588491172 0.024319890660136585

In [16]:
# Leak-test AUC values recorded inside the training loop, plus their mean and spread.
leak_auc_history = metrics['leak_test_auc_score']
print('scores', leak_auc_history)
print('mean', np.mean(leak_auc_history), 'std', np.std(leak_auc_history))

# Same metric for the final averaged test predictions, restricted to the rows
# selected by leak_mask.
leak_labels = leak_test.loc[leak_mask]
leak_scores = pred_proba_test.loc[leak_mask]
leak_test_auc_score = roc_auc_score(leak_labels, leak_scores, labels=CLASSES, multi_class='ovo')
print('leak_test_auc_score', leak_test_auc_score)

scores [0.8268383029189277, 0.8338649320737411, 0.8274280228413495, 0.8282549465979531, 0.83739775427416]
mean 0.8307567917412262 std 0.004158302678993438
leak_test_auc_score 0.83739775427416


In [17]:
# leak_test_auc_score 0.8253548018923035

In [18]:
# --- Submission file -------------------------------------------------------
# Start from the sample submission so that the id order matches what is expected.
submission = pd.read_csv(cfg.SAMPLE_SUBMIT_PATH).set_index('id')
assert submission.index.equals(pred_proba_test.index)

# The predicted class is the column with the highest averaged probability.
submission[cfg.TARGET] = pred_proba_test.idxmax(axis=1)

# One output directory per artifact type, each namespaced by the experiment family.
submission_path, pred_proba_oof_path, pred_proba_test_path = (
    os.path.join(root_dir, EXPERIMENT_FAMILY_NAME)
    for root_dir in (cfg.SUBMISSION_PATH, cfg.OOF_PRED_PATH, cfg.TEST_PRED_PATH)
)

check_path(submission_path)
submission.to_csv(os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv'))

# --- Raw probabilities (out-of-fold and test) --------------------------------
for proba_dir, proba_frame in (
    (pred_proba_oof_path, pred_proba_oof),
    (pred_proba_test_path, pred_proba_test),
):
    check_path(proba_dir)
    proba_frame.to_pickle(os.path.join(proba_dir, f'{EXPERIMENT_NAME}.pkl'))