In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from tqdm.notebook import tqdm
from helper import check_path
from collections import defaultdict

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [3]:
import catboost as cb
import catboost.datasets as cbd
import catboost.utils as cbu

In [4]:
# Experiment identifiers and ensemble size used throughout the notebook.
# EXPERIMENT_FAMILY_NAME: sub-directory name under the models / submission /
#   OOF / test-prediction roots where this experiment's artefacts are written.
EXPERIMENT_FAMILY_NAME = 'catboost'
# EXPERIMENT_NAME: prefix of the saved model files and base name of the
#   submission CSV and prediction pickles.
EXPERIMENT_NAME = 'ext_text_process'
# N_RANDOM_SEEDS: number of models trained per fold, one per seed in
#   range(N_RANDOM_SEEDS); their predictions are averaged.
N_RANDOM_SEEDS = 5

In [5]:
def filter_rare_categories(df: pd.DataFrame, rare_categories=(12,), target_col=None) -> pd.DataFrame:
    """Return the rows of ``df`` whose target value is not a rare category.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the target column.
    rare_categories : iterable, default ``(12,)``
        Target values to drop. The default reproduces the previously
        hard-coded behaviour (class ``12`` is removed).
    target_col : str, optional
        Name of the target column. When ``None`` (default) the project-wide
        ``cfg.TARGET`` is used, matching the original behaviour.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` whose target is not in ``rare_categories``, in the
        original order. ``df`` itself is not modified.
    """
    if target_col is None:
        # Resolved lazily so callers passing an explicit column do not need cfg.
        target_col = cfg.TARGET
    is_rare = df[target_col].isin(list(rare_categories))
    return df.loc[~is_rare]

In [6]:
# Load the original train / test CSVs and use their 'id' column as the index.
train = pd.read_csv(cfg.ORIG_TRAIN_PATH).set_index('id')
test = pd.read_csv(cfg.ORIG_TEST_PATH).set_index('id')

# Sorted list of the distinct target values present in the training data;
# used as the column labels of the probability frames and as `labels` for AUC.
CLASSES = np.sort(train[cfg.TARGET].unique()).tolist()

# Optional rare-class filtering (currently disabled). Note that CLASSES is
# computed above, i.e. before this filter would be applied.
# train = filter_rare_categories(train)

In [7]:
# Separate the target column (labels) from the remaining feature columns.
y_train = train[cfg.TARGET]
X_train = train.drop(columns=cfg.TARGET)

In [8]:
# Zero-initialised accumulators for class probabilities: one row per sample,
# one column per class. Out-of-fold predictions are aligned to the train index,
# test predictions to the test index.
n_classes = len(CLASSES)
oof_shape = (len(train), n_classes)
test_shape = (len(test), n_classes)

pred_proba_oof = pd.DataFrame(
    data=np.zeros(shape=oof_shape),
    index=train.index,
    columns=CLASSES,
)
pred_proba_test = pd.DataFrame(
    data=np.zeros(shape=test_shape),
    index=test.index,
    columns=CLASSES,
)

In [9]:
# CatBoost text-processing configuration: one tokenizer, word n-gram
# dictionaries of order 1-3, letter n-gram dictionaries of order 1-5, and a
# default feature-processing rule that applies every calcer to every dictionary.
_WORD_GRAM_ORDERS = (1, 2, 3)
_LETTER_GRAM_ORDERS = (1, 2, 3, 4, 5)


def _ngram_dictionary(order, level):
    """Build one dictionary spec, e.g. (2, "Word") -> id "2-GramWord"."""
    return {
        "dictionary_id": f"{order}-Gram{level}",
        "token_level_type": level,
        "gram_order": str(order),
    }


# Word-level dictionaries first, then letter-level ones (same order as before).
_ngram_dictionaries = (
    [_ngram_dictionary(order, "Word") for order in _WORD_GRAM_ORDERS]
    + [_ngram_dictionary(order, "Letter") for order in _LETTER_GRAM_ORDERS]
)

text_processing_options = {
    "tokenizers": [
        {
            "tokenizer_id": "Sense",
            "lowercasing": "true",
            "separator_type": "BySense",
        }
    ],
    "dictionaries": _ngram_dictionaries,
    "feature_processing": {
        "default": [
            {
                # Every dictionary defined above is fed to every calcer.
                "dictionaries_names": [spec["dictionary_id"] for spec in _ngram_dictionaries],
                "feature_calcers": ["BoW", "NaiveBayes", "BM25"],
                "tokenizers_names": ["Sense"],
            }
        ]
    },
}

In [10]:
# Stratified K-fold x multi-seed CatBoost training.
# For every fold, N_RANDOM_SEEDS models are trained; each model is saved to
# disk and its predicted probabilities are accumulated into
#   * pred_proba_oof  (rows of the current validation fold only), and
#   * pred_proba_test (all test rows),
# which are turned into averages at the end of the cell.
cv = StratifiedKFold(n_splits=cfg.N_SPLITS, random_state=cfg.RANDOM_STATE, shuffle=True)

# The test pool does not depend on the fold, so it is built once.
test_pool = cb.Pool(
    data=test,
    text_features=cfg.TEXT_COLS,
    cat_features=cfg.CAT_COLS)

# Reset the accumulators so that re-running this cell on its own does not add
# new predictions on top of the (already averaged) results of a previous run.
pred_proba_oof.iloc[:, :] = 0.0
pred_proba_test.iloc[:, :] = 0.0

# The output directory is the same for every model, so resolve it once.
# check_path presumably makes sure the directory exists - TODO confirm in helper.
model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
check_path(model_path)

# Per-model train / validation AUC, in training order (fold-major, seed-minor).
metrics = defaultdict(list)
for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_train, y_train), total=cfg.N_SPLITS)):

    # Pools and labels depend only on the fold, not on the seed.
    y_train_oof = y_train.iloc[train_idx]
    y_val_oof = y_train.iloc[val_idx]

    train_pool = cb.Pool(
        data=X_train.iloc[train_idx],
        label=y_train_oof,
        text_features=cfg.TEXT_COLS,
        cat_features=cfg.CAT_COLS)

    val_pool = cb.Pool(
        data=X_train.iloc[val_idx],
        label=y_val_oof,
        text_features=cfg.TEXT_COLS,
        cat_features=cfg.CAT_COLS)

    for random_seed in tqdm(range(N_RANDOM_SEEDS), total=N_RANDOM_SEEDS):

        # NOTE(review): classes_count=len(CLASSES) assumes the labels are
        # integers in [0, len(CLASSES)) so that predict_proba returns exactly
        # one column per entry of CLASSES - confirm against the data.
        clf = cb.CatBoostClassifier(
            iterations=2000,
            silent=True,
            classes_count=len(CLASSES),
            text_processing=text_processing_options,
            random_seed=random_seed,
            early_stopping_rounds=150
        )

        # NOTE(review): the validation fold is also the eval_set that drives
        # early stopping, so the validation / OOF AUC may be optimistic.
        clf.fit(train_pool, eval_set=val_pool, plot=False)

        model_name = f'{EXPERIMENT_NAME}_fold_{fold}_rs_{random_seed}.cbm'
        clf.save_model(os.path.join(model_path, model_name))

        pred_proba_oof_val = clf.predict_proba(val_pool)
        pred_proba_oof_train = clf.predict_proba(train_pool)

        # Accumulate: validation rows get one contribution per seed,
        # test rows get one contribution per (fold, seed) model.
        pred_proba_oof.iloc[val_idx, :] += pred_proba_oof_val
        pred_proba_test.iloc[:, :] += clf.predict_proba(test_pool)

        train_auc = roc_auc_score(y_train_oof, pred_proba_oof_train, multi_class='ovo', labels=CLASSES)
        val_auc = roc_auc_score(y_val_oof, pred_proba_oof_val, multi_class='ovo', labels=CLASSES)
        metrics['train_auc'].append(train_auc)
        metrics['val_auc'].append(val_auc)
        print('train auc', train_auc, 'val auc', val_auc)

        # Free the model before training the next one.
        del clf
        gc.collect()

    # Free the fold-specific pools before building the next fold's pools.
    del train_pool, val_pool
    gc.collect()

# Every training row is in exactly one validation fold, so it received
# N_RANDOM_SEEDS predictions; every test row received one per trained model.
pred_proba_oof /= N_RANDOM_SEEDS
pred_proba_test /= (cfg.N_SPLITS * N_RANDOM_SEEDS)

  0%|          | 0/5 [00:00<?, ?it/s]



  0%|          | 0/5 [00:00<?, ?it/s]

train auc 0.9996527777777778 val auc 0.9770673279317476
train auc 0.9981617647058824 val auc 0.9805492361284247
train auc 0.9999781162464985 val auc 0.9777038572086564
train auc 0.9999562324929973 val auc 0.9856837724884191
train auc 1.0 val auc 0.993071233150858


  0%|          | 0/5 [00:00<?, ?it/s]

train auc 1.0 val auc 0.9687887995512341
train auc 1.0 val auc 0.9635182934430316
train auc 0.9991830065359477 val auc 0.9807051182165711
train auc 0.9993872549019609 val auc 0.9753664921465968


In [None]:
# Overall out-of-fold AUC (one-vs-one) on the seed-averaged OOF probabilities,
# followed by the per-model validation AUCs with their mean and std.
val_auc_scores = metrics['val_auc']
oof_auc_score = roc_auc_score(
    y_train,
    pred_proba_oof,
    multi_class='ovo',
    labels=CLASSES,
)
print('oof_auc_score', oof_auc_score)
print('scores', val_auc_scores, np.mean(val_auc_scores), np.std(val_auc_scores))

In [None]:
# --- Submission file -------------------------------------------------------
# Start from the sample submission (indexed by 'id'), check that it is aligned
# with the test predictions, and fill in the most probable class per row.
submission = pd.read_csv(cfg.SAMPLE_SUBMIT_PATH).set_index('id')
assert submission.index.equals(pred_proba_test.index)
submission[cfg.TARGET] = pred_proba_test.idxmax(axis=1)

submission_path = os.path.join(cfg.SUBMISSION_PATH, EXPERIMENT_FAMILY_NAME)
check_path(submission_path)
submission_file = os.path.join(submission_path, f'{EXPERIMENT_NAME}.csv')
submission.to_csv(submission_file)

# --- Raw probabilities (pickled DataFrames) --------------------------------
pickle_name = f'{EXPERIMENT_NAME}.pkl'

# Averaged out-of-fold probabilities for the training rows.
pred_proba_oof_path = os.path.join(cfg.OOF_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_oof_path)
pred_proba_oof.to_pickle(os.path.join(pred_proba_oof_path, pickle_name))

# Averaged probabilities for the test rows.
pred_proba_test_path = os.path.join(cfg.TEST_PRED_PATH, EXPERIMENT_FAMILY_NAME)
check_path(pred_proba_test_path)
pred_proba_test.to_pickle(os.path.join(pred_proba_test_path, pickle_name))