In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/3b/bb419654adcf7efff42ed8a3f84e50c8f236424b7ed1cc8ccd290852e003/catboost-0.24.4-cp37-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.7MB 97kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [None]:
import io
import os
import gc
import re
import random
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import  accuracy_score

from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool

In [None]:
def save_pkl(dir, name, obj):
    """Pickle ``obj`` to the file ``dir / name``, creating ``dir`` if needed.

    Args:
        dir: Target directory, given as a ``pathlib.Path`` or a string path.
        name: File name to write inside ``dir``.
        obj: Object to serialise with ``pickle.dump``.
    """
    out_dir = Path(dir)  # accept plain strings as well as Path objects
    # parents=True so a nested output directory is created in one call instead
    # of failing with FileNotFoundError when its parent does not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / name, 'wb') as f:
        pickle.dump(obj, f)

def load_pkl(dir, name):
    """Load and return the object pickled at ``dir / name``.

    Args:
        dir: Directory containing the file, as a ``pathlib.Path`` or a string path.
        name: File name inside ``dir``.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(Path(dir) / name, 'rb') as f:
        return pickle.load(f)

def set_seed(seed=42):
    """Export PYTHONHASHSEED and seed the ``random`` and NumPy global generators.

    Args:
        seed: Value used for the environment variable and both generators
            (defaults to 42).
    """
    # The environment variable is written as a string; Python reads
    # PYTHONHASHSEED at interpreter start-up, so this matters for processes
    # launched afterwards rather than for the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [None]:
# Apply the default seed (42) to the global `random` and NumPy generators.
set_seed()

# Load train and test data

In [None]:
# Read the train and test CSVs from the input directory next to this notebook's folder.
train_df = pd.read_csv("../input/Train.csv")
test_df = pd.read_csv("../input/Test.csv")

In [None]:
# Number the distinct labels in order of first appearance in the training data,
# and keep the inverse mapping for turning predicted ids back into labels.
unique_labels = train_df['label'].unique()
LABEL2ID = dict(zip(unique_labels, range(len(unique_labels))))
ID2LABEL = dict(enumerate(unique_labels))

# Integer-encoded target column used for stratification and training.
train_df['label_ids'] = train_df['label'].map(LABEL2ID)

In [None]:
# Display the training frame, which now includes the label_ids column.
train_df

Unnamed: 0,ID,text,label,label_ids
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1,0
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1,0
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1,0
3,U0TTYY8,ak slouma,1,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1,0
...,...,...,...,...
69995,ZRSR7TZ,pff bayna beli kbira f wejhakk yakhiii rouhi r...,-1,0
69996,QNQVEIH,aman lmara jeya zidou t3am9ou fel a7deeth akth...,-1,0
69997,LJ2K9MD,winha nakhtabha hhhhh,-1,0
69998,5RZ1T7I,fachel enta w houwa,-1,0


In [None]:
# Encoded targets as a NumPy array; the training loop indexes it with the fold index arrays.
train_targets = train_df['label_ids'].values

In [None]:
# Single-column frames holding the raw text that is handed to CatBoost as a text feature.
train_text = train_df[['text']]
# Fixed: this used to slice train_df, so test_text was a copy of the training text.
test_text = test_df[['text']]

In [None]:
# Text column of the test set; this frame is passed to predict_proba in every fold.
test_features = test_df[['text']]

In [None]:
# Materialise the five stratified (train_idx, val_idx) splits once so they can be looked up by fold number.
# No shuffle or random_state is passed, so the splitter runs with its defaults.
cv = list(StratifiedKFold(n_splits=5).split(train_df, train_df['label_ids']))

In [None]:
# Keyword arguments unpacked into CatBoostClassifier for every fold.
catboost_params = dict(
    iterations=1000,            # upper bound on boosting rounds
    learning_rate=0.1,
    loss_function='MultiClass',
    eval_metric='Accuracy',     # metric reported on the eval set in the training log
    task_type='GPU',
    early_stopping_rounds=50,
    use_best_model=True,        # the log shows the model being shrunk to the best iteration
    verbose=50,                 # the log prints one line every 50 iterations
)

In [None]:
%%time
val_scores = []
val_preds = np.zeros((len(train_df), len(ID2LABEL)), dtype="float32")
test_preds = np.zeros((5, len(test_df), len(ID2LABEL)), dtype="float32")

for fold in range(5):
    print('='*30)
    print(f'======fold: {fold} start======')

    trn_idx, val_idx = cv[fold]
    trn_features, val_features = train_text.loc[trn_idx], train_text.loc[val_idx]
    trn_targets, val_targets = train_targets[trn_idx], train_targets[val_idx]

    train_pool = Pool(
        trn_features, 
        trn_targets, 
        text_features=['text'],
    )
    valid_pool = Pool(
        val_features, 
        val_targets, 
        text_features=['text'],
    )

    model = CatBoostClassifier(**catboost_params)
    model.fit(train_pool, eval_set=valid_pool)
    val_pred = model.predict(val_features)
    score = accuracy_score(val_targets, val_pred)

    print(f"score {score:.4f}")

    val_preds[val_idx] = model.predict_proba(val_features)
    test_preds[fold] = model.predict_proba(test_features)

0:	learn: 0.7601250	test: 0.7763571	best: 0.7763571 (0)	total: 17.6ms	remaining: 17.6s
50:	learn: 0.7733750	test: 0.7858571	best: 0.7858571 (50)	total: 672ms	remaining: 12.5s
100:	learn: 0.7758214	test: 0.7870714	best: 0.7872143 (73)	total: 1.26s	remaining: 11.3s
bestTest = 0.7872142857
bestIteration = 73
Shrink model to first 74 iterations.
score 0.7872
0:	learn: 0.7655536	test: 0.7772857	best: 0.7772857 (0)	total: 16ms	remaining: 16s
50:	learn: 0.7744286	test: 0.7855000	best: 0.7855000 (49)	total: 626ms	remaining: 11.6s
100:	learn: 0.7765357	test: 0.7863571	best: 0.7863571 (100)	total: 1.19s	remaining: 10.5s
150:	learn: 0.7770714	test: 0.7867143	best: 0.7871429 (147)	total: 1.75s	remaining: 9.85s
200:	learn: 0.7786250	test: 0.7872857	best: 0.7872857 (200)	total: 2.31s	remaining: 9.16s
250:	learn: 0.7793929	test: 0.7862143	best: 0.7872857 (200)	total: 2.86s	remaining: 8.54s
bestTest = 0.7872857143
bestIteration = 200
Shrink model to first 201 iterations.
score 0.7873
0:	learn: 0.76712

In [None]:
# Accuracy over all out-of-fold predictions: each row's class is the argmax of the
# probabilities written into val_preds by the fold that held that row out.
print(f"all oof score {accuracy_score(train_targets, np.argmax(val_preds, axis=-1)):.4f}")

all oof score 0.7893


In [None]:
# Persist the out-of-fold probabilities and the per-fold test probabilities under ../output.
save_pkl(Path("../output"), "94-catboost-val_preds.pkl", val_preds)
save_pkl(Path("../output"), "94-catboost-test_preds.pkl", test_preds)