In [46]:
import sys
sys.path.insert(0, '..')
import os
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from config import PREPARED_DATA_PATH, UNORDERED_CATEGORIES, ORDERED_CATEGORIES, TEXT_COLS, CITY, POSITION
from helper import _process_pred_labels
from evaluate import get_pred_labels

%matplotlib inline

In [3]:
from sklearn.metrics import f1_score

In [8]:
target = pd.read_pickle('../data/prepared/target.pkl')

In [5]:
oof_pred_proba = pd.read_csv('../oof_predictions/catboost/catboost_pred_proba.csv', index_col='review_id')
oof_pred_labels = pd.read_csv('../oof_predictions/catboost/catboost_pred_labels.csv', index_col='review_id')

In [41]:
test_pred_proba = pd.read_csv('../test_predictions/catboost/catboost_pred_proba.csv', index_col='review_id')
test_pred_labels = pd.read_csv('../test_predictions/catboost/catboost_pred_labels.csv', index_col='review_id')

In [11]:
f1_score(target, oof_pred_labels, average='samples', zero_division=0)

0.20641792306722725

In [15]:
np.where(oof_pred_proba > 0.5, 1, 0)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0]])

In [61]:
max_f1_score = float('-inf')
best_tresh = None
best_treshes = {}

for col in target.columns:
    max_f1_score = float('-inf')
    best_tresh = None
    tresholds = np.unique(np.round(np.unique(oof_pred_proba[col]), 3))
    for tresh in tresholds:
        score = f1_score(target[col], np.where(oof_pred_proba[col] > tresh, 1, 0), zero_division=0)
        if score > max_f1_score:
            max_f1_score = score
            best_tresh = tresh
    best_treshes[col] = best_tresh
    print(col, max_f1_score, best_tresh)
    

0 0.8628199859798399 0.415
1 0.5416500066640011 0.259
2 0.0007051564565888056 0.447
3 0.718539865513929 0.5
4 0.013422818791946308 0.472
5 0.3852691218130312 0.17
6 0.33681622618315915 0.489
7 0.4721407624633431 0.15
8 0.865229689569981 0.5


In [64]:
for col in target.columns:
    oof_pred_labels.loc[:, col] = np.where(oof_pred_proba[col] > best_treshes[col], 1, 0)

In [68]:
f1_score(target, oof_pred_labels, average='samples', zero_division=0)

0.770707771275819

In [36]:
best_tresh

0.483

In [33]:
max_f1_score

0.8256270147024137

In [31]:
max_f1_score

0.8252312812852162

In [48]:
pred_labels1 = get_pred_labels(oof_pred_proba.values)

In [53]:
def expect_f12(y_prob, thres):
    idxs = np.where(y_prob >= thres)[0]
    tp = y_prob[idxs].sum()
    fp = len(idxs) - tp
    idxs = np.where(y_prob < thres)[0]
    fn = y_prob[idxs].sum()
    return 2*tp / (2*tp + fp + fn)

def optimal_threshold2(y_prob):
    y_prob = np.sort(y_prob)[::-1]
    f1s = [expect_f12(y_prob, p) for p in y_prob]
    thres = y_prob[np.argmax(f1s)]
    return thres, f1s

def get_pred_labels2(pred_proba: np.ndarray) -> np.ndarray:
    pred_labels = np.zeros_like(pred_proba, dtype=np.int8)
    for col_idx in range(9):
        thres, _ = optimal_threshold2(pred_proba[:, col_idx])
        pred_labels[:, col_idx] = np.where(pred_proba[:, col_idx] > thres, 1, 0)
    return pred_labels

In [54]:
pred_labels2 = get_pred_labels2(oof_pred_proba.values)

In [49]:
f1_score(target, pred_labels1, average='samples', zero_division=0)

0.40416229874815457

In [55]:
f1_score(target, pred_labels2, average='samples', zero_division=0)

0.40416229874815457

In [None]:
pred_labels1

In [34]:
oof_pred_labels.loc[:, :] = np.where(oof_pred_proba > best_tresh, 1, 0)

In [43]:
test_pred_labels.loc[:, :] = np.where(test_pred_proba > best_tresh, 1, 0)

In [44]:
test_pred_labels['target'] = test_pred_labels.apply(_process_pred_labels, axis=1)

In [45]:
test_pred_labels['target'].to_csv('../submitions/catboost.csv')

In [16]:
f1_score(target, np.where(oof_pred_proba > 0.5, 1, 0), average='samples', zero_division=0)

0.8240231150247661

In [13]:
for col in target.columns:
    print(col, f1_score(target[col], oof_pred_labels[col]))

0 0.7365091696029743
1 0.4496723239791632
2 0.00033802055164954025
3 0.17414818820984315
4 0.004312790829644761
5 0.19599999999999998
6 0.08575273985771967
7 0.1703056768558952
8 0.22868053982329528


In [2]:
train = pd.read_pickle('../data/prepared/train.pkl')
test = pd.read_pickle('../data/prepared/test.pkl')

In [14]:
orig_train = pd.read_csv('../data/original/train.csv')
orig_test = pd.read_csv('../data/original/test.csv')

In [18]:
orig_test[POSITION].head(30)

0                                    Старший специалист
1                                 Менеджер по персоналу
2                                      Тренинг-менеджер
3                              Специалист отдела кадров
4                                       Бизнес-аналитик
5     Ведущий экономист группы диспетчеризации, норм...
6                                     Делопроизводитель
7                                              Оператор
8                                       Оператор склада
9                                                   NaN
10                                    Директор магазина
11                               Специалист по закупкам
12                                        Администратор
13    Директор по продажам и маркетингу автомобильно...
14                 Специалист Отдела поддержки клиентов
15                                        Комплектовщик
16                                  Представитель банка
17                                              

In [4]:
data = pd.concat((train.assign(kind='train'), test.assign(kind='test')))

In [6]:
from sklearn.model_selection import train_test_split

In [13]:
cv = [tuple(train_test_split(data.index, test_size=0.3, random_state=42, stratify=data[CITY]))]

In [19]:
cv[0][1]

Int64Index([185779,  42163,  22664,  87490, 188471, 159130,  98257, 142425,
            121281, 161059,
            ...
             18917,  76601, 185964, 176107,  44017, 103577, 161499,  84484,
            152856, 116961],
           dtype='int64', name='review_id', length=30459)

In [5]:
data[POSITION].value_counts()

ANOTHER                                        20585
NA                                              6885
Продавец-консультант                            4455
Продавец-кассир                                 2970
Менеджер по продажам                            1984
                                               ...  
Директор проекта                                   2
Оператор мкс                                       2
Специалист по продажам недвижимости                2
Специалист по продажам финансовых продуктов        2
Стажер отдела аудита                               2
Name: position, Length: 3217, dtype: int64

In [44]:
ORDERED_CATEGORIES

['salary_rating',
 'team_rating',
 'managment_rating',
 'career_rating',
 'workplace_rating',
 'rest_recovery_rating']

In [46]:
idx = 5
grb = data.groupby(['kind', ORDERED_CATEGORIES[idx]])[ORDERED_CATEGORIES[idx]].count().unstack('kind')
grb['diff'] = grb['train'] - grb['test']
grb['diff_rel'] = grb['diff'] / ((grb['train'] + grb['test'])/2)

grb.sort_values('diff_rel')

kind,test,train,diff,diff_rel
rest_recovery_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,7163,6037,-1126,-0.170606
2,5243,4577,-666,-0.135642
1,12137,11714,-423,-0.03547
5,18706,20305,1599,0.081977
4,7402,8243,841,0.10751
