In [43]:
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import nn
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import biGRU_model 
import gbm_model
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from sklearn.metrics import roc_auc_score, f1_score

# Set up pandarallel's worker pool. Nothing in this notebook calls the parallel
# API directly; presumably gbm_model.add_features relies on it -- TODO confirm.
pandarallel.initialize()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Попробуем объединить модели градиентного бустинга и рекуррентной нейронной сети в ансамбль и посмотрим на качество

In [44]:
# Gradient-boosting half of the ensemble: restore the trained CatBoost model.
gbm = CatBoostClassifier()
gbm.load_model('models/catboost_1')

# Vocabulary consumed by the recurrent model, loaded from the saved dictionary.
vocab = SimpleVocabulary(save_path="./models/vocab.dict")

# Recurrent half of the ensemble. 'models/biGRU' holds a state dict, so the
# network is rebuilt first and the weights are loaded into it afterwards.
# The checkpoint is read exactly once and mapped to CPU: a bare torch.load()
# would try to restore tensors on the device they were saved from, which
# fails on a CPU-only machine and otherwise just wastes memory.
device = torch.device('cpu')
# NOTE(review): embedding_dim / hidden_size must match the values used at
# training time, otherwise load_state_dict raises on a shape mismatch.
gru = biGRU_model.BiGRU(vocab.count, embedding_dim=10, hidden_size=50, device='cpu')
# Inference only: switch off training-time behaviour such as dropout.
# Harmless if biGRU_model.predict_proba already does this itself.
gru.eval()
# Kept as the last expression so the cell still shows the key-matching report.
gru.load_state_dict(torch.load('models/biGRU', map_location=device))


2020-09-27 16:00:36.279 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 115: [loading vocabulary from /rapids/notebooks/my_data/BMSTU_hack/models/vocab.dict]


<All keys matched successfully>

In [45]:
# Ask PyTorch to release any cached, currently unused GPU memory.
# NOTE(review): everything below runs with device='cpu', so this is probably
# a leftover from a GPU session -- confirm it is still needed.
torch.cuda.empty_cache()

In [46]:
# Data used to fit the meta-model: split off the target, then prepare one
# input per base model from the remaining columns.
train_df = pd.read_csv('data/level0.csv')
train_labels = train_df.pop('label')  # removes the column from train_df and returns it

# The GRU consumes the raw rows as a NumPy array.
data_train_rnn = train_df.to_numpy()
# CatBoost consumes engineered features; add_features gets a copy so train_df
# itself is left untouched.
data_train_gbm = gbm_model.add_features(train_df.copy(), is_fitted=True)

loading existing models...


In [47]:
# Held-out test data: split off the target, then prepare one input per base
# model from the remaining columns.
test_df = pd.read_csv('data/test.csv')
test_labels = test_df.pop('label')  # removes the column from test_df and returns it

# The GRU consumes the raw rows as a NumPy array.
data_test_rnn = test_df.to_numpy()
# CatBoost consumes engineered features; add_features gets a copy so test_df
# itself is left untouched.
data_test_gbm = gbm_model.add_features(test_df.copy(), is_fitted=True)

loading existing models...


In [48]:
# Meta-features for fitting the logistic regression: one score per base model
# for every training row.
rnn_train_proba = biGRU_model.predict_proba(
    gru, vocab, data_train_rnn, device='cpu'
).flatten()
# Column 1 of CatBoost's predict_proba output is taken as the GBM score.
gbm_train_proba = gbm.predict_proba(data_train_gbm)[:, 1].flatten()

# Two-column matrix: column 0 = CatBoost score, column 1 = GRU score.
data_logreg_train = np.column_stack([gbm_train_proba, rnn_train_proba])

In [49]:
# Meta-features for evaluating the ensemble: one score per base model for
# every test row.
rnn_test_proba = biGRU_model.predict_proba(
    gru, vocab, data_test_rnn, device='cpu'
).flatten()
# Column 1 of CatBoost's predict_proba output is taken as the GBM score.
gbm_test_proba = gbm.predict_proba(data_test_gbm)[:, 1].flatten()

# Two-column matrix: column 0 = CatBoost score, column 1 = GRU score.
data_logreg_test = np.column_stack([gbm_test_proba, rnn_test_proba])

### Качество моделей по отдельности

In [55]:
# Stand-alone quality of each base model on the test set.
print('catboost roc-auc', roc_auc_score(test_labels, gbm_test_proba))
print('GRU roc-auc \t', roc_auc_score(test_labels, rnn_test_proba))

# F1 for each base model at the same 0.5 cut-off that the ensemble cell uses,
# so the F1 comparison in the conclusion has numbers to compare against.
# np.asarray guards against predict_proba returning a non-NumPy array-like.
gbm_test_labels = (np.asarray(gbm_test_proba) > 0.5).astype(int)
rnn_test_labels = (np.asarray(rnn_test_proba) > 0.5).astype(int)
print('catboost f1 \t', f1_score(test_labels, gbm_test_labels))
print('GRU f1 \t\t', f1_score(test_labels, rnn_test_labels))

catboost roc-auc 0.9641781672117112
GRU roc-auc 	 0.9905030433779515


In [59]:
# Meta-model of the stack: logistic regression over the two base-model scores.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(random_state=42).fit(data_logreg_train, train_labels)
# Echo the fitted estimator as the cell output.
clf

LogisticRegression(random_state=42)

In [74]:
# Score the stacked model on the test meta-features; column 1 of predict_proba
# corresponds to the second entry of clf.classes_.
logreg_proba = clf.predict_proba(data_logreg_test)[:, 1]
# Hard labels for F1: strictly above 0.5 -> 1, otherwise 0.
logreg_labels = [int(score > 0.5) for score in logreg_proba]
print('ensemble roc-auc ', roc_auc_score(test_labels, logreg_proba))
print('ensemble f1 \t', f1_score(test_labels, logreg_labels))

ansamble roc-auc  0.9891637972489725
ansamble f1 	 0.9348180858212537


#### Ансамбль из двух моделей показал себя хуже по ROC-AUC, чем рекуррентная нейронная сеть, но выигрывает по метрике F1
#### По правилам хакатона модель оценивается по ROC-AUC, поэтому для бота будет использоваться только GRU