# Comparison of results

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs, train_model
from deeppavlov.models.torch_bert.torch_transformers_classifier import TorchTransformersClassifierModel
from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersPreprocessor
from joblib import load
from scipy.sparse import hstack
from scipy.sparse.csr import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.models import load_model

from sarcsdet.embeddings.gensim_word2vec_ruscorp import GensimWord2VecRUSEmbeddingVectorizer
from sarcsdet.embeddings.NatashaGlove import NatashaGloVeEmbeddingVectorizer
from sarcsdet.models.bilstm import BiLSTMClassifier
from sarcsdet.models.count_model_metrics import *
from sarcsdet.utils.train_utils import *
from sarcsdet.utils import chunks

[nltk_data] Downloading package punkt to /home/kate/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kate/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/kate/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/kate/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


## Get data

In [2]:
# Reddit sarcasm dataset, stored as a pickled DataFrame.
# NOTE: read_pickle unpickles arbitrary objects — only point it at trusted files.
dataset_file = os.path.join(
    '../data/Sarcasm_on_Reddit',
    'rus-train-balanced-sarcasm-ling_feat.pkl',
)
df = pd.read_pickle(dataset_file)

In [3]:
# Hold out 30% of the rows as the evaluation set; the training part is discarded.
# random_state=8 fixes the shuffle — presumably the same split the saved models
# were trained against; TODO confirm, otherwise test rows overlap training data.
_, test_df = train_test_split(df, test_size=0.3, random_state=8)

## Logistic Regression

In [4]:
# Load the already-fitted TF-IDF vectorizer and the classifier saved next to it
# (a logistic regression, judging by the file name).
# NOTE: joblib.load unpickles arbitrary objects — only load trusted files.
tfidf = load('../data/Models/reddit/tfidf_lr/tfidf.joblib')
estimator = load('../data/Models/reddit/tfidf_lr/LogisticRegression_plain.joblib')

In [5]:
# Vectorise the held-out comments and score them with the loaded classifier.
lr_comments = test_df['rus_comment'].values
x = tfidf.transform(lr_comments)
preds = estimator.predict_proba(x)

# Column 1 is treated as the positive (assumed: sarcastic) class score and is
# cut at 0.5 to obtain hard labels.
lr_positive_scores = preds[:, 1]
lr_preds = (lr_positive_scores > 0.5).astype(int)

In [6]:
# Metrics for TF-IDF + logistic regression: true labels, the hard labels already
# stored in `lr_preds` (same 0.5 cut of column 1), and the column-1 scores.
lr_test_metrics = get_test_classification_metrics(
    test_df['label'].values,
    lr_preds,
    preds[:, 1],
)

## BiLSTM

In [7]:
# Instantiate the project's BiLSTM wrapper; (30, 300) is presumably
# (max tokens, embedding dim) — TODO confirm against BiLSTMClassifier.
model = BiLSTMClassifier((30, 300))
# Replace the wrapper's network with the trained Keras model saved on disk.
model.model = load_model('../data/Models/reddit/bilstm/NatashaGlove_BiLSTM.h5')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
# Vectorizer that converts comments into the inputs fed to the BiLSTM, backed by
# the embedding archive below.
# NOTE(review): the meaning of the positional `False` flag is not visible here —
# check NatashaGloVeEmbeddingVectorizer's signature.
embedding_model = NatashaGloVeEmbeddingVectorizer(
    False, '../data/Embeddings/navec_hudlit_v1_12B_500K_300d_100q.tar')

In [9]:
# Run the BiLSTM over the held-out comments in chunks of 1024, collecting the
# per-chunk outputs in `preds` (concatenated in the next cell).
preds = []

bilstm_comments = test_df['rus_comment'].values
bilstm_chunk_size = 1024
# Ceiling division: the last, shorter chunk is an iteration too. Floor division
# under-counts by one whenever the row count is not a multiple of the chunk
# size, which makes tqdm overrun its total and drop the progress bar / ETA.
bilstm_n_chunks = (len(bilstm_comments) + bilstm_chunk_size - 1) // bilstm_chunk_size

for chunk in tqdm(chunks(bilstm_comments, bilstm_chunk_size), total=bilstm_n_chunks):
    x = embedding_model.transform(chunk)
    preds.append(model.predict_proba(x))

243it [18:17,  4.52s/it]                         


In [10]:
# Stack the per-chunk outputs into a single array. This rebinds `preds` from a
# list to an ndarray, so the cell is not safe to re-run without re-running the
# prediction loop first.
preds = np.concatenate(preds)
# Binarise at 0.61 (presumably a threshold tuned on validation data — TODO
# confirm) and flatten the result to 1-D.
bilstm_preds = (preds > 0.61).astype(int).flatten()

In [11]:
# Score the BiLSTM on the held-out labels: hard labels at the 0.61 threshold
# plus the raw scores (assumed to feed the PR/ROC AUC columns — TODO confirm).
# NOTE(review): unlike `bilstm_preds`, the arrays passed here are not flattened.
bilstm_test_metrics = get_test_classification_metrics(
    test_df.label.values, (preds > 0.61).astype(int), preds)

## RuBert DeepPavlov

In [12]:
# Start from DeepPavlov's stock `rusentiment_bert` classifier config and patch
# it in place for this dataset.
bert_config = read_json(configs.classifiers.rusentiment_bert)

# Dataset reader: text column, label column, data directory and CSV file names.
bert_config['dataset_reader']['x'] = 'rus_comment'
bert_config['dataset_reader']['y'] = 'label'
bert_config['dataset_reader']['data_path'] = './'
bert_config['dataset_reader']['train'] = 'train.csv'
bert_config['dataset_reader']['valid'] = 'valid.csv'
bert_config['dataset_reader']['test'] = 'test.csv'

# Remove the iterator's splitting options (the reader above names separate
# train/valid/test files).
del bert_config['dataset_iterator']['split_seed']
del bert_config['dataset_iterator']['field_to_split']
del bert_config['dataset_iterator']['split_fields']
del bert_config['dataset_iterator']['split_proportions']

# NOTE(review): this path says "quotes" while the TF-IDF and BiLSTM models in
# this notebook are loaded from "reddit" directories — confirm this is the
# intended checkpoint.
bert_config['metadata']['variables']['MODEL_PATH'] = '../data/Models/quotes/rubert/'

# Trim the pipeline: drop the last two steps, then the step at index 1. Every
# index used below refers to the list AFTER these deletions, so the statement
# order must not change.
del bert_config['chainer']['pipe'][-2:]
del bert_config['chainer']['pipe'][1]
# Two-class setup: step 1 reads `y` with depth 2, step 2 gets n_classes = 2.
bert_config['chainer']['pipe'][1]['in'] = 'y'
bert_config['chainer']['pipe'][1]['depth'] = 2
bert_config['chainer']['pipe'][2]['n_classes'] = 2
# Keep only the last metric from the stock config.
bert_config['train']['metrics'] = [bert_config['train']['metrics'][-1]]
# Expose `y_pred_probas` as the chain output.
bert_config['chainer']['out'] = ['y_pred_probas']
# Training settings; no training call is visible in this notebook, which only
# builds the model and runs inference.
bert_config['train']['epochs'] = 2
bert_config['train']['batch_size'] = 32
bert_config['train']['show_examples'] = True

# Pretrained RuBERT assets, addressed through the config's {DOWNLOADS_PATH}
# placeholder.
vocab_file = '{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/vocab.txt'
bert_config_file = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_config.json"
pretrained_bert = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_model.ckpt"

# NOTE(review): pipe[1] is the step that received 'in'/'depth' above, yet it is
# also given the BERT file settings here, as is pipe[2] — confirm both
# assignments are intended.
bert_config['chainer']['pipe'][0]['vocab_file'] = vocab_file
bert_config['chainer']['pipe'][1]['bert_config_file'] = bert_config_file
bert_config['chainer']['pipe'][1]['pretrained_bert'] = pretrained_bert

bert_config['chainer']['pipe'][2]['bert_config_file'] = bert_config_file
bert_config['chainer']['pipe'][2]['pretrained_bert'] = pretrained_bert

In [13]:
# Build the inference pipeline from the patched config; the recorded log shows
# it restoring parameters from the MODEL_PATH directory.
m = build_model(bert_config)











The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.





Instructions for updating:
Use standard file APIs to check for files with this prefix.


2021-04-21 20:13:11.107 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /home/kate/Desktop/Sarcasm_Detection/data/Models/quotes/rubert/model]



INFO:tensorflow:Restoring parameters from /home/kate/Desktop/Sarcasm_Detection/data/Models/quotes/rubert/model


In [14]:
# Run RuBERT over the held-out comments in batches of 64. This is slow: the
# recorded run took about seven hours.
preds_proba = []

rubert_comments = test_df['rus_comment'].values
rubert_batch_size = 64
# Ceiling division: the last, shorter batch is an iteration too. Truncating the
# quotient under-counts by one whenever the row count is not a multiple of the
# batch size, which makes tqdm overrun its total and drop the progress bar / ETA.
rubert_n_batches = (len(rubert_comments) + rubert_batch_size - 1) // rubert_batch_size

for batch in tqdm(chunks(rubert_comments, rubert_batch_size), total=rubert_n_batches):
    preds_proba.append(m(batch))

# One row of class probabilities per comment; column 1 is treated as the
# positive (assumed: sarcastic) class and cut at 0.5.
preds = np.concatenate(preds_proba)
rubert_preds = (preds[:, 1] > 0.5).astype(int)

3887it [7:08:41,  6.62s/it]                            


In [15]:
# Metrics for RuBERT: true labels, the hard labels already stored in
# `rubert_preds` (same 0.5 cut of column 1), and the column-1 scores.
rubert_test_metrics = get_test_classification_metrics(
    test_df['label'].values,
    rubert_preds,
    preds[:, 1],
)

## Reddit test metrics

In [16]:
# One row per model, one column per metric; the row labels are set directly
# rather than by renaming the default 0/1/2 index.
pd.DataFrame(
    [lr_test_metrics, bilstm_test_metrics, rubert_test_metrics],
    index=['TFIDF_LR', 'BILSTM', 'RUBERT'],
)

Unnamed: 0,F1,Precision,Recall,PR_AUC,ROC_AUC
TFIDF_LR,0.695214,0.729527,0.663983,0.787075,0.767356
BILSTM,0.621239,0.731033,0.540118,0.756083,0.733181
RUBERT,0.777561,0.788927,0.766518,0.869262,0.854914


## Result dataframe

In [17]:
# Side-by-side table: comment text, true label, and each model's hard prediction.
results = dict(
    quote=test_df['rus_comment'].values,
    target=test_df['label'].values,
    tfidf_lr=lr_preds,
    bilstm=bilstm_preds,
    rubert=rubert_preds,
)

result_df = pd.DataFrame(results, columns=list(results))

# Widen text columns so comments are shown untruncated.
# NOTE(review): the width is derived from the ROW count, which only works as
# "effectively unlimited" because the frame is large; pandas >= 1.0 accepts
# None for this option to disable truncation outright.
pd.set_option('display.max_colwidth', len(result_df) + 1)

In [18]:
# False negatives shared by all three models: target is 1 but every model said 0.
fn = result_df[
    result_df['target'].eq(1)
    & result_df[['tfidf_lr', 'bilstm', 'rubert']].eq(0).all(axis=1)
]
fn.head(15)

Unnamed: 0,quote,target,tfidf_lr,bilstm,rubert
36,Бомбил себя в живот?,1,0,0,0
37,Ну ... если честно ...,1,0,0,0
59,чем он их ударил?,1,0,0,0
73,это не все шутки?,1,0,0,0
112,Сотрудники в подавляющем большинстве используют и ... Сотрудники - шпионы,1,0,0,0
124,"да ладно, я собирался проголосовать за, но когда я прочитал ваше вдумчивое объяснение того, почему 3 будет ужасным, я мгновенно передумал.",1,0,0,0
137,Настоящая шутка здесь - нано,1,0,0,0
145,Гавел внизу с наименьшим количеством голосов:,1,0,0,0
147,"Хе-хе, в твоем текстовом чутье ""долг""",1,0,0,0
162,Свяжитесь с вашим менеджером по маркетингу.,1,0,0,0


In [19]:
# False positives shared by all three models: target is 0 but every model said 1.
fp = result_df[
    result_df['target'].eq(0)
    & result_df[['tfidf_lr', 'bilstm', 'rubert']].eq(1).all(axis=1)
]
fp.head(15)

Unnamed: 0,quote,target,tfidf_lr,bilstm,rubert
27,"Нет, мы точно не были",0,1,1,1
153,"Наконец, мне не нужно тратить часы и часы на поиски тупых жетонов ...",0,1,1,1
293,"Дубья, по крайней мере, имел добрую порядочность, чтобы не демонизировать всех мусульман как зло по своей природе.",0,1,1,1
377,Это сработает,0,1,1,1
429,"Да, мужчина из Энчиладаса за 4,99 доллара ... то, что мы все хотим в стриптиз-клубе.",0,1,1,1
469,"Это крупный провал для администрации Обамы, которой никогда не следовало вмешиваться в Сирию.",0,1,1,1
502,Давайте наложим хороший пластырь на эту огромную зияющую рану,0,1,1,1
548,Это должно быть наверху!,0,1,1,1
558,"Вы хотите сказать, что если вы не являетесь всем этим, Харпер не заботится о ваших интересах?",0,1,1,1
596,Да только не биологически,0,1,1,1


In [20]:
# Save the shared false negatives, one comment per line.
# The output directory is created if missing, and the encoding is pinned to
# UTF-8: the comments are Cyrillic, and the platform default encoding is not
# guaranteed to be able to represent them.
os.makedirs('../results', exist_ok=True)
with open('../results/rus_comment_fn.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(fn.quote.to_list()))

In [21]:
# Save the shared false positives, one comment per line.
# The output directory is created if missing, and the encoding is pinned to
# UTF-8: the comments are Cyrillic, and the platform default encoding is not
# guaranteed to be able to represent them.
os.makedirs('../results', exist_ok=True)
with open('../results/rus_comment_fp.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(fp.quote.to_list()))