# Google Colab Init

In [1]:
# Mount Google Drive at /content/drive; the dataset and model paths used later
# in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Environment setup commands, kept commented out. Presumably meant to be
# uncommented and run once on a fresh runtime -- TODO confirm which of these
# are still required.
# !pip install transformers==2.8.0 
# !pip install deeppavlov

# !pip uninstall -y tensorflow tensorflow-gpu
# !pip install tensorflow-gpu==1.15.2

# !python -m deeppavlov install squad_bert

# !pip uninstall -y scikit-learn
# !pip install scikit-learn

# !pip install pandas

# RuBert DeepPavlov

In [2]:
import os
import pickle
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs, train_model
from deeppavlov.models.torch_bert.torch_transformers_classifier import TorchTransformersClassifierModel
from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersPreprocessor
from sklearn.metrics import (f1_score, precision_score, average_precision_score, roc_auc_score,
                             classification_report, plot_roc_curve, accuracy_score, make_scorer,
                             plot_precision_recall_curve, precision_recall_curve, recall_score)
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm.auto import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


In [3]:
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

## Get data

In [4]:
# Location of the pickled dataset on the mounted Drive.
data_path = '/content/drive/MyDrive/SarcasmDetection/data/Quotes'
dataname = 'rus-train-balanced-sarcasm-ling_feat.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open(os.path.join(data_path, dataname), mode='rb') as f:
    df = pickle.load(f)

# Shuffle the loaded object with a fixed seed so the row order is repeatable.
df = shuffle(df, random_state=8)

In [5]:
# Hold out 30% of the data for testing, then split a further 10% of the
# remaining training part off as the validation set (both splits seeded).
train_df, test_df = train_test_split(df, random_state=8, test_size=0.3)
train_df, valid_df = train_test_split(train_df, random_state=8, test_size=0.1)

In [6]:
# Balance the training set by down-sampling every class to the same size and
# then shuffling the rows.
#
# The per-class size is the size of the smallest class, so sampling without
# replacement can never ask a group for more rows than it has. Both sampling
# steps are seeded (same seed as the splits above) so that re-running the
# notebook produces the same training set.
n_per_class = train_df['label'].value_counts().min()

train_df = (
    train_df
    .groupby('label', group_keys=False)
    .apply(lambda group: group.sample(n=n_per_class, random_state=8))
    .sample(frac=1, random_state=8)
    .reset_index(drop=True)
)

In [7]:
# Write the text and label columns of each split to a CSV file in the current
# working directory, without the index column.
for csv_name, split_df in (
    ('train.csv', train_df),
    ('valid.csv', valid_df),
    ('test.csv', test_df),
):
    split_df[['rus_comment', 'label']].to_csv(csv_name, index=False)

## RuBert

In [8]:
def chunks(list_like, n):
    """Yield consecutive slices of ``list_like`` containing at most ``n`` items.

    The final slice is shorter when ``len(list_like)`` is not a multiple of
    ``n``. ``list_like`` must support ``len()`` and slicing.
    """
    offsets = range(0, len(list_like), n)
    yield from (list_like[lo:lo + n] for lo in offsets)

In [9]:
def show_test_classification_metrics(y_test, y_pred, y_pred_prob, X_test=None, classifier=None):
    """Print classification metrics and optionally plot PR / ROC curves.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels (the report is restricted to labels 0 and 1).
    y_pred : array-like
        Hard label predictions; used for F1, precision and the report.
    y_pred_prob : array-like
        Scores for the positive class; used for PR-AUC and ROC-AUC.
    X_test : optional
        Inputs handed to the scikit-learn curve plotters together with
        ``classifier``. Only needed when ``classifier`` is given.
    classifier : optional
        Estimator passed to ``plot_precision_recall_curve`` and
        ``plot_roc_curve``. When ``None`` (default) no figure is drawn.

    Raises
    ------
    ValueError
        If ``classifier`` is given without ``X_test``.
    """
    separator = '-------------------------------------------------------'

    print(f"F1: {f1_score(y_test, y_pred):.5}")
    print(f"PREC: {precision_score(y_test, y_pred):.5}")
    print(f"PR-AUC: {average_precision_score(y_test, y_pred_prob):.5}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_prob):.5}")
    print(separator)
    print(classification_report(y_test, y_pred, labels=[0, 1]))
    print(separator)

    # Identity check instead of truthiness: an estimator object may define
    # __len__ / __bool__, which would make `if classifier:` unreliable.
    if classifier is None:
        return

    if X_test is None:
        raise ValueError("X_test is required to plot curves when a classifier is given")

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    ax[0].set_title('Precision-Recall curve')
    plot_precision_recall_curve(classifier, X_test, y_test, ax=ax[0])
    ax[1].set_title('ROC-AUC curve')
    plot_roc_curve(classifier, X_test, y_test, ax=ax[1])
    plt.show()
        

In [10]:
# Download the files referenced by DeepPavlov's `paraphraser_rubert` config.
# Per the log output below, this includes the
# rubert_cased_L-12_H-768_A-12_v1 archive.
!python -m deeppavlov download paraphraser_rubert

2021-04-13 20:51:19.614 INFO in 'deeppavlov.core.common.file'['file'] at line 32: Interpreting 'paraphraser_rubert' as '/usr/local/lib/python3.7/dist-packages/deeppavlov/configs/classifiers/paraphraser_rubert.json'
2021-04-13 20:51:21.117 INFO in 'deeppavlov.core.data.utils'['utils'] at line 94: Downloading from http://files.deeppavlov.ai/datasets/paraphraser_gold.zip?config=paraphraser_rubert to /root/.deeppavlov/downloads/paraphraser_gold.zip
100% 119k/119k [00:00<00:00, 342kB/s] 
2021-04-13 20:51:22.732 INFO in 'deeppavlov.core.data.utils'['utils'] at line 268: Extracting /root/.deeppavlov/downloads/paraphraser_gold.zip archive into /root/.deeppavlov/downloads/paraphraser_data
2021-04-13 20:51:23.830 INFO in 'deeppavlov.core.data.utils'['utils'] at line 94: Downloading from http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v1.tar.gz?config=paraphraser_rubert to /root/.deeppavlov/downloads/rubert_cased_L-12_H-768_A-12_v1.tar.gz
100% 666M/666M [09:02<00:00, 

In [11]:
# Start from DeepPavlov's stock `rusentiment_bert` classifier config and
# modify the resulting dict in place.
bert_config = read_json(configs.classifiers.rusentiment_bert)

# Point the dataset reader at the CSV files in the current directory: the text
# comes from the 'rus_comment' column and the target from 'label'.
bert_config['dataset_reader']['x'] = 'rus_comment'
bert_config['dataset_reader']['y'] = 'label'
bert_config['dataset_reader']['data_path'] = './'
bert_config['dataset_reader']['train'] = 'train.csv'
bert_config['dataset_reader']['valid'] = 'valid.csv'
bert_config['dataset_reader']['test'] = 'test.csv'

# Remove the iterator's split-related options -- presumably because explicit
# train/valid/test files are supplied above (TODO confirm).
del bert_config['dataset_iterator']['split_seed']
del bert_config['dataset_iterator']['field_to_split']
del bert_config['dataset_iterator']['split_fields']
del bert_config['dataset_iterator']['split_proportions']

# Directory used for the MODEL_PATH config variable.
bert_config['metadata']['variables']['MODEL_PATH'] = '/content/drive/MyDrive/SarcasmDetection/data/Models/classifiers/bert_classifier_rus_comment_ling_feat/'

# Index-based edits of the pipeline. The component order of the stock config
# is not visible here, and each index below depends on the deletions before
# it, so these statements must run in exactly this order.
# NOTE(review): verify the indices against rusentiment_bert.json.
# Drop the last two pipeline components, then the component at index 1.
del bert_config['chainer']['pipe'][-2:]
del bert_config['chainer']['pipe'][1]
# Presumably the label one-hot encoder: read 'y' directly, 2 classes -- TODO confirm.
bert_config['chainer']['pipe'][1]['in'] = 'y'
bert_config['chainer']['pipe'][1]['depth'] = 2
# Presumably the BERT classifier component: binary output -- TODO confirm.
bert_config['chainer']['pipe'][2]['n_classes'] = 2
# Keep only the last metric from the stock metric list.
bert_config['train']['metrics'] = [bert_config['train']['metrics'][-1]]
# The built model outputs only 'y_pred_probas'.
bert_config['chainer']['out'] = ['y_pred_probas']
bert_config['train']['epochs'] = 2
bert_config['train']['batch_size'] = 32
bert_config['train']['show_examples'] = True

# Paths to the rubert_cased_L-12_H-768_A-12_v1 files; the {DOWNLOADS_PATH}
# placeholder is left as-is in the strings.
vocab_file = '{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/vocab.txt'
bert_config_file = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_config.json"
pretrained_bert = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_model.ckpt"

# NOTE(review): pipe[1] was configured above with 'in'/'depth', which does not
# look like a BERT component; the two pipe[1] assignments below may be
# unnecessary -- confirm before removing.
bert_config['chainer']['pipe'][0]['vocab_file'] = vocab_file
bert_config['chainer']['pipe'][1]['bert_config_file'] = bert_config_file
bert_config['chainer']['pipe'][1]['pretrained_bert'] = pretrained_bert

bert_config['chainer']['pipe'][2]['bert_config_file'] = bert_config_file
bert_config['chainer']['pipe'][2]['pretrained_bert'] = pretrained_bert

In [12]:
# Train with the config assembled above and keep the returned model in `m`,
# which is called on batches of texts further down. Per the log timestamps
# below, this cell ran for several hours.
m = train_model(bert_config)

2021-04-13 21:11:34.20 INFO in 'deeppavlov.core.trainers.fit_trainer'['fit_trainer'] at line 68: NNTrainer got additional init parameters ['pytest_max_batches'] that will be ignored:













The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.





Instructions for updating:
Use standard file APIs to check for files with this prefix.


2021-04-13 21:12:00.835 INFO in 'deeppavlov.models.bert.bert_classifier'['bert_classifier'] at line 99: [initializing model with Bert from /root/.deeppavlov/downloads/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_model.ckpt]



INFO:tensorflow:Restoring parameters from /root/.deeppavlov/downloads/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_model.ckpt


2021-04-13 21:16:09.104 INFO in 'deeppavlov.core.trainers.nn_trainer'['nn_trainer'] at line 199: Initial best roc_auc of 0.4888


{"valid": {"eval_examples_count": 58035, "metrics": {"roc_auc": 0.4888}, "time_spent": "0:04:07", "examples": [{"x": "Пока не сделает этого, продолжайте использовать :)", "y_predicted": [0.5681143403053284, 0.4318857192993164], "y_true": "0"}, {"x": "Понимание прочитанного затруднено.", "y_predicted": [0.5857866406440735, 0.4142133593559265], "y_true": "1"}, {"x": "Ах да, Франция это заслужила.", "y_predicted": [0.5963402390480042, 0.40365976095199585], "y_true": "1"}, {"x": "Да, все, что ей нужно было сделать, это построить быстрый навес, не такой уж трудный для двухлетнего ребенка.", "y_predicted": [0.575062096118927, 0.424937903881073], "y_true": "1"}, {"x": "Ты о чём хрен?", "y_predicted": [0.5978941321372986, 0.4021058976650238], "y_true": "0"}, {"x": "Христианство предшествовало католицизму.", "y_predicted": [0.5743672847747803, 0.4256327450275421], "y_true": "0"}, {"x": "Да, иногда они действительно надоедают!", "y_predicted": [0.5849998593330383, 0.41500017046928406], "y_true":

2021-04-13 23:23:36.90 INFO in 'deeppavlov.core.trainers.nn_trainer'['nn_trainer'] at line 207: Improved best roc_auc of 0.825
2021-04-13 23:23:36.91 INFO in 'deeppavlov.core.trainers.nn_trainer'['nn_trainer'] at line 209: Saving model
2021-04-13 23:23:36.93 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 75: [saving model to /content/drive/MyDrive/SarcasmDetection/data/Models/classifiers/bert_classifier_rus_comment_ling_feat/model]


{"valid": {"eval_examples_count": 58035, "metrics": {"roc_auc": 0.825}, "time_spent": "2:11:34", "examples": [{"x": "Пока не сделает этого, продолжайте использовать :)", "y_predicted": [0.8868727087974548, 0.11312723159790039], "y_true": "0"}, {"x": "Понимание прочитанного затруднено.", "y_predicted": [0.8316699862480164, 0.16832995414733887], "y_true": "1"}, {"x": "Ах да, Франция это заслужила.", "y_predicted": [0.2389296293258667, 0.7610704302787781], "y_true": "1"}, {"x": "Да, все, что ей нужно было сделать, это построить быстрый навес, не такой уж трудный для двухлетнего ребенка.", "y_predicted": [0.14246276021003723, 0.8575372695922852], "y_true": "1"}, {"x": "Ты о чём хрен?", "y_predicted": [0.8077859282493591, 0.1922140270471573], "y_true": "0"}, {"x": "Христианство предшествовало католицизму.", "y_predicted": [0.6026341915130615, 0.3973657786846161], "y_true": "0"}, {"x": "Да, иногда они действительно надоедают!", "y_predicted": [0.19981873035430908, 0.8001812100410461], "y_tru

2021-04-14 01:31:02.574 INFO in 'deeppavlov.core.trainers.nn_trainer'['nn_trainer'] at line 207: Improved best roc_auc of 0.8286
2021-04-14 01:31:02.577 INFO in 'deeppavlov.core.trainers.nn_trainer'['nn_trainer'] at line 209: Saving model
2021-04-14 01:31:02.586 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 75: [saving model to /content/drive/MyDrive/SarcasmDetection/data/Models/classifiers/bert_classifier_rus_comment_ling_feat/model]


{"valid": {"eval_examples_count": 58035, "metrics": {"roc_auc": 0.8286}, "time_spent": "4:19:01", "examples": [{"x": "Пока не сделает этого, продолжайте использовать :)", "y_predicted": [0.9018629789352417, 0.09813708811998367], "y_true": "0"}, {"x": "Понимание прочитанного затруднено.", "y_predicted": [0.851266622543335, 0.14873331785202026], "y_true": "1"}, {"x": "Ах да, Франция это заслужила.", "y_predicted": [0.26253488659858704, 0.7374650835990906], "y_true": "1"}, {"x": "Да, все, что ей нужно было сделать, это построить быстрый навес, не такой уж трудный для двухлетнего ребенка.", "y_predicted": [0.10847938805818558, 0.891520619392395], "y_true": "1"}, {"x": "Ты о чём хрен?", "y_predicted": [0.923386812210083, 0.07661320269107819], "y_true": "0"}, {"x": "Христианство предшествовало католицизму.", "y_predicted": [0.6410895586013794, 0.3589105010032654], "y_true": "0"}, {"x": "Да, иногда они действительно надоедают!", "y_predicted": [0.09134109318256378, 0.9086588621139526], "y_tru

2021-04-14 01:31:26.45 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /content/drive/MyDrive/SarcasmDetection/data/Models/classifiers/bert_classifier_rus_comment_ling_feat/model]


INFO:tensorflow:Restoring parameters from /content/drive/MyDrive/SarcasmDetection/data/Models/classifiers/bert_classifier_rus_comment_ling_feat/model
{"train": {"eval_examples_count": 501968, "metrics": {"roc_auc": 0.896}, "time_spent": "0:36:09", "examples": [{"x": "Значит, это вопрос поведения, а не политики?", "y_predicted": [0.9038836359977722, 0.09611629694700241], "y_true": "0"}, {"x": "Английский второй язык?", "y_predicted": [0.6564089059829712, 0.3435910642147064], "y_true": "1"}, {"x": "Они были как дерьмо в 7,3", "y_predicted": [0.8667333126068115, 0.13326674699783325], "y_true": "0"}, {"x": "Думаю, это доказывает, что они больше не лучшие друзья", "y_predicted": [0.31043121218681335, 0.689568817615509], "y_true": "1"}, {"x": "Может, тебе стоит выбраться из гребаного подвала Алреди?", "y_predicted": [0.35400623083114624, 0.6459937691688538], "y_true": "1"}, {"x": "Это БМД, БТР-Д и НОНА, а не танки.", "y_predicted": [0.5736340284347534, 0.42636603116989136], "y_true": "0"}, {

2021-04-14 02:30:00.205 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /content/drive/MyDrive/SarcasmDetection/data/Models/classifiers/bert_classifier_rus_comment_ling_feat/model]


INFO:tensorflow:Restoring parameters from /content/drive/MyDrive/SarcasmDetection/data/Models/classifiers/bert_classifier_rus_comment_ling_feat/model


In [14]:
# Score the held-out test set in fixed-size batches and report the metrics.
batch_size = 64
test_texts = test_df['rus_comment'].values

# Ceil division: the last, possibly shorter, batch has to be counted as well.
# Truncating (int(size / batch_size)) leaves the progress bar total one short
# whenever the test set size is not a multiple of the batch size.
n_batches = -(-len(test_texts) // batch_size)

preds_proba = []
for batch in tqdm(chunks(test_texts, batch_size), total=n_batches):
    preds_proba.append(m(batch))

# Stack the per-batch outputs into one array; column 1 is used below as the
# positive-class score (assumes the model returns [p(class 0), p(class 1)] per
# row -- TODO confirm).
preds = np.concatenate(preds_proba)

show_test_classification_metrics(
    test_df.label.values,
    (preds[:, 1] > 0.5).astype(int),
    preds[:, 1]
)

HBox(children=(FloatProgress(value=0.0, max=3886.0), HTML(value='')))


F1: 0.74173
PREC: 0.7785
PR-AUC: 0.84328
ROC-AUC: 0.82521
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.78      0.75    119781
           1       0.78      0.71      0.74    128940

    accuracy                           0.74    248721
   macro avg       0.75      0.75      0.74    248721
weighted avg       0.75      0.74      0.74    248721

-------------------------------------------------------
