# RuBERT with extra features

In [1]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import OrderedDict
from scikitplot.metrics import plot_precision_recall_curve, plot_roc_curve
from sklearn.metrics import (f1_score, precision_score, average_precision_score, roc_auc_score,
                             classification_report, accuracy_score, make_scorer,
                             precision_recall_curve, recall_score)
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm.auto import tqdm

from sarcsdet.utils.train_utils import *

from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs, train_model
from deeppavlov.models.torch_bert.torch_transformers_classifier import TorchTransformersClassifierModel
from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersPreprocessor

[nltk_data] Downloading package punkt to /home/ms/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ms/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to /home/ms/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/ms/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


## Get data

In [2]:
# Load the pickled DataFrame; later cells read its 'rus_comment', 'label' and
# hand-crafted feature columns.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load this file from a trusted source.
df = pd.read_pickle('../data/Sarcasm_on_Reddit/rus-train-balanced-sarcasm-ling_feat.pkl')

In [3]:
# Hold out 30% of all rows for testing, then reserve 10% of what remains for
# validation. Both splits use the same fixed seed so the cell is repeatable.
split_seed = 8
train_valid_df, test_df = train_test_split(df, random_state=split_seed, test_size=0.3)
train_df, valid_df = train_test_split(train_valid_df, random_state=split_seed, test_size=0.1)

## Additional functions

In [4]:
def show_test_classification_metrics(y_test, y_pred, y_pred_prob, X_test=None, classifier=None):
    """Print test-set classification metrics and optionally draw PR / ROC curves.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard (thresholded) predictions; used for F1, precision and the
        classification report.
    y_pred_prob : array-like
        Prediction scores; used for PR-AUC and ROC-AUC.
    X_test : optional
        Inputs forwarded to the plotting helpers together with ``classifier``.
        Required whenever ``classifier`` is given.
    classifier : optional
        Fitted estimator forwarded to the plotting helpers. When ``None``
        (the default) no figure is produced.

    Raises
    ------
    ValueError
        If ``classifier`` is given without ``X_test``.
    """
    print(f"F1: {f1_score(y_test, y_pred):.5}")
    print(f"PREC: {precision_score(y_test, y_pred):.5}")
    print(f"PR-AUC: {average_precision_score(y_test, y_pred_prob):.5}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_prob):.5}")
    print('-------------------------------------------------------')
    print(classification_report(y_test, y_pred, labels=[0, 1]))
    print('-------------------------------------------------------')

    # Compare against None explicitly: estimator objects that define __len__
    # (e.g. pipelines or ensembles) may be falsy or raise when truth-tested.
    if classifier is not None:
        if X_test is None:
            raise ValueError("X_test is required when a classifier is given")
        _, ax = plt.subplots(1, 2, figsize=(15, 5))
        # NOTE(review): these helpers are imported from scikitplot.metrics at
        # the top of the file, whose documented signature is
        # (y_true, y_probas, ...). Confirm that the (classifier, X, y) call
        # form below matches the function that is actually resolved.
        ax[0].set_title('Precision-Recall curve')
        plot_precision_recall_curve(classifier, X_test, y_test, ax=ax[0])
        ax[1].set_title('ROC-AUC curve')
        plot_roc_curve(classifier, X_test, y_test, ax=ax[1])
        plt.show()
        

In [5]:
def get_bert_preds(df, bert_model, batch_size=64):
    """Run ``bert_model`` over the 'rus_comment' column of ``df`` in batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'rus_comment' column holding the texts to score.
    bert_model : callable
        Called once per batch of texts; its per-batch outputs are concatenated
        along the first axis.
    batch_size : int, default 64
        Number of texts passed to ``bert_model`` per call.

    Returns
    -------
    numpy.ndarray
        Concatenation of all per-batch model outputs.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    texts = df['rus_comment'].values
    # Ceiling division: a trailing partial batch is still one iteration, so
    # flooring here makes the progress bar overrun its total.
    n_batches = (len(texts) + batch_size - 1) // batch_size

    preds_proba = [
        bert_model(batch)
        for batch in tqdm(chunks(texts, batch_size), total=n_batches)
    ]

    preds = np.concatenate(preds_proba)
    return preds

## Extra features

In [6]:
# Columns used as inputs to the small feature-only network.
features = [
    'score', 'author', 'subreddit',
    'exclamation', 'question', 'quotes', 'dotes',
    'funny_mark', 'interjections',
]


def _features_and_labels(frame):
    """Return (feature matrix, label vector) as NumPy arrays for one split."""
    return frame[features].values, frame['label'].values


X, y = _features_and_labels(train_df)
test_X, test_y = _features_and_labels(test_df)
valid_X, valid_y = _features_and_labels(valid_df)

In [7]:
# Feature-only classifier: one hidden ReLU layer of 8 units followed by a
# single sigmoid output, tracked with AUC during training.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=(X.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()],
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
# Train the feature-only network on the in-memory arrays. `workers` is not
# passed: it applies only to generator / keras.utils.Sequence inputs, and -1
# is not a meaningful worker count.
model.fit(X, y, validation_data=(valid_X, valid_y), epochs=10)

Train on 522314 samples, validate on 58035 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0cd3378e90>

In [9]:
# Test-set scores from the feature-only network.
# NOTE(review): `preds` is reassigned in the Results section before anything
# reads it, so this value appears to be unused.
preds = model.predict(test_X)

## RuBERT

In [10]:
# Start from DeepPavlov's stock `rusentiment_bert` classifier config and adapt
# it in place to a two-class task that reads local CSV splits.
bert_config = read_json(configs.classifiers.rusentiment_bert)

# Dataset reader: text column, label column, and the train/valid/test CSV
# files, all resolved relative to the current directory.
bert_config['dataset_reader']['x'] = 'rus_comment'
bert_config['dataset_reader']['y'] = 'label'
bert_config['dataset_reader']['data_path'] = './'
bert_config['dataset_reader']['train'] = 'train.csv'
bert_config['dataset_reader']['valid'] = 'valid.csv'
bert_config['dataset_reader']['test'] = 'test.csv'

# Remove the iterator's splitting options — presumably unnecessary because
# explicit train/valid/test files are given above (TODO confirm against the
# stock config). Each `del` raises KeyError if the key is absent.
del bert_config['dataset_iterator']['split_seed']
del bert_config['dataset_iterator']['field_to_split']
del bert_config['dataset_iterator']['split_fields']
del bert_config['dataset_iterator']['split_proportions']

# Model directory; the load log printed below shows weights being restored
# from this location.
bert_config['metadata']['variables']['MODEL_PATH'] = '../data/Models/reddit/rubert/'

# Index-based pipeline surgery. Statement order matters: each `del` shifts the
# indices that the following lines rely on.
# NOTE(review): which components end up at pipe[0] / pipe[1] / pipe[2] depends
# on the stock config layout — presumably preprocessor, label one-hot encoder
# and classifier; confirm before changing anything here.
del bert_config['chainer']['pipe'][-2:]
del bert_config['chainer']['pipe'][1]
bert_config['chainer']['pipe'][1]['in'] = 'y'
bert_config['chainer']['pipe'][1]['depth'] = 2
bert_config['chainer']['pipe'][2]['n_classes'] = 2
# Keep only the last metric listed in the stock config.
bert_config['train']['metrics'] = [bert_config['train']['metrics'][-1]]
# The chain exposes only 'y_pred_probas' as its output.
bert_config['chainer']['out'] = ['y_pred_probas']
# Training settings stored in the config (presumably only used if the config
# is passed to a training call — TODO confirm).
bert_config['train']['epochs'] = 2
bert_config['train']['batch_size'] = 32
bert_config['train']['show_examples'] = True

# RuBERT asset paths. These are plain strings, so `{DOWNLOADS_PATH}` is left
# as a literal placeholder — presumably expanded by DeepPavlov from the
# config's variables (TODO confirm).
vocab_file = '{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/vocab.txt'
bert_config_file = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_config.json"
pretrained_bert = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_model.ckpt"

bert_config['chainer']['pipe'][0]['vocab_file'] = vocab_file
# NOTE(review): pipe[1] is the same entry that received 'in' / 'depth' above;
# confirm that this component actually consumes the BERT paths, or whether
# only pipe[2] needs them.
bert_config['chainer']['pipe'][1]['bert_config_file'] = bert_config_file
bert_config['chainer']['pipe'][1]['pretrained_bert'] = pretrained_bert

bert_config['chainer']['pipe'][2]['bert_config_file'] = bert_config_file
bert_config['chainer']['pipe'][2]['pretrained_bert'] = pretrained_bert

# Instantiate the pipeline from the edited config.
bert_model = build_model(bert_config)











The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.





Instructions for updating:
Use standard file APIs to check for files with this prefix.
2021-04-28 02:51:01.240 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /home/ms/Desktop/kate/Sarcasm_Detection/data/Models/reddit/rubert/model]

INFO:tensorflow:Restoring parameters from /home/ms/Desktop/kate/Sarcasm_Detection/data/Models/reddit/rube

In [11]:
# Score every split with the BERT pipeline and keep only column 0 of its
# output. The recorded progress output below shows this takes several hours.
bert_X, bert_valid_X, bert_test_X = [
    get_bert_preds(split_df, bert_model)[:, 0]
    for split_df in (train_df, valid_df, test_df)
]

8162it [5:47:05,  2.55s/it]
907it [38:23,  2.54s/it]
3887it [2:47:48,  2.59s/it]


## Results

In [12]:
def _stack_meta_features(bert_scores, feature_matrix):
    """Place the BERT score next to the feature-only network's prediction."""
    bert_column = bert_scores.reshape(-1, 1)
    feature_model_preds = model.predict(feature_matrix)
    return np.concatenate([bert_column, feature_model_preds], axis=1)


XX = _stack_meta_features(bert_X, X)
valid_XX = _stack_meta_features(bert_valid_X, valid_X)
test_XX = _stack_meta_features(bert_test_X, test_X)

In [13]:
# Second-stage classifier over the two stacked scores: one hidden ReLU layer
# of 10 units followed by a single sigmoid output, tracked with AUC.
model2 = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu', input_shape=(2,)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()],
)

In [14]:
# Train the second-stage network on the stacked scores. `workers` is not
# passed: it applies only to generator / keras.utils.Sequence inputs, and -1
# is not a meaningful worker count.
model2.fit(XX, y, validation_data=(valid_XX, valid_y), epochs=10)

Train on 522314 samples, validate on 58035 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0bd74299d0>

In [15]:
# Score the test split with the second-stage model, threshold the scores at
# 0.55 to obtain hard labels, and report the metrics.
preds = model2.predict(test_XX)
test_pred_labels = (preds > 0.55).astype(int)

show_test_classification_metrics(test_y, test_pred_labels, preds)

F1: 0.78267
PREC: 0.82391
PR-AUC: 0.88092
ROC-AUC: 0.86852
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.75      0.83      0.79    119653
           1       0.82      0.75      0.78    129068

    accuracy                           0.79    248721
   macro avg       0.79      0.79      0.79    248721
weighted avg       0.79      0.79      0.79    248721

-------------------------------------------------------
