# Prediction for Twitter data

In [9]:
import os

import numpy as np
import pandas as pd

from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs, train_model
from deeppavlov.models.torch_bert.torch_transformers_classifier import TorchTransformersClassifierModel
from deeppavlov.models.preprocessors.torch_transformers_preprocessor import TorchTransformersPreprocessor
from joblib import load
from tensorflow.keras.models import load_model

from sarcsdet.models.count_model_metrics import *
from sarcsdet.utils.train_utils import *

[nltk_data] Downloading package punkt to /home/kate/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kate/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/kate/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/kate/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


## Get data

In [10]:
# Relative path of the directory that holds the Twitter dataset files.
data_path = '../data/Twitter'

In [11]:
# Load the tokenized tweets; the `id` column becomes the frame's index.
twitter_csv = os.path.join(data_path, 'twitter_tokenized.csv')
df = pd.read_csv(twitter_csv, index_col='id')

## Best model trained on Quotes

In [12]:
# Start from DeepPavlov's stock `rusentiment_bert` classifier config and
# adapt it to the RuBERT model stored under ../data/Models/quotes/rubert/.
bert_config = read_json(configs.classifiers.rusentiment_bert)

# Dataset reader: text column, label column and the CSV split files.
bert_config['dataset_reader'].update({
    'x': 'quote',
    'y': 'target',
    'data_path': './',
    'train': 'train.csv',
    'valid': 'valid.csv',
    'test': 'test.csv',
})

# Drop the split-related settings of the dataset iterator.
for split_key in ('split_seed', 'field_to_split', 'split_fields', 'split_proportions'):
    del bert_config['dataset_iterator'][split_key]

# Where the fine-tuned weights are loaded from.
bert_config['metadata']['variables']['MODEL_PATH'] = '../data/Models/quotes/rubert/'

# Trim the pipeline: remove the last two components and the one at index 1,
# then reconfigure what is left for a 2-class problem.
# NOTE(review): after trimming, pipe[0] looks like the tokenizer/preprocessor,
# pipe[1] like the label one-hotter and pipe[2] like the classifier — confirm
# against the stock config.
pipe = bert_config['chainer']['pipe']
del pipe[-2:]
del pipe[1]
pipe[1]['in'] = 'y'
pipe[1]['depth'] = 2
pipe[2]['n_classes'] = 2

# The pipeline returns only the class probabilities.
bert_config['chainer']['out'] = ['y_pred_probas']

# Training settings: keep only the last metric of the stock config.
train_cfg = bert_config['train']
train_cfg['metrics'] = [train_cfg['metrics'][-1]]
train_cfg.update(epochs=2, batch_size=32, show_examples=True)

# Pretrained RuBERT artefacts ({DOWNLOADS_PATH} is left as a placeholder in the strings).
vocab_file = '{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/vocab.txt'
bert_config_file = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_config.json"
pretrained_bert = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_model.ckpt"

pipe[0]['vocab_file'] = vocab_file
# Both remaining components receive the BERT config and checkpoint paths.
for component in (pipe[1], pipe[2]):
    component['bert_config_file'] = bert_config_file
    component['pretrained_bert'] = pretrained_bert

In [13]:
# Assemble the pipeline described by `bert_config`; `m` is then called
# directly on batches of texts in the prediction cell.
m = build_model(bert_config)











The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.





Instructions for updating:
Use standard file APIs to check for files with this prefix.


2021-04-27 19:28:42.527 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /home/kate/Desktop/Sarcasm_Detection/data/Models/quotes/rubert/model]



INFO:tensorflow:Restoring parameters from /home/kate/Desktop/Sarcasm_Detection/data/Models/quotes/rubert/model


In [15]:
# Score every tweet with the loaded model, one fixed-size batch at a time.
batch_size = 64

# Ceil division: the final batch may be partial. Flooring here under-reports
# the number of iterations, so tqdm runs past `total` and drops its bar.
n_batches = (df.index.size + batch_size - 1) // batch_size

preds_proba = []
for batch in tqdm(chunks(df['quote'].values, batch_size), total=n_batches):
    preds_proba.append(m(batch))

# Stack the per-batch outputs into one array with a row per tweet.
preds = np.concatenate(preds_proba)

# Threshold column 1 at 0.5 to get hard 0/1 predictions.
# NOTE(review): presumably column 1 is the probability of the positive
# (sarcastic) class — confirm against the training labels.
rubert_preds = (preds[:, 1] > 0.5).astype(int)

3it [00:17,  5.84s/it]                       


In [20]:
# Share of tweets predicted as class 1 (mean of the 0/1 predictions).
rubert_preds.mean()

0.436241610738255

## Best model trained on SARC (Reddit)

In [25]:
# Start from DeepPavlov's stock `rusentiment_bert` classifier config and
# adapt it to the RuBERT model stored under ../data/Models/reddit/rubert/.
bert_config = read_json(configs.classifiers.rusentiment_bert)

# Dataset reader: text column, label column and the CSV split files.
bert_config['dataset_reader'].update({
    'x': 'rus_comment',
    'y': 'label',
    'data_path': './',
    'train': 'train.csv',
    'valid': 'valid.csv',
    'test': 'test.csv',
})

# Drop the split-related settings of the dataset iterator.
for split_key in ('split_seed', 'field_to_split', 'split_fields', 'split_proportions'):
    del bert_config['dataset_iterator'][split_key]

# Where the fine-tuned weights are loaded from.
bert_config['metadata']['variables']['MODEL_PATH'] = '../data/Models/reddit/rubert/'

# Trim the pipeline: remove the last two components and the one at index 1,
# then reconfigure what is left for a 2-class problem.
# NOTE(review): after trimming, pipe[0] looks like the tokenizer/preprocessor,
# pipe[1] like the label one-hotter and pipe[2] like the classifier — confirm
# against the stock config.
pipe = bert_config['chainer']['pipe']
del pipe[-2:]
del pipe[1]
pipe[1]['in'] = 'y'
pipe[1]['depth'] = 2
pipe[2]['n_classes'] = 2

# The pipeline returns only the class probabilities.
bert_config['chainer']['out'] = ['y_pred_probas']

# Training settings: keep only the last metric of the stock config.
train_cfg = bert_config['train']
train_cfg['metrics'] = [train_cfg['metrics'][-1]]
train_cfg.update(epochs=2, batch_size=32, show_examples=True)

# Pretrained RuBERT artefacts ({DOWNLOADS_PATH} is left as a placeholder in the strings).
vocab_file = '{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/vocab.txt'
bert_config_file = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_config.json"
pretrained_bert = "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v1/bert_model.ckpt"

pipe[0]['vocab_file'] = vocab_file
# Both remaining components receive the BERT config and checkpoint paths.
for component in (pipe[1], pipe[2]):
    component['bert_config_file'] = bert_config_file
    component['pretrained_bert'] = pretrained_bert

In [26]:
# Assemble the pipeline described by the current `bert_config`; this rebinds
# `m`, replacing the model built earlier in the notebook.
m = build_model(bert_config)

2021-04-27 19:55:21.184 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 51: [loading model from /home/kate/Desktop/Sarcasm_Detection/data/Models/reddit/rubert/model]


INFO:tensorflow:Restoring parameters from /home/kate/Desktop/Sarcasm_Detection/data/Models/reddit/rubert/model


In [27]:
# Score every tweet with the loaded model, one fixed-size batch at a time.
batch_size = 64

# Ceil division: the final batch may be partial. Flooring here under-reports
# the number of iterations, so tqdm runs past `total` and drops its bar.
n_batches = (df.index.size + batch_size - 1) // batch_size

preds_proba = []
for batch in tqdm(chunks(df['quote'].values, batch_size), total=n_batches):
    preds_proba.append(m(batch))

# Stack the per-batch outputs into one array with a row per tweet.
preds = np.concatenate(preds_proba)

# Threshold column 1 at 0.5 to get hard 0/1 predictions.
# NOTE(review): presumably column 1 is the probability of the positive
# (sarcastic) class — confirm against the training labels.
rubert_preds = (preds[:, 1] > 0.5).astype(int)

3it [00:15,  5.10s/it]                       


In [28]:
# Share of tweets predicted as class 1 (mean of the 0/1 predictions).
rubert_preds.mean()

0.5167785234899329