## Loading libraries, connecting to Google Drive

In [1]:
# NOTE(review): this install is unpinned. The captured log of this run shows it resolved
# transformers 4.8.2 (with tokenizers 0.10.3) -- pin that version to reproduce these results.
!pip3 install transformers 

import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 7.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 35.5MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     

In [2]:
# Mount Google Drive at /content/drive; the checkpoint and CSV paths used later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
# Pandas display limits used when frames are rendered in this notebook.
for option_name, option_value in (
    ('display.max_rows', 20),
    ('display.max_columns', 20),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## Functions for predictions

In [4]:
import torch

class BertDataset(torch.utils.data.Dataset):
    """Torch dataset wrapping tokenizer encodings and, optionally, labels.

    encodings: mapping of feature name -> per-example sequences; must contain
        an "input_ids" entry, which defines the dataset length.
    labels: optional sequence with one label per example. When None (or empty)
        the returned items carry no "labels" key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus "labels" when available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None / emptiness explicitly: the truth value of a NumPy array or
        # pandas Series with more than one element is ambiguous and raises ValueError.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" encoding."""
        return len(self.encodings["input_ids"])

In [44]:
from torch.nn.functional import softmax
import re
import string

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))


def clean_string(s):
    """Normalise a raw message into lowercase, space-separated letters.

    Steps, in order: blank out @-mentions, blank out URLs, blank out every
    non-letter character, blank out digit runs, drop punctuation and lowercase,
    then collapse runs of spaces into one. Leading/trailing spaces are kept.
    """
    substitutions = (
        (r'\@\w+|\@', ' '),
        (r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' '),
        ('[^a-zA-Z]', ' '),
        (r'[0-9]+', ' '),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)
    lowered = s.translate(remove_punct_map).lower()
    return re.sub(' +', ' ', lowered)

def clean_data(data):
    """Apply clean_string to every entry of the 'text' column of ``data`` and return the result."""
    text_column = data['text']
    return text_column.apply(clean_string)

def load_model_tokenizer(path_to_model, path_to_tokenizer, sentiment_checkpoint="ProsusAI/finbert"):
    '''
    Load the classifier and the sentiment model together with their tokenizers.

    path_to_model: name or path passed to BertForSequenceClassification.from_pretrained
    path_to_tokenizer: name or path passed to BertTokenizerFast.from_pretrained
    sentiment_checkpoint: name or path used for both the sentiment tokenizer and the
                          sentiment model (defaults to the previously hard-coded
                          "ProsusAI/finbert")

    returns
    model, tokenizer, model_sent, tokenizer_sent - in that order
    '''
    model = BertForSequenceClassification.from_pretrained(path_to_model)
    tokenizer = BertTokenizerFast.from_pretrained(path_to_tokenizer)

    tokenizer_sent = AutoTokenizer.from_pretrained(sentiment_checkpoint)
    model_sent = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

    return model, tokenizer, model_sent, tokenizer_sent


def predict_classification(test_data, model, tokenizer):
    '''
    Run ``model`` over the 'text' column of ``test_data`` and return class probabilities.

    test_data: DataFrame - columns: 'text'
    model: model handed to Trainer for prediction
    tokenizer: tokenizer called on the list of texts (padding, truncation, max_length=512)

    returns
    predictions: numpy array - softmax of the raw predictions over the last axis
                 (callers index it as [row, class])
    '''
    tokenized = tokenizer(test_data['text'].tolist(), padding=True, truncation=True, max_length=512)
    test_trainer = Trainer(model)
    raw_pred, _, _ = test_trainer.predict(BertDataset(tokenized, None))
    logits = torch.from_numpy(np.asarray(raw_pred))
    # Name the axis explicitly: calling softmax without `dim` relies on a deprecated
    # implicit choice and emits a warning. The last axis is the class axis here.
    return softmax(logits, dim=-1).numpy()


def predict_with_sentiment(dataset, classifier, tokenizer_classifier, sentiment_model, sentiment_tokenizer, decoded_predictions=True):
    '''
    Score every text with the finance classifier and with the sentiment model.

    dataset: DataFrame - columns: 'text'
    classifier: trained classification model
    tokenizer_classifier: tokenizer matching ``classifier``
    sentiment_model: sentiment classification model
    sentiment_tokenizer: tokenizer matching ``sentiment_model``
    decoded_predictions: currently unused; kept so existing callers keep working

    returns:
    predictions: DataFrame - indexed like ``dataset``
    columns:
        text
        finance_proba - probability of class 1 of the classifier
        positive - probability of positive sentiment
        neutral - -//-
        negative - -//-
    '''
    # Both passes are the same tokenize -> Trainer.predict -> softmax pipeline,
    # so reuse predict_classification instead of repeating it inline.
    classification_pred = predict_classification(dataset, classifier, tokenizer_classifier)
    sentiment_pred = predict_classification(dataset, sentiment_model, sentiment_tokenizer)

    # Resolve the sentiment column order from the model config when it names the
    # labels; otherwise fall back to the original hard-coded order
    # (0 = positive, 1 = negative, 2 = neutral).
    config = getattr(sentiment_model, 'config', None)
    raw_label2id = getattr(config, 'label2id', None) or {}
    label2id = {str(label).lower(): int(index) for label, index in raw_label2id.items()}
    positive_idx = label2id.get('positive', 0)
    negative_idx = label2id.get('negative', 1)
    neutral_idx = label2id.get('neutral', 2)

    results = pd.DataFrame(
        {
            'text': dataset['text'],
            'finance_proba': classification_pred[:, 1],
            'positive': sentiment_pred[:, positive_idx],
            'neutral': sentiment_pred[:, neutral_idx],
            'negative': sentiment_pred[:, negative_idx],
        },
        index=dataset.index,
    )

    return results

## Models loading

In [7]:
# Fine-tuned classifier checkpoint from Drive, tokenizer from the 'bert-base-uncased' name;
# the sentiment model/tokenizer pair is returned by the same helper.
classifier, tokenizer_classifier, model_sent, tokenizer_sent = load_model_tokenizer(
    '/content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-4200',
    'bert-base-uncased',
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=252.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=758.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=437992753.0, style=ProgressStyle(descri…




## Prediction

In [57]:
# Evaluation set stored on the mounted Drive.
test_data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/data/message_classification/dataset_for_testing_v2.csv'
)

In [58]:
# Seed the draw so the evaluation batch (and the metrics computed from it) is reproducible,
# and cap the size so a frame with fewer than 200 rows does not raise.
test_data_batch = test_data.sample(n=min(200, len(test_data)), random_state=42)

In [59]:
# Classifier probability plus sentiment probabilities for every sampled text.
results = predict_with_sentiment(
    dataset=test_data_batch,
    classifier=classifier,
    tokenizer_classifier=tokenizer_classifier,
    sentiment_model=model_sent,
    sentiment_tokenizer=tokenizer_sent,
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 200
  Batch size = 8


No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 200
  Batch size = 8




In [61]:
# Render the prediction table; the display.max_rows option set earlier truncates the view.
results

Unnamed: 0,text,finance_proba,positive,neutral,negative
9634,ted is the only possibility to stop trump says...,0.995019,0.054270,0.900015,0.045715
9648,britain is currently only launching strikes ag...,0.000156,0.040631,0.449064,0.510305
106,in fiskars cash flow from operating activities...,0.995056,0.950055,0.023019,0.026926
1446,ulefos group is the leading supplier of manhol...,0.995006,0.711260,0.282133,0.006607
1513,currently yit builds a housing estate zapadnye...,0.995043,0.086874,0.903513,0.009613
...,...,...,...,...,...
7684,a defeat in saturdays election would keep the ...,0.994883,0.067182,0.295905,0.636912
4960,she leads among those who say they arent plann...,0.005780,0.060370,0.770263,0.169367
2766,activities range from the development of natur...,0.995047,0.035639,0.949661,0.014700
7664,hillary clinton looks at this through the lens...,0.995009,0.071151,0.914571,0.014279


## Evaluating the model

In [62]:
from sklearn.metrics import classification_report

# Binarise the classifier probability at `threshold` in one vectorized comparison
# (instead of building a row Series per example with .iloc) and compare against
# the 'finance' ground-truth column of the sampled batch.
threshold = 0.5
y_pred = (results['finance_proba'] > threshold).astype(int).tolist()
y_test = test_data_batch['finance'].map(int)

print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.80      0.36      0.50       102
           1       0.58      0.91      0.71        98

    accuracy                           0.63       200
   macro avg       0.69      0.64      0.60       200
weighted avg       0.69      0.63      0.60       200

