In [1]:
!pip3 install transformers 

import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments



In [37]:
import torch

class BertDataset(torch.utils.data.Dataset):   
    def __init__(self, encodings, labels=None):          
        self.encodings = encodings        
        self.labels = labels
     
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.encodings["input_ids"])

In [43]:
from scipy.special import softmax

def load_model_tokenizer(path_to_model, path_to_tokenizer):

    model = BertForSequenceClassification.from_pretrained(path_to_model)
    tokenizer = BertTokenizerFast.from_pretrained(path_to_tokenizer)

    tokenizer_sent = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model_sent = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    return model, tokenizer, model_sent, tokenizer_sent


def predict_classification(test_data, model, tokenizer):
    '''
    test_trainer: Trainer - best model for predictions
    test_data: DataFrame - columns: 'text'

    returns
    y_pred: list - predictions
    '''
    tokenized = tokenizer(test_data['text'].tolist(), padding=True, truncation=True, max_length=512)
    test_trainer = Trainer(model) 
    raw_pred, _, _ = test_trainer.predict(BertDataset(tokenized, None)) 
    predictions = softmax(raw_pred)
    return predictions


def predict_with_sentiment(dataset, classifier, tokenizer_classifier, sentiment_model, sentiment_tokenizer, decoded_predictions=True):
    '''
    dataset: DataFrame - columns: 'text', 'finance'; train+validation
    model: trained BERT best model
    tokenizer: tokenizer used in training

    returns:
    predictions: DataFrame - 
    columns: 
        text 
        finance - probability of classification
        positive - probability of positive sentiment
        neutral - -//-
        negative - -//-
    '''
    
    classification_pred = predict_classification(dataset, classifier, tokenizer_classifier)
    
    tokenized = sentiment_tokenizer(dataset['text'].tolist(), padding=True, truncation=True, max_length=512)
    
    trainer = Trainer(sentiment_model)
    raw_pred, _, _ = trainer.predict(BertDataset(tokenized, None))
    sentiment_pred = softmax(raw_pred)

    # understand sentiment outputs

    results = pd.DataFrame(columns=['text', 'finance', 'positive', 'neutral', 'negative'])
    results['text'] = dataset['text']
    # results['finance'] = classification_pred[:, 1]

    return results, classification_pred, sentiment_pred

In [6]:
classifier, tokenizer_classifier, model_sent, tokenizer_sent = load_model_tokenizer('/content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-700', 'bert-base-uncased')

In [86]:
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/message_classification/tiny_test_data.csv')
# test_data = test_data.sample(frac=1)

In [87]:
# test_data_batch = test_data.sample(n=100)
test_data_batch = test_data

In [88]:
results, classification_pred, sentiment_pred = predict_with_sentiment(test_data_batch, classifier, tokenizer_classifier, model_sent, tokenizer_sent)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 4
  Batch size = 8


No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 4
  Batch size = 8


In [89]:
classification_pred

array([[3.1994109e-04, 1.7064339e-01],
       [1.6339759e-04, 3.9258066e-01],
       [2.3938485e-03, 1.3213388e-02],
       [4.2058048e-01, 1.0506948e-04]], dtype=float32)

In [90]:
y_pred = np.argmax(classification_pred, axis=1)

In [91]:
y_test = test_data_batch['finance'].map(int)

In [92]:
y_pred

array([1, 1, 1, 0])

In [93]:
from sklearn.metrics import classification_report

print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.67      1.00      0.80         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4



In [94]:
sentiment_pred

array([[0.01307078, 0.0396    , 0.09352222],
       [0.02233841, 0.00364368, 0.27937737],
       [0.02086242, 0.0071566 , 0.24362586],
       [0.00836449, 0.02399694, 0.24444129]], dtype=float32)

In [54]:
test_data_batch

Unnamed: 0.1,Unnamed: 0,text,finance
4371,4371,border guard service has banned the mooring of...,1
5936,5936,they have always been bjp supporters tharoor r...,0
460,460,to be number one means creating added value fo...,1
5474,5474,obvious that they won answer some questions l...,0
2621,2621,the deal is subject to approval by the norwegi...,1
9587,9587,modi finally talks about ambani adani and kejr...,0
7609,7609,better then our australian space program which...,0
8844,8844,regret using all vacation wait getting hit ca...,0
264,264,press release october ruukki has signed a cont...,1
3016,3016,adp news nov finnish paper packaging and fore...,1
