In [7]:
# Install into the environment of the running kernel: `%pip` targets the kernel's
# interpreter, whereas `!pip3` runs in a subshell that may resolve to a different Python.
%pip install transformers



In [8]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [3]:
# Clone the ProsusAI/finbert repository from the Hugging Face Hub into ./finbert.
# NOTE(review): load_model_tokenizer below loads "ProsusAI/finbert" by its hub identifier,
# so this local clone may not be needed — confirm before relying on it.
!git clone https://huggingface.co/ProsusAI/finbert

Cloning into 'finbert'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 49 (delta 23), reused 0 (delta 0)[K
Unpacking objects: 100% (49/49), done.


In [4]:
from finbert import *

In [9]:
from torch import nn

def predict_classification(test_data, model, tokenizer):
    '''
    Run a sequence-classification model over the 'text' column and return
    the per-class probabilities.

    test_data: DataFrame - columns: 'text'
    model: torch module whose output exposes `.logits`
    tokenizer: callable turning a list of strings into the keyword tensors `model` expects

    returns
    predictions: torch.Tensor - softmax over the last logits dimension,
                 one row per input text, detached from autograd
    '''
    tokenized = tokenizer(test_data['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Inference only: force eval mode so dropout cannot make predictions stochastic,
    # and restore the caller's mode afterwards so this helper has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        # no_grad avoids building an autograd graph nobody will backpropagate through.
        with torch.no_grad():
            outputs = model(**tokenized)
            predictions = nn.functional.softmax(outputs.logits, dim=-1)
    finally:
        model.train(was_training)
    return predictions

In [10]:
def load_model_tokenizer(path_to_model, path_to_tokenizer, sentiment_model_name="ProsusAI/finbert"):
    '''
    Load the message classifier and the sentiment model together with their tokenizers.

    path_to_model: str - checkpoint passed to BertForSequenceClassification.from_pretrained
    path_to_tokenizer: str - location passed to BertTokenizerFast.from_pretrained
    sentiment_model_name: str - identifier or path used for both the sentiment
        tokenizer and the sentiment model; defaults to "ProsusAI/finbert"

    returns
    (model, tokenizer, model_sent, tokenizer_sent)
    '''
    model = BertForSequenceClassification.from_pretrained(path_to_model)
    tokenizer = BertTokenizerFast.from_pretrained(path_to_tokenizer)

    # Tokenizer and model are loaded from the same identifier so they always match.
    tokenizer_sent = AutoTokenizer.from_pretrained(sentiment_model_name)
    model_sent = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)

    return model, tokenizer, model_sent, tokenizer_sent

In [19]:
# Load the fine-tuned message classifier (checkpoint + tokenizer) and the sentiment model.
classifier, tokenizer_classifier, model_sent, tokenizer_sent = load_model_tokenizer(
    path_to_model='/content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1200',
    path_to_tokenizer='/content/drive/MyDrive/Colab Notebooks/data/message_classification/tokenizer',
)

In [11]:
def _fill_probability_columns(results, probabilities, model):
    '''Copy per-label probabilities into the columns of `results` named after the model's labels.'''
    label_map = getattr(getattr(model, 'config', None), 'id2label', None) or {}
    for label_id, label in label_map.items():
        column = str(label).lower()
        index = int(label_id)
        # Only touch probability columns that exist in `results`; never overwrite the text.
        if column != 'text' and column in results.columns and 0 <= index < probabilities.shape[-1]:
            results[column] = probabilities[:, index].detach().cpu().numpy()


def predict_with_sentiment(dataset, classifier, tokenizer_classifier, sentiment_model, sentiment_tokenizer, decoded_predictions=True):
    '''
    Score every text with the message classifier and with the sentiment model.

    dataset: DataFrame - columns: 'text'
    classifier: trained classification model
    tokenizer_classifier: tokenizer used when training `classifier`
    sentiment_model: sentiment classification model
    sentiment_tokenizer: tokenizer matching `sentiment_model`
    decoded_predictions: currently unused; kept so existing callers keep working

    returns:
    results: DataFrame -
    columns:
        text
        finance, positive, neutral, negative - probability of the label with that
            name, filled only when the corresponding model's `config.id2label`
            contains that name (case-insensitive); otherwise left empty (NaN).
            NOTE(review): the earlier commented-out code took 'finance' from
            column 1 of classification_pred - unconfirmed, so it is not assumed here.
    classification_pred: tensor of class probabilities from `classifier`
    sentiment_pred: tensor of class probabilities from `sentiment_model`
    '''
    # Both models go through the same tokenise -> forward -> softmax helper;
    # no_grad keeps inference from building autograd graphs.
    with torch.no_grad():
        classification_pred = predict_classification(dataset, classifier, tokenizer_classifier)
        sentiment_pred = predict_classification(dataset, sentiment_model, sentiment_tokenizer)

    results = pd.DataFrame(columns=['text', 'finance', 'positive', 'neutral', 'negative'])
    results['text'] = dataset['text']

    # Resolve column order from each model's own label map instead of guessing indices.
    _fill_probability_columns(results, classification_pred, classifier)
    _fill_probability_columns(results, sentiment_pred, sentiment_model)

    return results, classification_pred, sentiment_pred

In [12]:
# Draw 20 random rows from the cleaned, balanced news file. A single seeded draw replaces
# the earlier shuffle-then-sample pair, so re-running the cell yields the same rows.
test_data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/data/message_classification/news_cleaned_balanced.csv'
).sample(n=20, random_state=0)

In [18]:
# One hand-written news paragraph used as a smoke test for the two models.
# Adjacent string literals are concatenated by the parser, so the source indentation
# no longer leaks into the text (backslash continuations inside a literal embed it).
test_data = pd.DataFrame({
    'text': [
        "Later that day Apple said it was revising down its earnings expectations in "
        "the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. "
        "The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours "
        "trading and the decline was extended to more than 10% when the market opened. The dollar fell "
        "by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering "
        "some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. "
        "Yields on government bonds fell as investors fled to the traditional haven in a market storm."
    ]
})

In [20]:
# Score the current test_data with both the message classifier and the sentiment model.
results, classification_pred, sentiment_pred = predict_with_sentiment(
    dataset=test_data,
    classifier=classifier,
    tokenizer_classifier=tokenizer_classifier,
    sentiment_model=model_sent,
    sentiment_tokenizer=tokenizer_sent,
)

In [21]:
# Softmax probabilities from the message classifier, as returned by predict_with_sentiment.
classification_pred

tensor([[0.8534, 0.1466]], grad_fn=<SoftmaxBackward>)

In [22]:
# Softmax probabilities from the sentiment model, as returned by predict_with_sentiment.
# NOTE(review): which column is positive / neutral / negative is not established in this
# notebook — confirm against the model's label mapping before interpreting.
sentiment_pred

tensor([[0.0075, 0.9729, 0.0195]], grad_fn=<SoftmaxBackward>)