In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import torch 
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("data/news/bitcoin.csv")
df = df[["date", "title", "text"]]
df.head()

Unnamed: 0,date,title,text
0,"Thu, 21 Apr 2022 10:25:31 -0400",Mexican Football Club Tigres Now Accepts Bitco...,Tigres supporters can purchase match tickets w...
1,"Thu, 21 Apr 2022 10:14:42 -0400","SFLMaven To Accept Bitcoin As Payment, Add BTC...",Luxury jewelry reseller SFLMaven announced the...
2,"Thu, 21 Apr 2022 09:37:51 -0400","Bitcoin, Ethereum Technical Analysis: BTC up t...","BTC was once again trading higher, as bullish ..."
3,"Wed, 20 Apr 2022 20:30:00 -0400",Bitcoin And A World Of Rules Without Rulers,Bitcoin offers an opportunity for society to m...
4,"Wed, 20 Apr 2022 18:05:58 -0400",U.S Treasury Sanctions Russian Bitcoin Miners,The Treasury department announced sanctions ag...


In [3]:
def process_batch(batch):
  tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
  modelInput = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
  return model(**modelInput)

inputs = df["title"] + " " + df["text"]
inputs = inputs.tolist()

model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

step = 100
results = np.empty((0, 3))
for i in range(0, len(inputs)-step, step):
  batch = inputs[i:i+step]
  outputs = process_batch(batch)
  results = np.vstack((results, torch.nn.functional.softmax(outputs.logits, dim = -1).detach().numpy()))

print(len(results))
print(len(inputs) - i)

# don't forget the remaining data from inputs
outputs = process_batch(inputs[i+step:])
results = np.vstack((results, torch.nn.functional.softmax(outputs.logits, dim = -1).detach().numpy()))

3500
126


In [4]:
model_labels = model.config.id2label

results_sentiment_confidence = np.max(results, axis=1)
results_label_indices = np.argmax(results, axis=1)
results_labelled = list(map(lambda x : model_labels[x], results_label_indices))

df["sentiment"] = results_labelled
df["sentiment confidence"] = results_sentiment_confidence

df.to_csv("results_BERT.csv")