In [1]:
!pip install transformers



In [16]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [17]:
# Load the tokenizer and the sequence-classification model from the same
# Hugging Face checkpoint so their vocabularies are guaranteed to match.
FINBERT_CHECKPOINT = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [24]:
# Read the financial-reports workbook (path is relative to the notebook's
# working directory) into a DataFrame.
df = pd.read_excel(io="data_financial reports.xlsx")

In [69]:
# Extract the column at position 4 of `df` as a standalone NumPy array.
# NOTE(review): per the combined frame displayed later in this notebook this is
# the `content` (report text) column, not headlines -- confirm against the
# workbook layout.
df_array = df.iloc[:, 4].to_numpy(copy=True)

In [70]:
# Cast every entry to a string and collect them in a plain Python list,
# one element per row of the source column.
content_list = [entry for entry in df_array.astype(str)]

In [72]:
# Size of each text slice, measured in CHARACTERS (not tokens).
max_length = 512
def get_sentiment(text):
    """Score a (possibly long) text with the model, one chunk at a time.

    The text is cut into consecutive slices of at most ``max_length``
    characters; each slice is tokenized and classified independently.

    Parameters
    ----------
    text : str
        The document to score.

    Returns
    -------
    list of torch.Tensor
        One tensor of class probabilities (softmax over the last dimension of
        the logits) per chunk, in document order. Each chunk is tokenized as a
        single string, so each tensor is expected to have shape
        ``(1, num_labels)``. An empty ``text`` yields an empty list.

    Notes
    -----
    Slicing is done on characters, so a chunk boundary can fall in the middle
    of a word or sentence.
    """
    # Split the input text into fixed-width character slices.
    chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
    # Upper bound on tokens the model can accept; used only as a safety net
    # for slices that tokenize to unusually many tokens.
    token_limit = model.config.max_position_embeddings
    scores = []
    # Inference only: without no_grad every returned tensor would keep its
    # autograd graph alive, and memory would grow with each scored chunk.
    with torch.no_grad():
        for chunk in chunks:
            # Encode the chunk; truncate rather than crash if it exceeds the
            # model's positional limit.
            inputs = tokenizer(
                chunk,
                return_tensors='pt',
                truncation=True,
                max_length=token_limit,
                padding=True,
            )
            # Use the model to predict the sentiment logits, then normalise
            # them into probabilities.
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            scores.append(probs)
    return scores

In [73]:
def get_mean_from_proba(proba_list):
    """Average per-chunk class probabilities into one vector.

    Parameters
    ----------
    proba_list : list of torch.Tensor
        Probability tensors of identical 2-D shape ``(rows, num_classes)``,
        one per chunk.

    Returns
    -------
    torch.Tensor
        1-D tensor of length ``num_classes`` holding the mean probability of
        each class over every row of every chunk.

    Raises
    ------
    ValueError
        If ``proba_list`` is empty (there is nothing to average).
    """
    if len(proba_list) == 0:
        raise ValueError("proba_list is empty; cannot average zero chunk probabilities")
    with torch.no_grad():
        # (chunks, rows, classes) after stacking.
        stacks = torch.stack(proba_list)
        # Flatten to (chunks * rows, classes). `reshape` replaces the
        # deprecated non-in-place `Tensor.resize`, which warns on every call
        # and silently discards data when rows > 1.
        stacks = stacks.reshape(-1, stacks.shape[-1])
        mean = stacks.mean(dim=0)
    return mean

In [74]:
# Score every document: get the per-chunk probabilities, average them into a
# single probability vector, and store that vector as a plain list of floats.
lst_tensor = []
for text in content_list:
    sentiment_score = get_sentiment(text)
    mean = get_mean_from_proba(sentiment_score)
    lst_tensor.append(mean.tolist())



In [75]:
# Take the column names from the model's own index -> label mapping instead of
# hard-coding their order, so the headers cannot drift from the logits layout.
# For the checkpoint loaded above this is expected to give
# "Positive", "Negative", "Neutral" -- the names used previously.
label_columns = [
    model.config.id2label[label_id].capitalize()
    for label_id in range(model.config.num_labels)
]
sentiment_df = pd.DataFrame(lst_tensor, columns=label_columns)
sentiment_df

Unnamed: 0,Positive,Negative,Neutral
0,0.019599,0.035842,0.944560
1,0.033025,0.181488,0.785487
2,0.019108,0.039280,0.941612
3,0.045987,0.030162,0.923851
4,0.022585,0.038042,0.939373
...,...,...,...
3716,0.450548,0.060577,0.488875
3717,0.040637,0.157746,0.801617
3718,0.025207,0.364944,0.609849
3719,0.042157,0.149861,0.807983


In [83]:
# Place the first six columns of the source frame side by side with the
# sentiment probabilities; rows are matched on their index labels.
final_df = pd.concat(
    objs=[df.iloc[:, :6], sentiment_df],
    axis="columns",
)

In [84]:
# Display the combined frame (source columns followed by the sentiment
# probability columns) as this cell's output.
final_df

Unnamed: 0,date,stock_name,report_type,report_section,content,Note,Positive,Negative,Neutral
0,2018-02-01,AAPL,8-K,8-K|2.02,"On February 1, 2018, Apple Inc. (“Apple”) issu...",,0.019599,0.035842,0.944560
1,2018-02-14,AAPL,8-K,8-K|5.07,The Annual Meeting of Shareholders of Apple In...,,0.033025,0.181488,0.785487
2,2018-05-01,AAPL,8-K,8-K|2.02,"On May 1, 2018, Apple Inc. (“Apple”) issued a ...",,0.019108,0.039280,0.941612
3,2018-05-07,AAPL,8-K,8-K|8.01,"On April 30, 2018, the Superior Court of the S...",,0.045987,0.030162,0.923851
4,2018-05-08,AAPL,8-K,8-K|8.01,"On May 7, 2018, Apple Inc. filed a Current Rep...",,0.022585,0.038042,0.939373
...,...,...,...,...,...,...,...,...,...
3716,2023-03-17,WMT,10-K,10-K|Part2|7,The Walmart U.S. segment comprises the largest...,Inventories,0.450548,0.060577,0.488875
3717,2023-03-17,WMT,10-K,10-K|Part2|7,Intangible assets acquired in a business combi...,Indefinite-Lived Intangible Assets,0.040637,0.157746,0.801617
3718,2023-03-17,WMT,10-K,10-K|Part2|7,We are involved in a number of legal proceedin...,Contingencies,0.025207,0.364944,0.609849
3719,2023-03-17,WMT,10-K,10-K|Part2|7,Income taxes have a significant effect on our ...,Income Taxes,0.042157,0.149861,0.807983


In [78]:
# Persist the combined frame to an Excel workbook in the working directory.
final_df.to_excel(excel_writer='output_FinBert.xlsx')

### References

1. FinBERT: https://colab.research.google.com/drive/1C6_ahu0Eps_wLKcsfspEO0HIEouND-oI?usp=sharing#scrollTo=UcIdpwL4DABS
2. FinBERT: https://wandb.ai/ivangoncharov/FinBERT_Sentiment_Analysis_Project/reports/Financial-Sentiment-Analysis-on-Stock-Market-Headlines-With-FinBERT-Hugging-Face--VmlldzoxMDQ4NjM0
3. Chunking long text: https://github.com/rohan-paul/MachineLearning-DeepLearning-Code-for-my-YouTube-Channel/blob/master/NLP/FinBERT_Long_Text.ipynb
4. Chunking long text: https://www.youtube.com/watch?v=WEAAs_0etJQ