# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

import torch
from torch.utils.data import Dataset, DataLoader


In [2]:
data = pd.read_csv("/kaggle/input/financial-sentiment-analysis/data.csv")
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [3]:
len(data)

5842

In [4]:
data['Sentiment'] = data['Sentiment'].map({'positive' : 1, 'negative' : 2, 'neutral' : 0})
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,1
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",2
2,"For the last quarter of 2010 , Componenta 's n...",1
3,According to the Finnish-Russian Chamber of Co...,0
4,The Swedish buyout firm has sold its remaining...,0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['Sentence'], data['Sentiment'], 
                                                    shuffle=True, random_state=42, 
                                                    stratify=data['Sentiment'])

# Tokenization

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
def tokenize_text(text):
    return tokenizer(text=text, max_length=128, padding=True, truncation=True, return_tensors='pt', add_special_tokens=True)

In [8]:
tokenized_train = tokenize_text(X_train.tolist())
tokenized_test = tokenize_text(X_test.tolist())

In [9]:
tokenized_train[:1]

{'input_ids': tensor([[  101,  1037,  2047, 24888,  3240,  2102,  2422,  3528,  1997, 10514,
           9626, 15987,  2368, 21877, 19648,  2072, 11625,  2104,  1996,  2012,
           4360,  4435,  2171,  2003,  2085,  2800,  2000, 10390,  1999,  6435,
           1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [10]:
tokenized_train['input_ids'].numpy().shape, tokenized_test['input_ids'].numpy().shape

((4381, 128), (1461, 76))

# Model

In [11]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Preparing the Pytorch Dataset

In [12]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key : torch.tensor(value[idx]) for key, value in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [13]:
train_dataset = SentimentDataset(tokenized_train, y_train.tolist())
test_dataset = SentimentDataset(tokenized_test, y_test.tolist())

train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16)

# Training

In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = AdamW(params=model.parameters(), lr=5e-5)

for epoch in range(8):
    model.train()
    
    for batch in train_dataloader:
        batch = {key : value.to(device) for key, value in batch.items()}
        
        outputs = model(**batch)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch {epoch+1}, Loss : {loss.item()} ')

Epoch 1, Loss : 0.39756518602371216 
Epoch 2, Loss : 0.8607915043830872 
Epoch 3, Loss : 0.17319455742835999 
Epoch 4, Loss : 0.18104881048202515 
Epoch 5, Loss : 0.170566588640213 
Epoch 6, Loss : 0.10558636486530304 
Epoch 7, Loss : 0.08808493614196777 
Epoch 8, Loss : 0.05237000808119774 


# Evaluation

In [15]:
model.eval()

def evaluate(dataloader):
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for batch in dataloader:
            batch = {key : value.to(device) for key, value in batch.items()}
            
            outputs = model(**batch)
            logits = outputs.logits
            
            predictions = torch.argmax(logits, dim=-1)
            correct_predictions += (predictions == batch['labels']).sum().tolist()
            total_predictions += len(batch['labels'])
            
        accuracy = correct_predictions / total_predictions
        return accuracy

test_accuracy = evaluate(test_dataloader)
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.7981


# Example Inference

In [16]:
def map_to_class(pred):
    maps = {1 : 'Positive', 2 : 'Negative', 0 : 'Neutral'}
    return maps.get(pred)

In [17]:
sentence = "The company had a great performance last quarter."
inputs = tokenizer(sentence, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()

print(f'Predicted class: {map_to_class(predicted_class)}')

Predicted class: Positive


In [18]:
# Example inference on one sentence
sentence = "The company had a bad performance last quarter."
inputs = tokenizer(sentence, return_tensors="pt").to(device)

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()

print(f'Predicted class: {map_to_class(predicted_class)}')

Predicted class: Neutral


In [20]:
model_save_path = 'finetuned_bert_for_financial_sentiment_analysis'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('finetuned_bert_for_financial_sentiment_analysis/tokenizer_config.json',
 'finetuned_bert_for_financial_sentiment_analysis/special_tokens_map.json',
 'finetuned_bert_for_financial_sentiment_analysis/vocab.txt',
 'finetuned_bert_for_financial_sentiment_analysis/added_tokens.json')