### **FINANCE NEWS - SENTIMENT ANALYSIS**

**Financial PhraseBank:** A dataset containing financial news sentences annotated with sentiment labels.

**sentences_allagree:** A specific subset of this dataset where every annotator provided the same sentiment label for each sentence. There are 2264 entries in this dataset.

For the sentences_allagree subset, the labels are encoded as follows:


*   0: Negative sentiment
*   1: Neutral sentiment
*   2: Positive sentiment

In [None]:
!pip install datasets

In [7]:
import os
import shutil

# Define the directory to be cleared
output_dir = '/kaggle/working/'

# Function to clear a directory
def clear_directory(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

# Clear the output directory
clear_directory(output_dir)

# Verify the directory is empty
print("Directory contents after clearing:", os.listdir(output_dir))

import torch

# Clear CUDA cache
torch.cuda.empty_cache()

print("CUDA cache cleared.")


Directory contents after clearing: []
CUDA cache cleared.


In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from datasets import load_dataset
import re
from bs4 import BeautifulSoup
import html
import random
from sklearn.metrics import accuracy_score
import pandas as pd
device = 'cuda'


# Define a directory to save the model in Google Drive
output_dir = '//kaggle/working/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [None]:
# Load dataset
dataset = load_dataset('financial_phrasebank', 'sentences_allagree')
df = pd.DataFrame(dataset['train'])

In [None]:
# Load dataset
dataset = load_dataset('financial_phrasebank', 'sentences_75agree')
df = pd.DataFrame(dataset['train'])

In [None]:
dataset

In [None]:
# Extract model input and output
texts = dataset['train']['sentence']
labels = dataset['train']['label']

# Split the dataset into 90% training and 10% validation
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)

### **Load the stored model**

In [None]:
# Load the saved model and tokenizer
print("Loading model from %s" % output_dir)

model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Move the model to the appropriate device
model = model.to(device)

### **Create a new model and train**

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=3)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

In [None]:
# Tokenize input
print('Tokenizing the input...')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')

#Convert to tensors
print('Converting to tensors...')
train_inputs = train_encodings['input_ids'].to(device)
train_masks = train_encodings['attention_mask'].to(device)
train_outputs = torch.tensor(train_labels).to(device)

#Create DataLoader
print('Loading the data...')
train_dataset = TensorDataset(train_inputs, train_masks, train_outputs)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)

In [None]:
#Training loop
print('Training...')
model.train()
for epoch in range(30):
  total_loss = 0
  for step, batch in enumerate(train_dataloader):
    b_input_ids, b_input_mask, b_labels = batch
    optimizer.zero_grad()
    outputs = model(b_input_ids, labels=b_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f"Epoch {epoch + 1} --> Total Loss: {total_loss}")

In [None]:
# Define a directory to save the model in Google Drive
output_dir = '//kaggle/working/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save a trained model, configuration and tokenizer
print("Saving model to %s" % output_dir)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

### **TESTING THE MODEL WITH THE VALIDATION DATA**

In [None]:
# Tokenize output
print('Tokenizing the output...')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')

#Convert to tensors
print('Converting to tensors...')
val_inputs = val_encodings['input_ids'].to(device)
val_masks = val_encodings['attention_mask'].to(device)
val_outputs = torch.tensor(val_labels).to(device)

#Create DataLoader
print('Loading the data...')
val_dataset = TensorDataset(val_inputs, val_masks, val_outputs)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=32)

In [None]:
print('Evaluating...')
model.eval()
val_pred_labels, val_true_labels = [], []
with torch.no_grad():
    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = batch
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits
        val_pred_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())
        val_true_labels.extend(b_labels.cpu().numpy())

accuracy = accuracy_score(val_true_labels, val_pred_labels)
print(f'Validation Accuracy: {accuracy}')



In [None]:
# Evaluate the model over 30 loops
num_loops = 30
accuracies = []

for _ in range(num_loops):
    model.eval()
    val_pred_labels, val_true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            val_pred_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())
            val_true_labels.extend(b_labels.cpu().numpy())

    # Calculate accuracy for this loop
    accuracy = accuracy_score(val_true_labels, val_pred_labels)
    accuracies.append(accuracy)

# Calculate the average accuracy
average_accuracy = np.mean(accuracies)
print(f'Average Validation Accuracy over {num_loops} loops: {average_accuracy}')


# Baseline Testing (bert-large-uncased)

Clear workspace/cache

In [None]:
#clearing cache & workspace before each run

import os
import shutil

# Define the directory to be cleared
output_dir = '/kaggle/working/'

# Function to clear a directory
def clear_directory(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

# Clear the output directory
clear_directory(output_dir)

# Verify the directory is empty
print("Directory contents after clearing:", os.listdir(output_dir))

import torch

# Clear CUDA cache
torch.cuda.empty_cache()

print("CUDA cache cleared.")

Accuracy loop

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import numpy as np

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load dataset
dataset = load_dataset('financial_phrasebank', 'sentences_allagree')
texts = dataset['train']['sentence']
labels = dataset['train']['label']

# Split the dataset into 90% training and 10% validation
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=3)
model = model.to(device)

# Tokenize the validation data
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')

# Convert to tensors
val_inputs = val_encodings['input_ids'].to(device)
val_masks = val_encodings['attention_mask'].to(device)
val_outputs = torch.tensor(val_labels).to(device)

# Create DataLoader
val_dataset = TensorDataset(val_inputs, val_masks, val_outputs)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=32)

# Evaluate the model over 30 loops
num_loops = 30
accuracies = []

for _ in range(num_loops):
    model.eval()
    val_pred_labels, val_true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            val_pred_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())
            val_true_labels.extend(b_labels.cpu().numpy())

    # Calculate accuracy for this loop
    accuracy = accuracy_score(val_true_labels, val_pred_labels)
    accuracies.append(accuracy)

# Calculate the average accuracy
average_accuracy = np.mean(accuracies)
print(f'Average Validation Accuracy over {num_loops} loops: {average_accuracy}')


# Combined Function

Combined Baseline

In [None]:
def bert_large_test(data):
  dataset=load_dataset('financial_phrasebank', 'sentences_75agree')
 
  # Extract model input and output
  texts = dataset['train']['sentence']
  labels = dataset['train']['label']
 
  # Split the dataset into 90% training and 10% validation
  train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)
 
  tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
 
  model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=3)
  model = model.to(device)
 
  optimizer = AdamW(model.parameters(), lr=1e-5)
 
  # Evaluation
  model.eval()
  val_pred_labels, val_true_labels = [], []
  with torch.no_grad():
      for batch in val_dataloader:
          b_input_ids, b_input_mask, b_labels = batch
          outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
          logits = outputs.logits
          val_pred_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())
          val_true_labels.extend(b_labels.cpu().numpy())
 
  # Find misclassified examples
  misclassified_examples = []
  for i in range(len(val_true_labels)):
      if val_true_labels[i] != val_pred_labels[i]:
          misclassified_examples.append((val_texts[i], val_true_labels[i], val_pred_labels[i]))
 
  # Print some misclassified examples
  print("Misclassified examples:")
  for text, true_label, pred_label in misclassified_examples[:5]:
      print(f"Text: {text}")
      print(f"True label: {true_label}")
      print(f"Predicted label: {pred_label}")
  return accuracy_score(val_true_labels, val_pred_labels)

In [None]:
Combined Training

In [9]:
def bert_large_test(data):
  dataset=load_dataset('financial_phrasebank', data)
 
  # Extract model input and output
  texts = dataset['train']['sentence']
  labels = dataset['train']['label']
 
  # Split the dataset into 90% training and 10% validation
  train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)
 
  tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
 
  model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=3)
  model = model.to(device)
 
  optimizer = AdamW(model.parameters(), lr=1e-5)
 
  # Tokenize input
  train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
 
  #Convert to tensors
  train_inputs = train_encodings['input_ids'].to(device)
  train_masks = train_encodings['attention_mask'].to(device)
  train_outputs = torch.tensor(train_labels).to(device)
 
  #Create DataLoader
  train_dataset = TensorDataset(train_inputs, train_masks, train_outputs)
  train_sampler = RandomSampler(train_dataset)
  train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)
 
  #Training loop
  model.train()
  for epoch in range(30):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
      b_input_ids, b_input_mask, b_labels = batch
      optimizer.zero_grad()
      outputs = model(b_input_ids, labels=b_labels)
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
    print(f"Epoch {epoch + 1} --> Total Loss: {total_loss}")
 
    # Tokenize output
  val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')
 
  #Convert to tensors
  val_inputs = val_encodings['input_ids'].to(device)
  val_masks = val_encodings['attention_mask'].to(device)
  val_outputs = torch.tensor(val_labels).to(device)
 
  #Create DataLoader
  val_dataset = TensorDataset(val_inputs, val_masks, val_outputs)
  val_sampler = SequentialSampler(val_dataset)
  val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=32)
 
  # Evaluation
  model.eval()
  val_pred_labels, val_true_labels = [], []
  with torch.no_grad():
      for batch in val_dataloader:
          b_input_ids, b_input_mask, b_labels = batch
          outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
          logits = outputs.logits
          val_pred_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())
          val_true_labels.extend(b_labels.cpu().numpy())
 
  # Find misclassified examples
  misclassified_examples = []
  for i in range(len(val_true_labels)):
      if val_true_labels[i] != val_pred_labels[i]:
          misclassified_examples.append((val_texts[i], val_true_labels[i], val_pred_labels[i]))
 
  # Print some misclassified examples
  print("Misclassified examples:")
  for text, true_label, pred_label in misclassified_examples[:5]:
      print(f"Text: {text}")
      print(f"True label: {true_label}")
      print(f"Predicted label: {pred_label}")
  return accuracy_score(val_true_labels, val_pred_labels)

In [10]:
data = 'sentences_75agree'
# Call the bert_large_test function
bert_large_test(data)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1 --> Total Loss: 84.81674826145172
Epoch 2 --> Total Loss: 57.49486793577671
Epoch 3 --> Total Loss: 31.330335520207882
Epoch 4 --> Total Loss: 21.327181842178106
Epoch 5 --> Total Loss: 12.293487310409546
Epoch 6 --> Total Loss: 8.036630846560001
Epoch 7 --> Total Loss: 5.304461032152176
Epoch 8 --> Total Loss: 4.985310522606596
Epoch 9 --> Total Loss: 2.336323172552511
Epoch 10 --> Total Loss: 3.396993040689267
Epoch 11 --> Total Loss: 3.1719777225516737
Epoch 12 --> Total Loss: 1.7226171565125696
Epoch 13 --> Total Loss: 0.7230826548184268
Epoch 14 --> Total Loss: 0.7050131310825236
Epoch 15 --> Total Loss: 3.035689419368282
Epoch 16 --> Total Loss: 1.754714650567621
Epoch 17 --> Total Loss: 0.5950087097007781
Epoch 18 --> Total Loss: 1.0077748318435624
Epoch 19 --> Total Loss: 0.4295839100959711
Epoch 20 --> Total Loss: 0.47974305824027397
Epoch 21 --> Total Loss: 0.3834576520312112
Epoch 22 --> Total Loss: 0.06556349530001171
Epoch 23 --> Total Loss: 0.04994536090816837
Epo

0.8554913294797688

### **TESTING THE DATA WITH CUSTOM DATA**

In [None]:
# Function to predict labels for custom inputs
def predict_custom_sentences(sentences, model, tokenizer, device):
    # Tokenize the input sentences
    encodings = tokenizer(sentences, truncation=True, padding=True, return_tensors='pt')

    # Move the encodings to the appropriate device
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    # Put the model in evaluation mode
    model.eval()

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Get the predicted labels
    preds = torch.argmax(logits, dim=1).cpu().numpy()

    return preds

# Function to map label integers to words
def label_to_word(label):
    label_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}
    return label_dict.get(label, 'unknown')

In [None]:
# Example usage
custom_sentences = [
    "The company reported a significant increase in revenue.",
    "There are concerns about the sustainability of the growth.",
    "The new product launch has been very successful.",
    "Despite a challenging market environment, the company's strategic decisions have led to considerable improvements in their financial performance.",
    "The recent partnership with a major tech firm is expected to drive innovation and increase market share in the coming years.",
    "My profit last year was $10. This year it is $8.",
    "My profit last year was $10. This year it is reduced to $8"
]

# Ensure the model and tokenizer are already loaded and configured
predicted_labels = predict_custom_sentences(custom_sentences, model, tokenizer, device)

# Map predicted labels to words
predicted_labels_words = [label_to_word(label) for label in predicted_labels]

# Create a DataFrame
df_predictions = pd.DataFrame({
    'Financial News': custom_sentences,
    'Predicted Label': predicted_labels_words
})

# Display the DataFrame
print(df_predictions)

BERT and other transformer-based models are pre-trained on large corpora and are adept at capturing the nuances of language, but they aren't explicitly designed to handle numerical reasoning or arithmetic operations. When dealing with sentences containing numerical data, the model might rely more on the surrounding context and words rather than understanding the numerical relationships.