# Developing the Sentiment Analysis Model for Financial Sentiment Data

## Dataset Loading and Preprocessing

### Loading the Financial Sentiment Dataset and Encoding the Sentiment Labels

In [2]:
import pandas as pd

# Integer encoding of the three sentiment classes; these ids are used as the
# classification targets further down the notebook.
SENTIMENT_TO_ID = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

financial_sentiment_data = pd.read_csv('Financial_Sentiment_Analysis_Dataset.csv')

# Series.map turns every value that is not a key of the dict into NaN without
# complaining. Check for that here so a typo or an unexpected class in the CSV
# fails immediately with a readable message instead of later as a dtype error.
sentiment_ids = financial_sentiment_data['Sentiment'].map(SENTIMENT_TO_ID)
unmapped_labels = financial_sentiment_data.loc[sentiment_ids.isna(), 'Sentiment']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected values in 'Sentiment' column: "
        f"{sorted(unmapped_labels.astype(str).unique())}; "
        f"expected one of {sorted(SENTIMENT_TO_ID)}"
    )

financial_sentiment_data['Sentiment'] = sentiment_ids

financial_sentiment_data

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,2
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",0
2,"For the last quarter of 2010 , Componenta 's n...",2
3,According to the Finnish-Russian Chamber of Co...,1
4,The Swedish buyout firm has sold its remaining...,1
...,...,...
5837,RISING costs have forced packaging producer Hu...,0
5838,Nordic Walking was first used as a summer trai...,1
5839,"According shipping company Viking Line , the E...",1
5840,"In the building and home improvement trade , s...",1


### Dataset Tokenization

In [3]:
from transformers import BertTokenizer
from datasets import Dataset
import numpy as np

# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Mapping with a 'text' entry holding the sentence(s) to
            tokenize (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for those sentences. Sequences longer than the
        tokenizer's limit are truncated, and shorter ones are padded up to a
        fixed maximum length so all rows have the same size.
    """
    sentences = examples['text']
    return tokenizer(sentences, truncation=True, padding="max_length")

# Give the columns the names used by tokenize_function ('text') and by the
# later cells ('label'), and store the labels as int32.
#
# This is written as rename + astype on the renamed column instead of in-place
# edits so the cell can be re-run: DataFrame.rename skips keys that are no
# longer present, whereas indexing the old 'Sentiment' column a second time
# would raise a KeyError.
financial_sentiment_data = (
    financial_sentiment_data
    .rename(columns={'Sentence': 'text', 'Sentiment': 'label'})
    .astype({'label': np.int32})
)

# Convert to a Hugging Face Dataset and tokenize it in batches.
model_tuning_dataset = Dataset.from_pandas(financial_sentiment_data)
tokenized_data = model_tuning_dataset.map(tokenize_function, batched=True)

tokenized_data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/5842 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5842
})

### Train-Test Split

In [4]:
from torch.utils.data import DataLoader

# Fixed seed so the 70/30 split (and therefore every metric reported below)
# is the same on each Restart & Run All.
SPLIT_SEED = 42

financial_sentiment_data_split = tokenized_data.train_test_split(test_size=0.3, seed=SPLIT_SEED)
train_dataset = financial_sentiment_data_split['train']
test_dataset = financial_sentiment_data_split['test']

# Only these columns are returned as torch tensors; the raw 'text' column is
# left out of the formatted output.
model_input_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
train_dataset.set_format(type='torch', columns=model_input_columns)
test_dataset.set_format(type='torch', columns=model_input_columns)

## BERT Model Fine-Tuning

### Model Definition

In [5]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW

# Class-index <-> name mapping, matching the integer encoding applied to the
# 'Sentiment' column when the dataset is loaded (negative=0, neutral=1,
# positive=2). Storing it in the model config means the saved model reports
# readable class names instead of generic LABEL_0/1/2.
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained BERT encoder with a newly initialised 3-way classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Model Training

In [6]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: Pair of (predictions, labels). `predictions` holds one row
            of per-class scores per example; `labels` holds the true class ids.

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'. The last three
        are support-weighted averages over the classes.
    """
    class_scores, labels = eval_pred
    # Predicted class = index of the highest score in each row.
    predicted_ids = np.argmax(class_scores, axis=1)

    # zero_division=0 keeps the same numeric result as the default (0 for an
    # undefined precision/recall) but without emitting a warning every time a
    # class happens to be absent from the predictions.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predicted_ids,
        average='weighted',
        zero_division=0,
    )

    return {
        'accuracy': accuracy_score(labels, predicted_ids),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    # --- output locations and logging ---
    output_dir='./bert-model-results',
    logging_dir='./logs',
    logging_steps=50,
    report_to=[],  # empty list: no reporting integrations requested

    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # --- evaluate and checkpoint once per epoch, then restore the epoch with
    #     the highest F1 when training finishes ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# NOTE(review): eval_dataset is the held-out test split, so the best-checkpoint
# selection above and the metrics reported later are computed on the same data.
# Consider carving out a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4352,0.452395,0.804335,0.810723,0.827324,0.804335
2,0.3772,0.466922,0.798631,0.791185,0.789339,0.798631
3,0.2299,0.549308,0.807758,0.795976,0.796575,0.807758


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=1536, training_loss=0.3943191412836313, metrics={'train_runtime': 1316.0447, 'train_samples_per_second': 9.321, 'train_steps_per_second': 1.167, 'total_flos': 3227612295269376.0, 'train_loss': 0.3943191412836313, 'epoch': 3.0})

### Model Evaluation

In [7]:
# Run one evaluation pass over the trainer's eval dataset and list every
# returned metric with four decimal places.
metrics = trainer.evaluate()

print("\nFinal Evaluation Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

  return forward_call(*args, **kwargs)



Final Evaluation Metrics:
eval_loss: 0.4524
eval_accuracy: 0.8043
eval_f1: 0.8107
eval_precision: 0.8273
eval_recall: 0.8043
eval_runtime: 47.3378
eval_samples_per_second: 37.0320
eval_steps_per_second: 4.6470
epoch: 3.0000


In [8]:
from sklearn.metrics import classification_report
import torch

print("\nDetailed Classification Report:")
model.eval()

# Per-example class scores and true labels for the test split.
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Display names in class-id order (0 = negative, 1 = neutral, 2 = positive).
label_names = ['Negative', 'Neutral', 'Positive']

# `labels` pins the report to class ids 0..2 explicitly. Without it,
# target_names is matched against whichever classes happen to appear in
# y_true/y_pred, which raises if a class is missing from the split.
# zero_division=0 reports 0 (without a warning) for a class with no
# predicted or no true samples.
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(len(label_names))),
    target_names=label_names,
    zero_division=0,
))


Detailed Classification Report:


  return forward_call(*args, **kwargs)


              precision    recall  f1-score   support

    Negative       0.54      0.78      0.64       247
     Neutral       0.90      0.79      0.84       943
    Positive       0.83      0.85      0.84       563

    accuracy                           0.80      1753
   macro avg       0.76      0.80      0.77      1753
weighted avg       0.83      0.80      0.81      1753



### Saving the Trained Model

In [9]:
# Write the fine-tuned model and its tokenizer into the same directory so the
# pair can be reloaded together from one path.
saved_model_dir = './saved_sentiment_model'
trainer.save_model(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

print("Model and tokenizer saved to './saved_sentiment_model'")

Model and tokenizer saved to './saved_sentiment_model'
