In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.metrics import classification_report
import torch

# Load datasets
training_file_path = '/Users/yaseminsilen/Downloads/tweet_financial_training_set_final.csv'
validation_file_path = '/Users/yaseminsilen/Downloads/tweet_financial_validation_set_final.csv'

training_data = pd.read_csv(training_file_path)
validation_data = pd.read_csv(validation_file_path)

# Ensure labels are in the correct format
label_mapping = {1: 2, 2: 0, 0: 1}  # Negative: 0, Neutral: 1, Positive: 2
training_data['label'] = training_data['label'].map(label_mapping)
validation_data['label'] = validation_data['label'].map(label_mapping)

# Check for NaN values in the 'label' column after mapping
if training_data['label'].isnull().any() or validation_data['label'].isnull().any():
    print("There are NaN values in the label column after mapping.")
else:
    print("No NaN values in the label column after mapping.")

# Check for and drop any rows with missing values
training_data.dropna(subset=['text', 'label'], inplace=True)
validation_data.dropna(subset=['text', 'label'], inplace=True)

# Convert to lists
train_texts = training_data['text'].tolist()
train_labels = training_data['label'].tolist()
val_texts = validation_data['text'].tolist()
val_labels = validation_data['label'].tolist()


No NaN values in the label column after mapping.


In [2]:
def evaluate_model_with_report(model_name, texts, labels, batch_size=32):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # Ensure the number of labels is set correctly
    model.eval()

    all_predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_predictions.extend(predictions)

    report = classification_report(labels, all_predictions, target_names=['negative', 'neutral', 'positive'], output_dict=True)
    return report, all_predictions

In [3]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
def evaluate_gpt2_model_with_report(texts, labels, batch_size=32):
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # Add padding token for GPT-2
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
    model.eval()

    all_predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_predictions.extend(predictions)

    report = classification_report(labels, all_predictions, target_names=['negative', 'neutral', 'positive'], output_dict=True)
    return report, all_predictions


In [4]:
# Evaluate FinBERT model
finbert_model = "yiyanghkust/finbert-tone"
finbert_report, finbert_predictions = evaluate_model_with_report(finbert_model, val_texts, val_labels)
print("FinBERT Classification Report:")
print(classification_report(val_labels, finbert_predictions, target_names=['negative', 'neutral', 'positive']))


  return self.fget.__get__(instance, owner)()


FinBERT Classification Report:
              precision    recall  f1-score   support

    negative       0.76      0.81      0.79      1566
     neutral       0.04      0.05      0.05       347
    positive       0.07      0.05      0.06       475

    accuracy                           0.55      2388
   macro avg       0.29      0.30      0.30      2388
weighted avg       0.52      0.55      0.53      2388



In [5]:
# Evaluate GPT-2 model
gpt2_report, gpt2_predictions = evaluate_gpt2_model_with_report(val_texts, val_labels)
print("GPT-2 Classification Report:")
print(classification_report(val_labels, gpt2_predictions, target_names=['negative', 'neutral', 'positive']))


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT-2 Classification Report:
              precision    recall  f1-score   support

    negative       0.65      0.52      0.57      1566
     neutral       0.00      0.00      0.00       347
    positive       0.20      0.49      0.29       475

    accuracy                           0.44      2388
   macro avg       0.28      0.33      0.29      2388
weighted avg       0.47      0.44      0.43      2388



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Evaluate BERT model
bert_model = "bert-base-uncased"
bert_report, bert_predictions = evaluate_model_with_report(bert_model, val_texts, val_labels)
print("BERT Classification Report:")
print(classification_report(val_labels, bert_predictions, target_names=['negative', 'neutral', 'positive']))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.10      0.17      1566
     neutral       0.00      0.00      0.00       347
    positive       0.19      0.82      0.31       475

    accuracy                           0.23      2388
   macro avg       0.23      0.31      0.16      2388
weighted avg       0.36      0.23      0.17      2388



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**FinBERT:**

High precision, recall, and F1-score for the negative class.
Poor performance for neutral and positive classes.
Shows a balanced performance but needs improvement for neutral and positive classes.

**GPT-2:**

High recall for the positive class.
Fails completely for the negative and neutral classes, leading to an overall very low accuracy.
Not suitable as it cannot distinguish between negative and neutral sentiments.

**BERT:**

Reasonably good performance for the negative class.
Mediocre performance for the neutral class and poor performance for the positive class.
Similar to FinBERT but with slightly lower performance metrics.


FinBERT shows a relatively balanced performance compared to GPT-2 and BERT, even though it struggles with the neutral and positive classes. GPT-2 and BERT have significant issues with classifying neutral and positive sentiments, making them less suitable without extensive further tuning.

It has the best overall balance, especially with negative sentiment classification, which is crucial in financial sentiment analysis.
It offers a more solid foundation to improve upon, particularly in enhancing its performance for neutral and positive classes.

In [7]:
pip install datasets



In [8]:
pip install transformers torch scikit-learn



In [9]:
pip install accelerate -U



In [10]:
pip install transformers[torch]



In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset

# Tokenize the data
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels}).map(tokenize_function, batched=True)
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels}).map(tokenize_function, batched=True)



Map:   0%|          | 0/9539 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [19]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from datasets import Dataset

# Define the metrics function to evaluate the model
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [21]:
# Define the Trainer
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone", num_labels=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [11]:
# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7247,0.501567,0.802764,0.808883,0.823074,0.802764
2,0.4135,0.44644,0.843384,0.843946,0.844954,0.843384


Evaluation results: {'eval_loss': 0.44643956422805786, 'eval_accuracy': 0.8433835845896147, 'eval_f1': 0.8439458213847024, 'eval_precision': 0.8449543300962101, 'eval_recall': 0.8433835845896147, 'eval_runtime': 10.2874, 'eval_samples_per_second': 232.128, 'eval_steps_per_second': 14.581, 'epoch': 2.0}


**Interpretation of Results**

**Training Loss:** Significant decrease from 0.724700 to 0.413500, indicating the model is learning effectively during training.

**Validation Loss:** Decrease from 0.501567 to 0.446440, showing improved generalization.

**Accuracy:** Increase from 0.802764 to 0.843384, indicating better overall performance on the validation set.

**F1 Score, Precision, and Recall:** All metrics have improved from Epoch 1 to Epoch 2, demonstrating balanced performance across different aspects of classification.

**High Accuracy and F1 Score:**

The high accuracy (0.843384) and F1 score (0.843946) in Epoch 2 suggest that the model is performing well in terms of both correctness and balance between precision and recall.

In [24]:
# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1059,1.194948,0.824539,0.828813,0.838711,0.824539
2,0.081,1.345668,0.847152,0.846864,0.846651,0.847152
3,0.0513,1.485655,0.848827,0.847787,0.847391,0.848827
4,0.0268,1.583484,0.84464,0.845779,0.84737,0.84464


KeyboardInterrupt: 


**Training and Validation Loss:**

Training loss consistently decreases, indicating that the model is learning the training data well.
Validation loss initially decreases but starts increasing significantly after Epoch 2, suggesting potential overfitting from Epoch 3 onwards.

**Accuracy and F1 Score:**

Accuracy and F1 score improve up to Epoch 3 but slightly decrease in Epoch 4, reinforcing the indication of overfitting.
Precision and recall follow a similar pattern.

**Next Steps After Stopping at Epoch 3**
Since stopping at Epoch 3 is optimal to prevent overfitting, here are the subsequent steps to further enhance the model's performance and ensure robust results:

*Hyperparameter Tuning:*

We will perform hyperparameter tuning to find the best settings for the model. We consider to use techniques like grid search or random search to explore different combinations of learning rates, batch sizes, and other hyperparameters.

*Cross-Validation:*

Conducting k-fold cross-validation to ensure the model's performance is consistent across different subsets of the data. This provides a more robust evaluation and helps in understanding the model's generalization capabilities.

*Feature Engineering:*

Exploring additional features that can be added to the model. For example, incorporating sentiment lexicons, financial indicators, or other domain-specific features might improve the model's performance.

*Regularization Techniques:*

Applying regularization techniques such as dropout or weight decay to prevent overfitting and improve generalization.

*Fine-Tuning on Additional Data:*

If more labeled data becomes available, we will continue fine-tuning the model to enhance its performance further.