In [31]:
pip install transformers torch scikit-learn matplotlib pandas




In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split

file_path = '/content/financial_phrase_bank_final.csv'
df = pd.read_csv(file_path)


texts = df['text'].tolist()
labels = df['label'].tolist()

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, stratify=labels)

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.metrics import classification_report
import torch

def evaluate_model_with_report(model_name, texts, labels, batch_size=32):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.eval()

    all_predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).numpy()
        all_predictions.extend(predictions)

    report = classification_report(labels, all_predictions, target_names=['negative', 'neutral', 'positive'], output_dict=True)
    return report, all_predictions

def evaluate_gpt2_model_with_report(texts, labels, batch_size=32):
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2ForSequenceClassification.from_pretrained(model_name)

    # Add padding token for GPT-2
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
    model.eval()

    all_predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).numpy()
        all_predictions.extend(predictions)

    report = classification_report(labels, all_predictions, target_names=['negative', 'neutral', 'positive'], output_dict=True)
    return report, all_predictions


In [24]:
# FinBERT model
finbert_model = "yiyanghkust/finbert-tone"
finbert_report, finbert_predictions = evaluate_model_with_report(finbert_model, val_texts, val_labels)
print("FinBERT Classification Report:")
print(classification_report(val_labels, finbert_predictions, target_names=['negative', 'neutral', 'positive']))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


FinBERT Classification Report:
              precision    recall  f1-score   support

    negative       0.08      0.49      0.14        61
     neutral       0.26      0.08      0.12       288
    positive       0.06      0.01      0.02       136

    accuracy                           0.11       485
   macro avg       0.13      0.20      0.10       485
weighted avg       0.18      0.11      0.10       485



The low performance of FinBERT on your sentiment analysis task can be attributed to several factors, including lack of fine-tuning, class imbalance, and possibly incorrect data preprocessing.

In [25]:
# GPT-2 model
gpt2_report, gpt2_predictions = evaluate_gpt2_model_with_report(val_texts, val_labels)
print("GPT-2 Classification Report:")
print(classification_report(val_labels, gpt2_predictions, target_names=['negative', 'neutral', 'positive']))


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT-2 Classification Report:
              precision    recall  f1-score   support

    negative       0.13      1.00      0.22        61
     neutral       0.00      0.00      0.00       288
    positive       0.00      0.00      0.00       136

    accuracy                           0.13       485
   macro avg       0.04      0.33      0.07       485
weighted avg       0.02      0.13      0.03       485



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The extremely low performance of GPT-2 for the sentiment analysis task is likely because GPT-2 is not primarily designed for classification tasks. GPT-2 is a generative language model, which means it's optimized to generate text rather than classify it. While it can be adapted for classification tasks, it usually requires additional fine-tuning on the specific task and dataset to perform well. I tried to use it because its opensource

In [26]:
#  BERT model
bert_model = "bert-base-uncased"
bert_report, bert_predictions = evaluate_model_with_report(bert_model, val_texts, val_labels)
print("BERT Classification Report:")
print(classification_report(val_labels, bert_predictions, target_names=['negative', 'neutral', 'positive']))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Classification Report:
              precision    recall  f1-score   support

    negative       0.11      0.77      0.19        61
     neutral       0.55      0.11      0.19       288
    positive       0.00      0.00      0.00       136

    accuracy                           0.16       485
   macro avg       0.22      0.30      0.13       485
weighted avg       0.34      0.16      0.14       485



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
