In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import torch

# Load dataset
file_path = '/content/financial_phrase_bank_final.csv'
df = pd.read_csv(file_path)

# Ensure labels are in the correct format
label_mapping = {1: 2, 2: 0, 0: 1}  # Negative: 0, Neutral: 1, Positive: 2
df['label'] = df['label'].map(label_mapping)

# Check for NaN values in the 'label' column after mapping
if df['label'].isnull().any():
    print("There are NaN values in the label column after mapping.")
else:
    print("No NaN values in the label column after mapping.")

# Check for and drop any rows with missing values
df.dropna(subset=['text', 'label'], inplace=True)

texts = df['text'].tolist()
labels = df['label'].tolist()

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, stratify=labels)


No NaN values in the label column after mapping.


In [13]:
def evaluate_model_with_report(model_name, texts, labels, batch_size=32):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.eval()

    all_predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).numpy()
        all_predictions.extend(predictions)

    report = classification_report(labels, all_predictions, target_names=['negative', 'neutral', 'positive'], output_dict=True)
    return report, all_predictions


In [14]:
def evaluate_gpt2_model_with_report(texts, labels, batch_size=32):
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2ForSequenceClassification.from_pretrained(model_name)

    # Add padding token for GPT-2
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
    model.eval()

    all_predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).numpy()
        all_predictions.extend(predictions)

    report = classification_report(labels, all_predictions, target_names=['negative', 'neutral', 'positive'], output_dict=True)
    return report, all_predictions


In [15]:
# FinBERT model
finbert_model = "yiyanghkust/finbert-tone"
finbert_report, finbert_predictions = evaluate_model_with_report(finbert_model, val_texts, val_labels)
print("FinBERT Classification Report:")
print(classification_report(val_labels, finbert_predictions, target_names=['negative', 'neutral', 'positive']))




FinBERT Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.92      0.81       288
     neutral       0.77      0.48      0.59       136
    positive       0.71      0.39      0.51        61

    accuracy                           0.73       485
   macro avg       0.73      0.60      0.63       485
weighted avg       0.73      0.73      0.71       485



In [16]:
#GPT-2 model
gpt2_report, gpt2_predictions = evaluate_gpt2_model_with_report(val_texts, val_labels)
print("GPT-2 Classification Report:")
print(classification_report(val_labels, gpt2_predictions, target_names=['negative', 'neutral', 'positive']))


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT-2 Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       288
     neutral       0.28      1.00      0.44       136
    positive       0.00      0.00      0.00        61

    accuracy                           0.28       485
   macro avg       0.09      0.33      0.15       485
weighted avg       0.08      0.28      0.12       485



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# BERT model
bert_model = "bert-base-uncased"
bert_report, bert_predictions = evaluate_model_with_report(bert_model, val_texts, val_labels)
print("BERT Classification Report:")
print(classification_report(val_labels, bert_predictions, target_names=['negative', 'neutral', 'positive']))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


BERT Classification Report:
              precision    recall  f1-score   support

    negative       0.55      0.04      0.08       288
     neutral       0.28      0.96      0.43       136
    positive       0.00      0.00      0.00        61

    accuracy                           0.29       485
   macro avg       0.28      0.33      0.17       485
weighted avg       0.40      0.29      0.17       485



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
#RoBERTa model
roberta_model = "roberta-base"
roberta_report, roberta_predictions = evaluate_model_with_report(roberta_model, val_texts, val_labels)
print("RoBERTa Classification Report:")
print(classification_report(val_labels, roberta_predictions, target_names=['neutral', 'negative', 'positive']))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa Classification Report:
              precision    recall  f1-score   support

     neutral       0.00      0.00      0.00       288
    negative       0.28      1.00      0.44       136
    positive       0.00      0.00      0.00        61

    accuracy                           0.28       485
   macro avg       0.09      0.33      0.15       485
weighted avg       0.08      0.28      0.12       485



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
