In [1]:
!pip install -q -U transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoTokenizer, 
                          AutoModelForSequenceClassification,
                          TrainingArguments, 
                          Trainer, 
                          AutoModelForMaskedLM,AutoConfig)

from datasets import load_dataset
from datasets import Dataset

from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

2025-05-28 19:17:21.347661: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748459841.534410      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748459841.589863      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
print(f"transformers=={transformers.__version__}")

transformers==4.52.3


In [6]:
model_name = "FacebookAI/roberta-base"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the label mapping
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# Load the model and specify the number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
filename = "../input/sentiment-analysis-for-financial-news/all-data.csv"

df = pd.read_csv(filename, 
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment], 
                                    train_size=300,
                                    test_size=300, 
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

In [8]:
# Also, ensure your label mapping uses the dictionary defined above for consistency
for df in [X_train, X_test, X_eval]:
    df["labels"] = df.sentiment.map(label2id) # Use the label2id dictionary

In [9]:
train_data = Dataset.from_pandas(X_train)
test_data = Dataset.from_pandas(X_test)
eval_data = Dataset.from_pandas(X_eval)

In [10]:
#define the tokenizer function
def tokenizer_function(x):
    return tokenizer(
        x["text"],
        padding="max_length",  
        truncation=True,       
        max_length=512,
        return_tensors="pt"
    )

In [11]:
train_data = train_data.map(tokenizer_function, batched=True).remove_columns(['text', 'sentiment'])
test_data = test_data.map(tokenizer_function, batched=True).remove_columns(['text', 'sentiment'])
eval_data = eval_data.map(tokenizer_function, batched=True).remove_columns(['text', 'sentiment'])

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [12]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
 
# Metric helper method
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(labels, predictions, average="weighted")
    return {"f1": score}

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # The predictions are raw logits
    predictions = np.argmax(predictions, axis=1)

    # Calculate weighted F1 score
    f1 = f1_score(labels, predictions, average="weighted")

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)

    # Return both metrics in a dictionary
    return {"f1": f1, "accuracy": accuracy}

In [13]:
#define training arguments 
train_batch, val_batch = (8, 8)
lr = 2e-5
n_epochs = 40

training_args = TrainingArguments(
    output_dir=f"fine_tuned_{model_name.split('/')[1]}",
    learning_rate=lr,
    per_device_train_batch_size=train_batch,
    per_device_eval_batch_size=val_batch,
    num_train_epochs=n_epochs,
    lr_scheduler_type="linear",
    optim="adamw_torch",
    logging_strategy="epoch",
    eval_strategy="epoch",        
    save_strategy="epoch",        
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    bf16=True,                   
    bf16_full_eval=True,          
    push_to_hub=False,
    report_to="none"            
)


In [14]:
#Create a Trainer instance
trainer = Trainer(
    model=model,                         # The pre-trained model
    args=training_args,                  # Training arguments
    train_dataset=train_data,            # Tokenized training dataset
    eval_dataset=test_data,              # Tokenized test dataset
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.7756,0.481427,0.779003,0.79
2,0.3803,0.530373,0.844697,0.845556
3,0.2686,0.696157,0.846988,0.852222
4,0.2178,0.70123,0.860767,0.861111
5,0.1118,0.841965,0.847201,0.845556
6,0.0784,0.844842,0.864154,0.864444
7,0.0517,0.872078,0.873001,0.873333
8,0.0393,0.881956,0.87023,0.871111
9,0.0214,0.914487,0.875678,0.876667
10,0.0122,0.950562,0.872614,0.873333


TrainOutput(global_step=4520, training_loss=0.05291666332797139, metrics={'train_runtime': 3392.5519, 'train_samples_per_second': 10.611, 'train_steps_per_second': 1.332, 'total_flos': 9472083038208000.0, 'train_loss': 0.05291666332797139, 'epoch': 40.0})

In [16]:
evaluation_results = trainer.evaluate(eval_data)
print(f"Evaluation Results -  f1 score: {evaluation_results['eval_f1']:0.5f} | accuracy: {evaluation_results['eval_accuracy']:0.5f}")

Evaluation Results -  f1 score: 0.86435 | accuracy: 0.86667


In [17]:
# Save the trained model 
model.save_pretrained("./saved_model")
# Save the tokenizer
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')