# Notebook for fine-tuning BERT, FinBERT and DistilRoBERTa

In [5]:

# Load libraries
import pandas as pd

import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

from datasets import Dataset

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Attach Google Drive to the Colab filesystem; the fine-tuned models are saved
# under this mount point by the save cells further down.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Read the balanced sentiment CSV once; the data-preparation cell works on a copy.
data_path = "data/balanced_sentiment_data_small.csv"
data = pd.read_csv(data_path)


## Data preparation

In [7]:
# Work on a copy of the frame loaded above (`data`) so the raw data stays
# untouched and this cell can be re-run without reading the CSV again.
df = data.copy()

# Map the string sentiment labels to integer class ids.
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["sentiment"].map(label_mapping)

# `.map` yields NaN for any value that is not a key of `label_mapping`; fail here
# with the offending values instead of an opaque NaN-to-int cast error below.
unmapped_mask = df["label"].isna()
if unmapped_mask.any():
    unexpected = sorted(df.loc[unmapped_mask, "sentiment"].astype(str).unique())
    raise ValueError(
        f"Unexpected sentiment values (not in label_mapping): {unexpected}"
    )
df["label"] = df["label"].astype(int)

# Split into Train/Validation/Test as roughly 70/15/15 %.
# Step 1: Train+Val vs Test (15% test), stratified on the label.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df["label"],
    shuffle=True,
    random_state=42,
)

# Step 2: Train vs Val (from the remaining 85% -> 15% val overall).
# The literal is kept as-is: changing it would change the split sizes.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1765,               # 0.1765 * 0.85 ≈ 0.15 of full data
    stratify=train_val_df["label"],
    shuffle=True,
    random_state=42,
)

# Record the split assignment on the full frame; rows not selected for
# val or test keep the default "train".
df["split"] = "train"
df.loc[val_df.index, "split"] = "val"
df.loc[test_df.index, "split"] = "test"

# Re-derive the three frames from the split column so each one follows the
# row order of `df` and carries a fresh 0..n-1 index.
train_df = df[df["split"] == "train"].reset_index(drop=True)
val_df   = df[df["split"] == "val"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

# Every row must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# count the values for each label in each of train, val, test
print(train_df["label"].value_counts(), "\n")
print(val_df["label"].value_counts(), "\n")
print(test_df["label"].value_counts(), "\n")


label
1    2689
2    2689
0    2689
Name: count, dtype: int64 

label
1    577
0    577
2    576
Name: count, dtype: int64 

label
2    577
1    576
0    576
Name: count, dtype: int64 



## Set up and finetune BERT model

In [9]:
# Build Hugging Face datasets from the split frames, keeping only text + label.
model_columns = ["text", "label"]
train_ds = Dataset.from_pandas(train_df[model_columns])
val_ds   = Dataset.from_pandas(val_df[model_columns])
test_ds  = Dataset.from_pandas(test_df[model_columns])

# Checkpoint used for the plain BERT run.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token length every example is padded / truncated to; later cells reuse it.
max_length = 128


def tokenize_function(batch):
    """Tokenize ``batch["text"]`` with the BERT tokenizer.

    Every example is padded or truncated to ``max_length`` tokens.
    """
    encode_kwargs = dict(padding="max_length", truncation=True, max_length=max_length)
    return tokenizer(batch["text"], **encode_kwargs)


# Tokenize each split and drop the raw text column, one split at a time.
train_ds_tok, val_ds_tok, test_ds_tok = (
    split.map(tokenize_function, batched=True).remove_columns(["text"])
    for split in (train_ds, val_ds, test_ds)
)

# Switch the output format of all three tokenized splits to torch.
for tokenized_split in (train_ds_tok, val_ds_tok, test_ds_tok):
    tokenized_split.set_format("torch")

num_labels = 3

# Load the checkpoint with a classification head sized for `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)


Map:   0%|          | 0/8067 [00:00<?, ? examples/s]

Map:   0%|          | 0/1730 [00:00<?, ? examples/s]

Map:   0%|          | 0/1729 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#  a function which will compute evaluation metrics

def compute_metrics(eval_pred):
    """Compute accuracy, macro-averaged and per-class scores for the 3-class task.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of ``logits`` over the last axis; ``labels``
            holds the integer class ids (0 = negative, 1 = neutral, 2 = positive).

    Returns:
        dict with the keys ``accuracy``, ``macro_f1``, ``macro_precision``,
        ``macro_recall``, ``f1_neg``, ``f1_neu`` and ``f1_pos``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)

    # Macro scores. Left without an explicit `labels=` so the value used for
    # best-model selection is computed exactly as before.
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )

    # Per-class F1. Pinning `labels` guarantees a length-3 result in the order
    # [0, 1, 2] even when a class is missing from both `labels` and `preds`;
    # without it the array shrinks and the indices below would point at the
    # wrong class or raise IndexError.
    class_ids = [0, 1, 2]
    _, _, f1_cls, _ = precision_recall_fscore_support(
        labels, preds, labels=class_ids, average=None, zero_division=0
    )

    return {
        "accuracy": acc,
        "macro_f1": f1_macro,
        "macro_precision": precision_macro,
        "macro_recall": recall_macro,
        "f1_neg": f1_cls[0],
        "f1_neu": f1_cls[1],
        "f1_pos": f1_cls[2],
    }

In [12]:
# Training configuration and Trainer for the BERT run.

batch_size = 16
num_epochs = 3

bert_training_settings = dict(
    output_dir="./bert_base_uncased_fin_sentiment",
    # Evaluate on the validation split and write a checkpoint after each epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Once training ends, reload the checkpoint with the highest macro F1.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # Batch sizes and number of passes over the training split.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    # Logging.
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)
training_args = TrainingArguments(**bert_training_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds_tok,
    eval_dataset=val_ds_tok,
)

In [None]:

# Fine-tune BERT on the training split. With the arguments configured above,
# the validation split is evaluated at the end of every epoch. The value
# returned by train() is the last expression, so it is shown as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall,F1 Neg,F1 Neu,F1 Pos
1,0.4647,0.423956,0.830636,0.828342,0.832561,0.830661,0.868268,0.775355,0.841402
2,0.2773,0.441124,0.844509,0.844617,0.844861,0.844504,0.874459,0.813036,0.846356
3,0.0949,0.631966,0.846821,0.846413,0.846461,0.84683,0.87575,0.811003,0.852487


TrainOutput(global_step=1515, training_loss=0.32272061164623045, metrics={'train_runtime': 578.8904, 'train_samples_per_second': 41.806, 'train_steps_per_second': 2.617, 'total_flos': 1591901955608832.0, 'train_loss': 0.32272061164623045, 'epoch': 3.0})

In [13]:
# Save the fine-tuned BERT model to the mounted Google Drive
save_dir_bert = "/content/drive/MyDrive/final_models_kur/bert_base_uncased_finetuned"
# Save model weights + config via the trainer
trainer.save_model(save_dir_bert)

# Save the tokenizer into the same directory so the model and the tokenizer
# it was fine-tuned with are stored together
tokenizer.save_pretrained(save_dir_bert)

# Confirm the target directory in the cell output
print("Saved to:", save_dir_bert)





Saved to: /content/drive/MyDrive/final_models_kur/bert_base_uncased_finetuned


## Set up and finetune FinBERT

In [14]:

# Checkpoint and tokenizer for the FinBERT run.
model_name_finbert = "ProsusAI/finbert"
tokenizer_finbert = AutoTokenizer.from_pretrained(model_name_finbert)

# Token length every example is padded / truncated to.
max_length = 128


def tokenize_finbert(batch):
    """Tokenize ``batch["text"]`` with the FinBERT tokenizer.

    Every example is padded or truncated to ``max_length`` tokens.
    """
    return tokenizer_finbert(
        batch["text"], padding="max_length", truncation=True, max_length=max_length
    )


# Tokenize the text + label datasets and drop the raw text column per split.
train_ds_finbert, val_ds_finbert, test_ds_finbert = (
    split.map(tokenize_finbert, batched=True).remove_columns(["text"])
    for split in (train_ds, val_ds, test_ds)
)

# Switch the output format of all three tokenized splits to torch.
for finbert_split in (train_ds_finbert, val_ds_finbert, test_ds_finbert):
    finbert_split.set_format("torch")



tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/8067 [00:00<?, ? examples/s]

Map:   0%|          | 0/1730 [00:00<?, ? examples/s]

Map:   0%|          | 0/1729 [00:00<?, ? examples/s]

In [15]:

# Class-id <-> name tables passed to the FinBERT config; ids follow the
# integer labels used in the training data.
# NOTE(review): these override whatever label mapping ships with the
# checkpoint's own config. Confirm the checkpoint's class order is also
# 0=negative, 1=neutral, 2=positive; if it is not, the pretrained head starts
# fine-tuning with permuted classes.
finbert_id2label = {0: "negative", 1: "neutral", 2: "positive"}
finbert_label2id = {name: class_id for class_id, name in finbert_id2label.items()}

# Load the FinBERT checkpoint with a 3-class classification head.
model_finbert = AutoModelForSequenceClassification.from_pretrained(
    model_name_finbert,
    num_labels=3,
    id2label=finbert_id2label,
    label2id=finbert_label2id,
)

batch_size = 16
num_epochs = 3

training_args_finbert = TrainingArguments(
    output_dir="./finbert_finetuned",
    logging_dir="./logs_finbert",
    logging_steps=50,
    report_to="none",
    # Batch sizes and number of passes over the training split.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint every epoch; keep the best macro-F1 checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
)



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [17]:

# Trainer for FinBERT: trains on the tokenized train split, evaluates on the
# tokenized validation split, and scores with the shared compute_metrics.
trainer_finbert = Trainer(
    args=training_args_finbert,
    model=model_finbert,
    compute_metrics=compute_metrics,
    eval_dataset=val_ds_finbert,
    train_dataset=train_ds_finbert,
)

In [18]:
# Fine-tune FinBERT on the training split. The value returned by train()
# is the last expression, so it is shown as the cell output.
trainer_finbert.train()



Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall,F1 Neg,F1 Neu,F1 Pos
1,0.4209,0.452065,0.833526,0.829302,0.838663,0.833564,0.881439,0.759369,0.847097
2,0.2633,0.413381,0.855491,0.855553,0.855856,0.855487,0.885641,0.819588,0.86143
3,0.091,0.621608,0.857803,0.857686,0.857592,0.857811,0.888505,0.816042,0.868512


TrainOutput(global_step=1515, training_loss=0.30860772357128635, metrics={'train_runtime': 603.3686, 'train_samples_per_second': 40.11, 'train_steps_per_second': 2.511, 'total_flos': 1591901955608832.0, 'train_loss': 0.30860772357128635, 'epoch': 3.0})

In [19]:
# Save the fine-tuned FinBERT model and its tokenizer into one Google Drive folder

save_dir_finbert = "/content/drive/MyDrive/final_models_kur/finbert_finetuned_new"

# Model first, tokenizer second: the tokenizer call is the last expression,
# so the file paths it returns are shown as the cell output.
trainer_finbert.save_model(save_dir_finbert)
tokenizer_finbert.save_pretrained(save_dir_finbert)

('/content/drive/MyDrive/final_models_kur/finbert_finetuned_new/tokenizer_config.json',
 '/content/drive/MyDrive/final_models_kur/finbert_finetuned_new/special_tokens_map.json',
 '/content/drive/MyDrive/final_models_kur/finbert_finetuned_new/vocab.txt',
 '/content/drive/MyDrive/final_models_kur/finbert_finetuned_new/added_tokens.json',
 '/content/drive/MyDrive/final_models_kur/finbert_finetuned_new/tokenizer.json')

## Set up and finetune DistilRoBERTa

In [20]:
# Checkpoint and tokenizer for the DistilRoBERTa run.
model_name_distil = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer_distil = AutoTokenizer.from_pretrained(model_name_distil)


def tokenize_distil(batch):
    """Tokenize ``batch["text"]`` with the DistilRoBERTa tokenizer.

    Every example is padded or truncated to ``max_length`` tokens;
    ``max_length`` is the module-level value set in an earlier cell.
    """
    return tokenizer_distil(
        batch["text"], padding="max_length", truncation=True, max_length=max_length
    )


# Tokenize the text + label datasets (train_ds, val_ds, test_ds) and drop the
# raw text column per split.
train_ds_distil, val_ds_distil, test_ds_distil = (
    split.map(tokenize_distil, batched=True).remove_columns(["text"])
    for split in (train_ds, val_ds, test_ds)
)

# Switch the output format of all three tokenized splits to torch.
for distil_split in (train_ds_distil, val_ds_distil, test_ds_distil):
    distil_split.set_format("torch")



tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/8067 [00:00<?, ? examples/s]

Map:   0%|          | 0/1730 [00:00<?, ? examples/s]

Map:   0%|          | 0/1729 [00:00<?, ? examples/s]

In [21]:
# Training configuration for the DistilRoBERTa run.

batch_size = 16
num_epochs = 3

distil_training_settings = {
    "output_dir": "./distilroberta_financial_finetuned",
    # Evaluate and checkpoint every epoch; keep the best macro-F1 checkpoint.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "macro_f1",
    "greater_is_better": True,
    # Batch sizes and number of passes over the training split.
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "num_train_epochs": num_epochs,
    # Logging.
    "logging_dir": "./logs_distil",
    "logging_steps": 50,
    "report_to": "none",
}
training_args_distil = TrainingArguments(**distil_training_settings)

In [22]:

# Load the financial DistilRoBERTa checkpoint as-is: no num_labels or label
# mapping is passed, so the classification head comes from the checkpoint config.
model_distil = AutoModelForSequenceClassification.from_pretrained(model_name_distil)

# Trainer for DistilRoBERTa, scored with the same compute_metrics as the other runs.
trainer_distil = Trainer(
    args=training_args_distil,
    model=model_distil,
    compute_metrics=compute_metrics,
    eval_dataset=val_ds_distil,
    train_dataset=train_ds_distil,
)


config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

In [23]:
# Fine-tune DistilRoBERTa on the training split. The value returned by train()
# is the last expression, so it is shown as the cell output.
trainer_distil.train()

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall,F1 Neg,F1 Neu,F1 Pos
1,0.4472,0.437708,0.831792,0.829636,0.834875,0.831834,0.874251,0.772296,0.842361
2,0.3089,0.411118,0.856647,0.856691,0.85673,0.856654,0.886383,0.814879,0.86881
3,0.1697,0.509283,0.854913,0.854798,0.85474,0.854927,0.878472,0.815009,0.870912


TrainOutput(global_step=1515, training_loss=0.34491729508138724, metrics={'train_runtime': 322.5247, 'train_samples_per_second': 75.036, 'train_steps_per_second': 4.697, 'total_flos': 801475171635456.0, 'train_loss': 0.34491729508138724, 'epoch': 3.0})

In [25]:
# Save the fine-tuned DistilRoBERTa model and its tokenizer into one Google Drive folder
save_dir_distillroberta = "/content/drive/MyDrive/final_models_kur/distillroberta_finetuned_new"

# Model first, tokenizer second: the tokenizer call is the last expression,
# so the file paths it returns are shown as the cell output.
trainer_distil.save_model(save_dir_distillroberta)
tokenizer_distil.save_pretrained(save_dir_distillroberta)

('/content/drive/MyDrive/final_models_kur/distillroberta_finetuned_new/tokenizer_config.json',
 '/content/drive/MyDrive/final_models_kur/distillroberta_finetuned_new/special_tokens_map.json',
 '/content/drive/MyDrive/final_models_kur/distillroberta_finetuned_new/vocab.json',
 '/content/drive/MyDrive/final_models_kur/distillroberta_finetuned_new/merges.txt',
 '/content/drive/MyDrive/final_models_kur/distillroberta_finetuned_new/added_tokens.json',
 '/content/drive/MyDrive/final_models_kur/distillroberta_finetuned_new/tokenizer.json')