# CS614 Assignment 1 - LLM Training Code

In [1]:
!pip install transformers peft evaluate datasets

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate, torch, numpy as np, time, transformers
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset, get_dataset_split_names

## **Dataset**:
The dataset is obtained from https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment. The dataset consists of 11,931 finance-related tweets and is used to train and evaluate the performance of sequence classification models on sentiment classification.

## **Task:**
Summarise news articles using the selected LLM.

In [3]:
#Load dataset
ds = load_dataset("zeroshot/twitter-financial-news-sentiment")


README.md: 0.00B [00:00, ?B/s]

sent_train.csv: 0.00B [00:00, ?B/s]

sent_valid.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/9543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [4]:
#Get split names
get_dataset_split_names("zeroshot/twitter-financial-news-sentiment")

['train', 'validation']

In [54]:
#load train, validation and test dataset
train = ds["train"].shuffle(seed=42)
val = ds["validation"].shuffle(seed=42)

train_size, val_size = int(0.8*len(train)), len(train)-int(0.8*len(train))
train_dataset = train.select(range(train_size))
val_dataset = train.select(range(train_size,))
test_dataset = val

In [55]:
#check the attributes (features) of dataset
train.features

{'text': Value('string'), 'label': Value('int64')}

`text`: Financial-related tweet
<br>`label`: Reference sentiment (0: Bearish, 1: Bullish, 2: Neutral)

In [56]:
#check random subset of data
train[19]

{'text': 'Extreme Networks +3% after $100M buyback', 'label': 1}

In [57]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


## **Import Model:**

BERT (Sequence Classification model of 110 million parameters) is used to perform this sentiment classification task. bert-cased variant is used as Capitalisation of letters can convey different meanings in tweets such as company names, emotions.

In [58]:
model_name = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [68]:
#create functions to tokenize and compute evaluation metric
def tokenize_text(tweets):
  return tokenizer(tweets["text"], return_tensors="pt", padding="max_length").to("cuda")

accuracy = evaluate.load("accuracy")
f1_macro = evaluate.load("f1")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  # return accuracy.compute(predictions=predictions, references=labels)
  return {"Accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"], "F1_macro":f1_macro.compute(predictions=predictions, references=labels, average="macro")["f1"]}

In [69]:
train_dataset = train_dataset.map(tokenize_text, batched=True)
val_dataset = val_dataset.map(tokenize_text, batched=True)
test_dataset = test_dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/7634 [00:00<?, ? examples/s]

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

## Evaluation: Accuracy and F1-Score (macro)
Accuracy - determines how well the model performs overall in classifying the tweet sentiment correctly
macro-F1 - measures how well the model can classify each sentiment class accurately, by averaging F1 over number of classes. This is robust against datasets with class imbalances (which is useful as tweets dataset have the majority class of neutral tweets and hence, the model has a higher probability of getting higher accuracy in predicting neutral when it is unsure). If minority F1 score low, it will show in the macro F1 score.

In [73]:
#set baseline hyperparameters - using TrainingArguments default values
base_training_args = TrainingArguments(
    report_to="none",
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0, #regularisation - same effect as dropout (reduce overfitting by reducing weights)
    warmup_ratio=0,
    gradient_accumulation_steps=1,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    logging_dir='./logs',
    logging_steps=10,
    logging_strategy="epoch",
    output_dir="test_trainer",
    eval_strategy="epoch")

In [74]:
#training default settings
def model_instance():
    transformers.set_seed(42) #to initialise model at same checkpoint
    return AutoModelForSequenceClassification.from_pretrained(model_name, dtype="auto", num_labels=3).to(device)

base_trainer = Trainer(
    model=model_instance(),
    args=base_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
def train_LLM(trainer_class):
    start_time = time.time()
    trainer_class.train()
    end_time = time.time()
    time_taken = end_time - start_time
    print(time_taken)

In [76]:
train_LLM(base_trainer)

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.7359,0.431106,0.854467,0.790453
2,0.4534,0.1994,0.937385,0.910903
3,0.2547,0.126894,0.96699,0.952601


3038.1695165634155


In [79]:
#evaluate performance of base BERT on test dataset
pred_output = base_trainer.predict(test_dataset)
pred_logits = pred_output.predictions
true_labels = pred_output.label_ids
test_metrics = pred_output.metrics #get metrics
print(test_metrics)

{'test_loss': 0.5871909856796265, 'test_Accuracy': 0.8584589614740369, 'test_F1_macro': 0.806901445472986, 'test_runtime': 67.9133, 'test_samples_per_second': 35.162, 'test_steps_per_second': 4.403}


In [84]:
from google.colab import drive
drive.mount('/content/drive')
# save model
directory_for_base_bert = "/content/drive/MyDrive/CS614_models/bert-trained-financial-tweet-sentiment"
base_trainer.save_model(directory_for_base_bert)
# base_model = AutoModelForSequenceClassification.from_pretrained(directory_for_base_bert)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Tune hyperparameters (Full fine tune)

In [80]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate",[1e-5, 5e-5]),
        "weight_decay":  trial.suggest_float("weight_decay", 0.0, 0.1),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.1),
    }

# objective to MAXIMIZE (macro-F1). Note: keys are usually prefixed with "eval_"
def compute_objective(metrics):
    return metrics.get("eval_macro_f1", metrics["macro_f1"])

# 5) Trainer + HPO call
optuna_args = TrainingArguments(
    output_dir="out",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    logging_steps=10,
)

tuned_trainer = Trainer(
      args=optuna_args,
      model=model_instance(),           # <-- important for fair trials
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      compute_metrics=compute_metrics,
      model=model_instance(),
      data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
  )

best = trainer.hyperparameter_search(
    direction="maximize",            # <-- this is your “maximum”
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=20,                     # start small; scale up if needed
    compute_objective=compute_objective,
)

print(best)               # BestRun(hyperparameters=..., objective=...)
print(best.hyperparameters, best.objective)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
training_args = TrainingArguments(
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01, #regularisation - same effect as dropout (reduce overfitting by reducing weights)
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    logging_dir='./logs',
    logging_steps=10,
    output_dir="test_trainer",
    eval_strategy="epoch",
    load_best_model_at_end=True)

In [None]:
#training based on hyperparameters stated in previous cell
tuned_trainer = Trainer(
    model=model_instance(),
    args=base_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

### LoRA finetuning

In [None]:
lora_config = LoraConfig(
    r = 8 # low-rank (as BERT is considered small)
    lora_alpha=32,  # Scaling factor for LORA
    target_modules=["query", "value"],  # Target modules to apply LORA
    lora_dropout=0.1,  # Dropout rate for LORA
    bias="none",  # Bias type (none, all, or a list of target modules)
    task_type="SEQ_CLS"
)

In [None]:
# Save adapters
peft_model.save_pretrained("models/bert-fin-sentiment-lora")
tokenizer.save_pretrained("models/bert-fin-sentiment-lora")

# Load later
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig
base = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=3)
peft = PeftModel.from_pretrained(base, "models/bert-fin-sentiment-lora")
tok  = AutoTokenizer.from_pretrained("models/bert-fin-sentiment-lora")
