# CS614 Assignment 1 - LLM Training Code

In [1]:
!pip install transformers evaluate datasets
!pip install bert_score rouge_score



In [5]:
    from google.colab import userdata
    from huggingface_hub import login

    # Read the Hugging Face access token stored under the 'HF_TOKEN' Colab secret,
    # then authenticate this session with it.
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN)

In [2]:
import evaluate
import numpy as np
import torch
from datasets import get_dataset_split_names, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

## **Dataset**:
The dataset is obtained from https://huggingface.co/datasets/abisee/cnn_dailymail. It contains more than three hundred thousand news articles from CNN and the Daily Mail, and is used for extractive and abstractive summarisation.

## **Task:**
Summarise news articles using the selected LLM.

In [3]:
# Download/load the CNN-DailyMail corpus; "3.0.0" is the most commonly used config.
ds = load_dataset(path="abisee/cnn_dailymail", name="3.0.0")


In [4]:
# List the splits available for this dataset/config
get_dataset_split_names(path="abisee/cnn_dailymail", config_name="3.0.0")

['train', 'validation', 'test']

In [5]:
# Shuffle every split with the same fixed seed so the run is reproducible.
train, val, test = (
    ds[split_name].shuffle(seed=42)
    for split_name in ("train", "validation", "test")
)

# Keep only the first 1000 shuffled examples of train/validation to speed up training.
small_train_dataset = train.select(range(1000))
small_eval_dataset = val.select(range(1000))

In [6]:
#inspect one example (index 19) from the shuffled training subset
small_train_dataset[19]

{'article': '(CNN) -- Four suspects are sought in connection with the shooting death over the weekend of a Houston, Texas, doctor, Austin County authorities said Monday. Dr. Jorge Mario Gonzales was found shot to death at his ranch in rural Texas on Saturday, police say. Dr. Jorge Mario Gonzalez, 56, was chief of the critical care section at Houston\'s Methodist Hospital and "a pulmonary medicine leader," according to the hospital system\'s Web site. He was found dead Saturday when deputies responded to a 911 call of a burglary in progress shortly after noon, said Austin County Sheriff\'s Office spokesman Sgt. Paul Faircloth. The responding officer was met by vehicles leaving the location, Faircloth said, and a person in one vehicle fired on the officer. The officer and his car were not struck, and the officer did not return fire, Faircloth said. The officer was able to provide a detailed description of the vehicles, he said. At the home, which Faircloth said is in a rural, wooded and 

In [7]:
#show the columns (features) of the training split and their types
train.features

{'article': Value('string'),
 'highlights': Value('string'),
 'id': Value('string')}

`article`: Original article
<br>`highlights`: Summary reference

## **Import Model:**

Since data privacy is not a key concern in this task, an open-source model is used. A small model (1B) is used to keep cost low.

In [8]:
model_name = "google/gemma-3-1b-it"
# Use the GPU when one is available, otherwise fall back to the CPU.
# Without this the model stays on the CPU while inputs are sent to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device).eval() #disable dropout
# Metrics used to score generated summaries against the reference highlights.
bert_score = evaluate.load("bertscore")
rouge_score = evaluate.load("rouge", use_aggregator=True)

In [None]:
#evaluate model before training
def eval_model(batch):
  """Generate a summary for every article in `batch`.

  Args:
    batch: a batched slice of the dataset (dict of lists) with an "article" column.

  Returns:
    The same batch with an added "summary" column holding only the newly
    generated text (the prompt is stripped from the decoded output).
  """
  prompts = [f"Summarise this into one paragraph: {article}\nSummary:\n" for article in batch["article"]]
  # Tokenize the prompts (not the raw batch). Pad on the left so that, for a
  # decoder-only model, generation continues directly after each prompt.
  inputs = tokenizer(
      prompts,
      return_tensors="pt",
      padding=True,
      padding_side="left",
  ).to(model.device)  # follow the model's device instead of assuming "cuda"
  with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256, use_cache=True)
  # generate() returns prompt + continuation; keep only the continuation.
  prompt_length = inputs["input_ids"].shape[1]
  new_token_ids = output_ids[:, prompt_length:]
  batch["summary"] = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)
  return batch

# Run eval_model over the evaluation subset in batches of 8 and show the resulting dataset.
summaries = small_eval_dataset.map(function=eval_model, batch_size=8, batched=True)
print(summaries)


In [None]:
#create functions to tokenize and compute evaluation metric
def tokenize_text(text):
  """Encode `text` into a PyTorch tensor of token ids and move it to the "cuda" device."""
  token_ids = tokenizer.encode(text, return_tensors="pt")
  return token_ids.to("cuda")

def compute_metrics(eval_preds):
  """Compute ROUGE and BERTScore for one evaluation pass of the Trainer.

  Args:
    eval_preds: pair `(predictions, labels)`. `predictions` may be raw logits
      of shape (batch, seq, vocab) or already-argmaxed token ids of shape
      (batch, seq). `labels` are token ids with -100 at ignored positions.

  Returns:
    A flat dict of floats: the aggregated ROUGE scores plus the mean
    BERTScore precision, recall and F1.
  """
  predictions, labels = eval_preds
  if isinstance(predictions, tuple):  # some models return (logits, extras...)
    predictions = predictions[0]
  predictions = np.asarray(predictions)
  labels = np.asarray(labels)

  # The vocabulary is the LAST axis; axis=1 would argmax over sequence positions.
  if predictions.ndim == 3:
    predictions = predictions.argmax(axis=-1)

  # Causal LM: the output at position t predicts the token at position t+1,
  # so drop the last prediction and the first label to align them.
  predictions = predictions[:, :-1]
  labels = labels[:, 1:]

  # Score only positions that carry a real label; replace the rest with the
  # pad token so they disappear when decoding with skip_special_tokens=True.
  pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
  scored = labels != -100
  predictions = np.where(scored, predictions, pad_id)
  labels = np.where(scored, labels, pad_id)

  # The metrics operate on text, and references must match the evaluated
  # examples one-to-one, so decode both from this batch of ids.
  decoded_preds = [text.strip() for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)]
  decoded_refs = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]

  rouge_result = rouge_score.compute(predictions=decoded_preds, references=decoded_refs)
  bert_result = bert_score.compute(
      predictions=decoded_preds, references=decoded_refs, model_type="roberta-large"
  )

  # Trainer expects a dict of scalar values.
  metrics = {name: float(value) for name, value in rouge_result.items()}
  for part in ("precision", "recall", "f1"):
    metrics[f"bertscore_{part}"] = float(np.mean(bert_result[part]))
  return metrics

Original text: ROME, Italy (CNN) -- The garbage crisis in Naples encompasses the worst Italian clichés, and in particular those of the southern part of this lovely peninsula: mismanagement, political interference, mafia profiteering and the ability of those responsible to deflect the attention and the blame elsewhere. Naples has had problems in finding sites for municipal dumps -- now workers have stopped collecting trash. There is a popular saying here that roughly goes like this: everybody is competent enough (to find a solution) but nobody is responsible (for actually carrying it out). In many parts of the world waste disposal is a business -- and usually it is a good business. Garbage can be transformed into various sources of energy and then sold for a profit. In Naples, garbage is also good business, but in the sense that millions, if not billions, of euros have been wasted -- and nobody really knows how. The problem is as old and ugly as rotten trash. The region's dumps reached 

## Evaluation: BERTScore & ROUGE
Both BERTScore and ROUGE will be used to evaluate the model's performance.
ROUGE-L evaluates the model's performance using longest-common-subsequence-based scoring, while BERTScore evaluates paraphrasing and the semantic similarity between each reference summary and the corresponding generated summary, which can be a more accurate representation of the model's performance.

In [None]:
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir="test_trainer",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # --- schedule and batching ---
    num_train_epochs=5,
    eval_strategy="epoch",  # evaluate at the end of every epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # 16 x 2 = 32 examples per optimiser step, per device
    # --- optimiser ---
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,  # regularisation: penalises large weights to limit overfitting
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
)

In [None]:
#training based on hyperparameters stated in previous cell
MAX_ARTICLE_TOKENS = 768  # cap on article tokens kept per example
MAX_SUMMARY_TOKENS = 128  # cap on reference-summary tokens kept per example

def build_training_features(batch):
  """Convert raw article/highlights pairs into causal-LM training features.

  Each example becomes `prompt + summary + EOS`. Labels are -100 over the
  prompt, so the loss (and any label-based metric) covers only the summary.
  The article is truncated on its own so the "Summary:" cue and the
  reference summary are never cut off.
  """
  prefix_ids = tokenizer("Summarise this into one paragraph: ")["input_ids"]
  suffix_ids = tokenizer("\nSummary:\n", add_special_tokens=False)["input_ids"]
  article_ids = tokenizer(
      batch["article"], add_special_tokens=False,
      truncation=True, max_length=MAX_ARTICLE_TOKENS,
  )["input_ids"]
  summary_ids = tokenizer(
      batch["highlights"], add_special_tokens=False,
      truncation=True, max_length=MAX_SUMMARY_TOKENS,
  )["input_ids"]

  features = {"input_ids": [], "attention_mask": [], "labels": []}
  for article, summary in zip(article_ids, summary_ids):
    prompt = prefix_ids + article + suffix_ids
    target = summary + [tokenizer.eos_token_id]  # teach the model where to stop
    features["input_ids"].append(prompt + target)
    features["attention_mask"].append([1] * (len(prompt) + len(target)))
    features["labels"].append([-100] * len(prompt) + target)
  return features

def keep_predicted_ids(logits, labels):
  """Reduce logits to argmax token ids before the Trainer accumulates them.

  Storing full-vocabulary logits for the whole evaluation set would not fit
  in memory.
  """
  if isinstance(logits, tuple):
    logits = logits[0]
  return logits.argmax(dim=-1)

# Trainer needs tokenized features; the raw text columns are dropped after mapping.
train_features = small_train_dataset.map(
    build_training_features, batched=True,
    remove_columns=small_train_dataset.column_names,
)
eval_features = small_eval_dataset.map(
    build_training_features, batched=True,
    remove_columns=small_eval_dataset.column_names,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_features,
    eval_dataset=eval_features,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=keep_predicted_ids,
    # Pads input_ids/attention_mask with the tokenizer and labels with -100.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100),
)