# Training / Fine-tuning a Text Summarisation model

We are going to look at model fine-tuning by taking a general text summarisation model and fine-tuning it to perform dialogue summarisation.

In [None]:
!pip install accelerate -U
!pip install transformers -U
!pip install datasets
!pip install py7zr
!pip install tiktoken
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score

In [None]:
import transformers
from transformers import pipeline, set_seed
from datasets import load_dataset
import py7zr
import accelerate
import pandas as pd
import torch

## A dialogue summarisation dataset

In [None]:
# Download the SAMSum corpus; it is indexed by split name ("train", "test", ...).
dataset_samsum = load_dataset("samsum", trust_remote_code=True)

# Count the examples in every split, in the order the splits are stored.
split_lengths = [len(split_data) for split_data in dataset_samsum.values()]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")

# Show one test example: the raw dialogue followed by its summary.
print("\nDialogue:", dataset_samsum["test"][0]["dialogue"], sep="\n")
print("\nSummary:", dataset_samsum["test"][0]["summary"], sep="\n")

### Evaluating PEGASUS on SAMSum

<img alt="pegasus" width="700" caption="Diagram of PEGASUS architecture (courtesy of Jingqing Zhang et al.)" src="https://github.com/nlp-with-transformers/notebooks/blob/main/images/chapter08_pegasus.png?raw=1" id="pegasus"/>

In [None]:
from transformers import AutoModelForSeq2SeqLM, PegasusTokenizer
import matplotlib.pyplot as plt
import tiktoken
import sentencepiece

# Pick the best accelerator that is actually present instead of assuming CUDA:
# NVIDIA GPU first, then Apple-silicon MPS, and finally plain CPU. On a CUDA
# machine this still resolves to "cuda", exactly as before.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Base checkpoint used as the starting point for the comparison: PEGASUS
# trained on CNN/DailyMail (per the checkpoint name).
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = PegasusTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:
# Summarise the first SAMSum test dialogue with the currently loaded model.
torch.cuda.empty_cache()  # free cached GPU memory before the large beam search

# `padding='longest'` pads to the longest sequence in the batch. The deprecated
# `pad_to_max_length=True` flag that used to sit next to it asked for the
# opposite behaviour, so it is dropped. Calling the tokenizer directly
# replaces the legacy `batch_encode_plus` spelling.
input_ = tokenizer(dataset_samsum["test"][0:1]["dialogue"],
                   max_length=1024,
                   truncation=True,
                   padding='longest',
                   return_tensors="pt")
input_ids = input_['input_ids']
input_mask = input_['attention_mask']

# Beam search that returns 10 candidate sequences for the single input
# dialogue, each capped at 64 tokens and with no repeated bigrams.
summaries = model.generate(input_ids=input_ids.to(device),
                           attention_mask=input_mask.to(device),
                           num_beams=100,
                           no_repeat_ngram_size=2,
                           early_stopping=True,
                           num_return_sequences=10,
                           max_length=64)

# Decode token ids back to text, then turn the literal "<n>" marker into
# real line breaks.
summaries = tokenizer.batch_decode(summaries, skip_special_tokens=True)
summaries = [summary.replace("<n>", "\n") for summary in summaries]

In [None]:
# Show the first of the candidate summaries produced by the base model.
print(summaries[0])

### Fine-Tuning PEGASUS

To fine-tune the model yourself, uncomment the following 5 code cells and run them. Note, though, that training will take a good 5-6 hours to run.

In [None]:
#def convert_examples_to_features(example_batch):
#    input_encodings = tokenizer(example_batch["dialogue"], max_length=1024,
#                                truncation=True)

#    with tokenizer.as_target_tokenizer():
#        target_encodings = tokenizer(example_batch["summary"], max_length=128,
#                                     truncation=True)

#    return {"input_ids": input_encodings["input_ids"],
#            "attention_mask": input_encodings["attention_mask"],
#            "labels": target_encodings["input_ids"]}

#dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features,
#                                       batched=True)
#columns = ["input_ids", "labels", "attention_mask"]
#dataset_samsum_pt.set_format(type="torch", columns=columns)

In [None]:
#from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

#seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#training_args = TrainingArguments(
#    output_dir='pegasus-samsum', num_train_epochs=20, warmup_steps=500,
#    per_device_train_batch_size=1, per_device_eval_batch_size=1,
#    weight_decay=0.01, logging_steps=10, push_to_hub=False,
#    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,gradient_accumulation_steps=128)

#trainer = Trainer(model=model, args=training_args,
#                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
#                  train_dataset=dataset_samsum_pt["train"],
#                  eval_dataset=dataset_samsum_pt["validation"])

In [None]:
#!pip install wandb

In [None]:
#import wandb
#from huggingface_hub import notebook_login

#notebook_login()
#wandb.init(mode="disabled")

In [None]:
# hide_output
#torch.cuda.empty_cache()
#trainer.train()
# To save your fine-tuned model:
#trainer.save_model("dialogue-summ-model")

To load an already fine-tuned model instead, run the following cell.

In [None]:
# Replace the base tokenizer/model with the published checkpoint that,
# judging by its name, was already fine-tuned on SAMSum.
model_ckpt = "transformersbook/pegasus-samsum"
tokenizer = PegasusTokenizer.from_pretrained(model_ckpt)
model = (
    AutoModelForSeq2SeqLM
    .from_pretrained(model_ckpt)
    .to(device)
)

### Generating And Evaluating Dialogue Summaries

In [None]:
# Summarise the first SAMSum test dialogue with the fine-tuned model.
torch.cuda.empty_cache()  # free cached GPU memory before the large beam search

# `padding='longest'` pads to the longest sequence in the batch. The deprecated
# `pad_to_max_length=True` flag that used to sit next to it asked for the
# opposite behaviour, so it is dropped. Calling the tokenizer directly
# replaces the legacy `batch_encode_plus` spelling.
input_ = tokenizer(dataset_samsum["test"][0:1]["dialogue"],
                   max_length=1024,
                   truncation=True,
                   padding='longest',
                   return_tensors="pt")
input_ids = input_['input_ids']
input_mask = input_['attention_mask']

# Beam search that keeps only one output sequence, capped at 64 tokens and
# with no repeated bigrams.
summaries_ft = model.generate(input_ids=input_ids.to(device),
                              attention_mask=input_mask.to(device),
                              num_beams=100,
                              no_repeat_ngram_size=2,
                              early_stopping=True,
                              num_return_sequences=1,
                              max_length=64)

# Decode token ids back to text, then turn the literal "<n>" marker into
# real line breaks.
summaries_ft = tokenizer.batch_decode(summaries_ft, skip_special_tokens=True)
summaries_ft = [summary.replace("<n>", "\n") for summary in summaries_ft]

In [None]:
# Summary field of the first test example, kept as a one-element list so it
# lines up with the one-element list of generated summaries.
reference = dataset_samsum["test"][:1]["summary"]


Activity 1: Write a function to calculate Rouge-N




Activity 2: Write a function to calculate Rouge-L, with β=1

Activity 3: Compare the two different Rouge scores achieved by the base PEGASUS model and the fine-tuned model.

Activity 4: Familiarise yourself with the generate function options: https://huggingface.co/docs/transformers/en/main_classes/text_generation
Vary the settings and observe how they affect the output of the two models.