In [None]:
!pip install transformers
!pip install sumeval
!pip install py7zr

# Description
### Fine-tune a T5 model on the SAMSum dataset using PyTorch and Hugging Face.

In [50]:
import gc
import random
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sumeval.metrics.rouge import RougeCalculator

import torch
from transformers import AutoTokenizer
import transformers
from transformers import AutoModelForSeq2SeqLM

# Record the PyTorch build in the notebook output so runs can be compared later.
print(f'Pytorch version: {torch.__version__!s}')

Pytorch version: 1.11.0


In [52]:
# Notebook-wide settings: silence all warnings and show up to 100 characters
# per column when pandas renders text.
warnings.simplefilter('ignore')
pd.set_option('display.max_colwidth', 100)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

## Reading Data

In [53]:
from datasets import load_dataset
# Load the "samsum" dataset by name; the result is indexed by split name
# ("train", "validation", "test") in the next cell.
dataset = load_dataset("samsum")

  0%|          | 0/3 [00:00<?, ?it/s]

In [54]:
# Pull the three splits out of the loaded dataset and drop the "id" column
# from each, leaving only the dialogue/summary text columns.
train, valid, test = (
    dataset[split_name].remove_columns(["id"])
    for split_name in ("train", "validation", "test")
)

# Quick sanity check: split sizes, remaining features, and one raw example.
print(len(train), len(valid), len(test))
print("dataset has features: ", train)
print("sample input and output is")
print(train[0]["dialogue"])
print(train[0]["summary"])

14732 818 819
dataset has features:  Dataset({
    features: ['dialogue', 'summary'],
    num_rows: 14732
})
sample input and output is
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
Amanda baked cookies and will bring Jerry some tomorrow.


## Data Analysis

In [55]:
# Measure dialogue and summary lengths in whitespace-separated words
# (these are word counts, not tokenizer token counts).
dialogue_lengths = [len(words) for words in map(str.split, train["dialogue"])]
summary_lengths = [len(words) for words in map(str.split, train["summary"])]

print("average length of dialogue is", sum(dialogue_lengths) / len(dialogue_lengths))
print("average length of summary is", sum(summary_lengths) / len(summary_lengths))

average length of dialogue is 93.7863833831116
average length of summary is 20.3174721694271


## Modeling

In [65]:
# T5 checkpoint sizes, for reference when choosing a model:
#   t5-small = 60M parameters
#   t5-base  = 220M parameters
#   t5-large = 770M parameters
#   t5-3b    = 3 billion parameters
#   t5-11b   = 11 billion parameters
# Load the tokenizer for the "t5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [66]:
# Task prefix prepended to every dialogue to prompt the model to summarise.
prefix = "summarize: "


def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogues and their reference summaries.

    Intended for ``Dataset.map(..., batched=True)``: ``examples`` maps column
    names to lists of values.

    Args:
        examples: Batch with ``"dialogue"`` and ``"summary"`` lists of strings.
        max_input_length: Token limit for the prefixed dialogue; longer inputs
            are truncated. Defaults to 1024.
        max_target_length: Token limit for the summary; longer summaries are
            truncated. Defaults to 128.

    Returns:
        The tokenizer output for the prefixed dialogues, with the summary
        token ids stored under ``"labels"``.
    """
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Summaries are tokenized separately; only their ids are kept as targets.
    labels = tokenizer(text=examples["summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [67]:
# Tokenize every split in batched mode with the same preprocessing function.
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train, valid, test)
)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [68]:
# Keep only "input_ids" and "labels": drop the raw text columns and the
# tokenizer's attention mask from every split.
tokenized_train, tokenized_valid, tokenized_test = (
    split.remove_columns(["dialogue", "summary", "attention_mask"])
    for split in (tokenized_train, tokenized_valid, tokenized_test)
)

# Have the datasets hand back torch tensors when indexed.
tokenized_train.set_format(type="torch")
tokenized_valid.set_format(type="torch")
tokenized_test.set_format(type="torch")

print("data format now is", tokenized_train)

data format now is Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 14732
})


## Loading the model

In [69]:
# Load the pretrained "t5-small" sequence-to-sequence checkpoint (the same
# checkpoint name the tokenizer was loaded from) as the model to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")  # has 60M parameters

In [70]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataloader import default_collate

def pad_collate(batch, input_pad_value=0, label_pad_value=0):
    """Collate variable-length examples into right-padded batch tensors.

    Args:
        batch: List of examples, each a mapping with 1-D tensors under
            ``"input_ids"`` and ``"labels"``.
        input_pad_value: Value used to pad ``input_ids`` to the longest
            sequence in the batch. Defaults to 0.
        label_pad_value: Value used to pad ``labels`` to the longest sequence
            in the batch. Defaults to 0 because the rest of this notebook
            decodes the padded labels directly with the tokenizer; pass
            ``-100`` only if the consumer handles that value before decoding.

    Returns:
        Dict with ``"input_ids"`` and ``"labels"``, each a tensor of shape
        ``(len(batch), longest_sequence_in_batch)``.
    """
    input_ids = pad_sequence(
        [example["input_ids"] for example in batch],
        batch_first=True,
        padding_value=input_pad_value,
    )
    labels = pad_sequence(
        [example["labels"] for example in batch],
        batch_first=True,
        padding_value=label_pad_value,
    )
    # The padded tensors are already batch-shaped, so they are returned as-is
    # instead of being split into rows and re-stacked.
    return {"input_ids": input_ids, "labels": labels}

# Batches of 8 examples, padded per batch by pad_collate.
# Only the training data is shuffled.
train_dataloader = DataLoader(
    dataset=tokenized_train,
    batch_size=8,
    shuffle=True,
    collate_fn=pad_collate,
)
eval_dataloader = DataLoader(
    dataset=tokenized_valid,
    batch_size=8,
    collate_fn=pad_collate,
)

In [63]:
from transformers import get_scheduler
from torch.optim import AdamW

# Training length: one optimizer step per training batch, for 4 epochs.
num_epochs = 4
num_training_steps = len(train_dataloader) * num_epochs
print("Training steps are", num_training_steps)

# AdamW with a learning rate that decays linearly over all training steps,
# with no warmup.
optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

Training steps are 7368


In [78]:
# Shared ROUGE scorer, configured for English with the stopwords option on.
rouge = RougeCalculator(stopwords=True, lang="en")


def rouge_calc(preds, targets):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L over paired summaries.

    Args:
        preds: Sequence of generated summaries.
        targets: Sequence of reference summaries; ``targets[i]`` is scored
            against ``preds[i]``.

    Returns:
        Dict with keys ``"Rouge_1"``, ``"Rouge_2"`` and ``"Rouge_L"``, each
        holding the mean of the per-example scores.
    """
    per_metric = {"Rouge_1": [], "Rouge_2": [], "Rouge_L": []}
    for idx, candidate in enumerate(preds):
        reference = targets[idx]
        per_metric["Rouge_1"].append(rouge.rouge_n(summary=candidate, references=reference, n=1))
        per_metric["Rouge_2"].append(rouge.rouge_n(summary=candidate, references=reference, n=2))
        per_metric["Rouge_L"].append(rouge.rouge_l(summary=candidate, references=reference))

    return {name: np.array(values).mean() for name, values in per_metric.items()}

def evaluate(model, eval_dataloader, tokenizer, **generate_kwargs):
    """Score the model on a dataloader with ROUGE and the average loss.

    Every example of every batch is generated, decoded and scored. The model
    is put in eval mode with gradients disabled for the duration of the call
    and returned to train mode afterwards if that is how it arrived.

    Args:
        model: Seq2seq model exposing ``generate`` and returning ``.loss``
            when called with ``labels``.
        eval_dataloader: Iterable of batches with ``"input_ids"`` and
            ``"labels"`` tensors.
        tokenizer: Tokenizer used to decode generated and reference ids.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``model.generate`` (for example ``max_length``).

    Returns:
        Tuple ``(scores, avg_loss)``: the dict produced by ``rouge_calc`` and
        the mean batch loss (``nan`` when the dataloader yields no batches).
    """
    pad_id = tokenizer.pad_token_id
    prediction = []
    ground_truth = []
    losses = []

    # Dropout must be off while generating and scoring; remember the mode so a
    # surrounding training loop carries on unchanged.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for eval_batch in eval_dataloader:
                eval_batch = {k: v.to(device) for k, v in eval_batch.items()}
                input_ids = eval_batch["input_ids"]
                labels = eval_batch["labels"]

                model_inputs = dict(eval_batch)
                if pad_id is not None:
                    # Hide padded input positions from the model and exclude
                    # padded label positions from the loss (-100 is ignored).
                    model_inputs.setdefault("attention_mask", input_ids.ne(pad_id).long())
                    model_inputs["labels"] = labels.masked_fill(labels == pad_id, -100)
                    # Negative ids cannot be decoded; map them back to padding.
                    labels = labels.masked_fill(labels < 0, pad_id)

                gen_kwargs = {"attention_mask": model_inputs.get("attention_mask"), **generate_kwargs}
                outputs = model.generate(input_ids, **gen_kwargs)
                losses.append(model(**model_inputs).loss.item())

                # Decode the whole batch; len(eval_batch) would be the number
                # of dict keys, not the number of examples.
                prediction.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
                ground_truth.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))
    finally:
        if was_training:
            model.train()

    scores = rouge_calc(prediction, ground_truth)
    avg_loss = sum(losses) / len(losses) if losses else float("nan")
    print("Validation data score and losses are", scores, avg_loss)
    return scores, avg_loss
    
    

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.to(device)
model.train()

# pad_collate pads inputs and labels with 0; this is expected to equal the
# tokenizer's pad id, which is used below to find the padded positions.
pad_token_id = tokenizer.pad_token_id

step = 0
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        if pad_token_id is not None:
            # The attention mask was dropped from the dataset, so rebuild it:
            # without it the model attends to padded input positions.
            batch.setdefault("attention_mask", batch["input_ids"].ne(pad_token_id).long())
            # Labels equal to -100 are ignored by the loss; leaving the padding
            # as 0 would train the model to predict pad tokens.
            batch["labels"] = batch["labels"].masked_fill(batch["labels"] == pad_token_id, -100)

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        # evaluate on every 100th step
        if step % 100 == 0:
            print("Train loss on {}th step is {}".format(step, loss.item()))
            evaluate(model, eval_dataloader, tokenizer)
            # Make sure training resumes with dropout enabled whatever mode
            # the evaluation left the model in.
            model.train()
        step += 1

## Make some predictions

In [45]:
# The training cell leaves the model in train mode; switch to eval mode so
# dropout does not perturb the generated summary.
model.eval()

batch = next(iter(eval_dataloader))
batch = {k: v.to(device) for k, v in batch.items()}
# Inputs are padded per batch, so tell generate which positions are padding.
attention_mask = batch['input_ids'].ne(tokenizer.pad_token_id).long()
outputs = model.generate(batch['input_ids'], attention_mask=attention_mask)

# Print, in order: the generated summary, the reference summary, the input.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print(tokenizer.decode(batch['labels'][0], skip_special_tokens=True))
print(tokenizer.decode(batch['input_ids'][0], skip_special_tokens=True))

A wants to get a puppy for her son. He will take him to the animal shelter
A will go to the animal shelter tomorrow to get a puppy for her son. They already visited the shelter last Monday and the son chose the puppy.
summarize: A: Hi Tom, are you busy tomorrow’s afternoon? B: I’m pretty sure I am. What’s up? A: Can you go with me to the animal shelter?. B: What do you want to do? A: I want to get a puppy for my son. B: That will make him so happy. A: Yeah, we’ve discussed it many times. I think he’s ready now. B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) A: I'll get him one of those little dogs. B: One that won't grow up too big;-) A: And eat too much;-)) B: Do you know which one he would like? A: Oh, yes, I took him there last Monday. He showed me one that he really liked. B: I bet you had to drag him away. A: He wanted to take it home right away ;-). B: I wonder what he'll name it. A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorh