Install all dependencies

In [None]:
!pip install evaluate pillow==9.0.0
!pip install pytesseract transformers datasets rouge-score nltk py7zr --upgrade

In [30]:
!pip install accelerate -U



Load and prepare the dataset

In [1]:
dataset_id="samsum"

from datasets import load_dataset

dataset=load_dataset(dataset_id)

print("Train dataset size:",len(dataset['train']))
print("Test dataset size:",len(dataset['test']))

Train dataset size: 14732
Test dataset size: 819


In [2]:
from random import randrange

sample=dataset['train'][randrange(len(dataset['train']))]

print(f"dialouge \n ",sample['dialogue'])
print(f"dialouge \n ",sample['summary'])

dialouge 
  Eliza: i listened to what you sent me yesterday
Eliza: so beautiful, i was literally crying
Leo: woah!
Leo: seriously?
Eliza: yes, i experienced it very emotionally
Leo: i'm really happy you liked it so much :O
Leo: <3
Eliza: what were you thinking about when you were composing it?
Leo: hmm maybe i'll tell you later :)
Eliza: okay
dialouge 
  Eliza was deeply moved after listening to the music piece composed by Leo.


#Choose the model to tune - dont choose t5 small

In [3]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"

tokenizer=AutoTokenizer.from_pretrained(model_id)

#preprocess the Dataset

In [4]:
from datasets import concatenate_datasets

#longest senetence will be trucated , shortest will be padded
tokenized_inputs=concatenate_datasets([dataset["train"],dataset["test"]]).map(lambda x:tokenizer(x["dialogue"]))
max_source_length=max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"max source length {max_source_length}")


max source length 1153


In [5]:
#perform the same for targets as well

tokenized_targets=concatenate_datasets([dataset["train"],dataset["test"]]).map(lambda x:tokenizer(x["summary"]))
max_target_length=max([len(x) for x in tokenized_targets["input_ids"]])
print(f"max source length {max_source_length}")

max source length 1153


In [6]:
def preprocess_function(sample,padding='max_length'):

  #add prefix to the input for t5
  inputs=["summarise" + item for item in sample["dialogue"]]

  # tokenize the inputs
  model_inputs=tokenizer(inputs,
                         max_length=max_source_length,
                         padding=padding,
                         truncation=True)

  #tokenize target labels
  labels=tokenizer(text_target=sample["summary"],
                    max_length=max_source_length,
                    padding=padding,
                    truncation=True)

  if padding=="max_length":
    labels["input_ids"]=[
        [(l if l!= tokenizer.pad_token_id else -100) for l in label] for  label in labels["input_ids"]
        ]
    model_inputs["labels"]=labels["input_ids"]

  return model_inputs

#1000 default batch removing columns as we need only selected columns
tokenized_dataset=dataset.map(preprocess_function,batched=True,remove_columns=["dialogue","summary",'id'])
print(f"key of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Map:   0%|          | 0/818 [00:00<?, ? examples/s]

key of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


#FineTune the dataset

In [7]:
from transformers  import AutoModelForSeq2SeqLM

model=AutoModelForSeq2SeqLM.from_pretrained(model_id)


#Hugging face trainer class to evaluate during training

In [8]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")
'''
ROUGE (Recall-oriented Understudy for Gisting Evaluation) is a set of metrics
commonly used for evaluating the quality of automatic summarization and
text generation systems. It measures the overlap between the generated
summary and one or more reference summaries.
ROUGE scores are based on the concept of precision, recall, and Fl score.
There are several variants of ROUGE metrics, such as ROUGE-N. ROUGE-L
and ROUGE-S.
'''

metric=evaluate.load("rouge")

#helper function

def post_process(preds,labels):
  pred=[pred.strip() for pred in preds]
  labels=[label.strip() for label in labels]

  preds=["\n".join(sent_tokenize(pred)) for pred in preds]
  labels=["\n".join(sent_tokenize(label)) for label in labels]

  return preds,labels

def compute_metrics(eval_preds):
  preds,labels=eval_preds
  if isinstance(preds,tuple):
    preds=preds[0]

  decoded_preds=tokenizer.batch_decode(preds,skip_special_tokens=True)

  labels=np.where(labels !=-100 ,labels,tokenizer.pad_token_id)
  decoded_labels=tokenizer.batch_decode(preds,skip_special_tokens=True)

  decoded_preds,decoded_labels=post_process(decoded_preds,decoded_labels)

  result=metric.compute(predictions=decoded_preds,references=decoded_labels,use_stemmer=True)
  result={k: round(v*100,4) for k ,v in result.items()}
  prediction_lens=[np.count_nonzero(pred!=tokenizer.pad_token_id) for pred in preds]
  result["gen len"]=np.mean(prediction_lens)
  return result



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
from transformers import DataCollatorForSeq2Seq

lable_pad_token_id=-100

data_collator=DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=lable_pad_token_id,
    pad_to_multiple_of=8
)

In [10]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="test",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=True, # 50% precession for running in collab
    learning_rate=5e-5,
    num_train_epochs=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="overall_f1",

)

#additional parameter for logging and pushing the same to huggingface and using tensfor board

    # logging & evaluation strategies
    # logging_dir=f"{repository_id}/logs",
    # logging_strategy="steps",
    # logging_steps=500,

    # push to hub parameters
    # report_to="tensorboard",
    # push_to_hub=False,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),

#additional parameter for logging and pushing the same to huggingface and using tensfor board

#define the trainer class to train the model

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)



#Start Training

In [None]:
# Start training for dialouge text summarization
trainer.train()

In [None]:
#evaluate trained model

trainer.evaluate()

In [None]:
# Save our tokenizer and create model card

model_path="your model path"
tokenizer.save_pretrained(model_path)


# #only for hugging face use
# trainer.create_model_card()

# # Push the results to the hub
# trainer.push_to_hub()

#Run Inference

In [None]:
from transformers import pipeline
from random import randrange

# load model and tokenizer from huggingface hub with pipeline
summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

print(f"flan-t5-base summary:\n{res[0]['summary_text']}")