In [None]:
!pip install transformers datasets accelerate torch


In [None]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")


In [None]:
from transformers import T5Tokenizer

# Initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def preprocess_data(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True,padding="max_length")

    # Setup the tokenizer for the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=128, truncation=True,padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess_data, batched=True, num_proc=4)


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import torch
model = T5ForConditionalGeneration.from_pretrained("t5-base")


In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-summarization",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True, 
    logging_dir="./logs",
    logging_steps=500,
)

# Define data collator
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(int(0.5*len(dataset["train"])))),
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
trainer.train()


In [3]:
!pip install nltk rouge_score






[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import evaluate 
from transformers import pipeline 
from datasets import load_from_disk 
 
# Load the fine-tuned model 
summarizer = pipeline("summarization", model="./t5-summarization", device=0) 
rouge = evaluate.load("rouge") 
 
dataset =load_from_disk("./tokenized_cnn_dailymail/test/") 
dat=dataset.select(range(8000)) 
 
generated_summaries = [] 
reference_summaries = [] 
 
for sample in dat: 
    generated_summary = summarizer(sample["article"], max_length=74, min_length=20, num_beams=4) 
    generated_summaries.append(generated_summary[0]["summary_text"]) 
    reference_summaries.append(sample["highlights"]) 
 
# Compute ROUGE 
results = rouge.compute(predictions=generated_summaries, references=reference_summaries) 
 
# Display ROUGE scores 
for key, value in results.items(): 
    print(f"{key}: {value}")

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0


rouge1: 0.5333333333333332
rouge2: 0.35616438356164376
rougeL: 0.45333333333333337
rougeLsum: 0.45333333333333337


In [None]:
model.save_pretrained("./ft5-summarization")
tokenizer.save_pretrained("./t5-summarization")