In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load pre-trained T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load dataset for fine-tuning (e.g., CNN/DailyMail dataset)
dataset = load_dataset("cnn_dailymail", "3.0.0")
small_dataset = dataset["train"].select(range(100))  # Select the first 100 examples

Using the latest cached version of the dataset since cnn_dailymail couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration '3.0.0' at C:\Users\TVT\.cache\huggingface\datasets\cnn_dailymail\3.0.0\0.0.0\96df5e686bee6baa90b8bee7c28b81fa3fa6223d (last modified on Mon Apr 22 00:25:23 2024).


In [5]:
# Tokenize dataset for training
def tokenize_function(example):
    source_text = example["article"]
    target_text = example["highlights"]
    source_tokenized = tokenizer(source_text, truncation=True, padding="max_length", max_length=1024, return_tensors="pt")
    target_tokenized = tokenizer(target_text, truncation=True, padding="max_length", max_length=150, return_tensors="pt")
    return {
        "input_ids": source_tokenized.input_ids,
        "attention_mask": source_tokenized.attention_mask,
        "labels": target_tokenized.input_ids,
    }

tokenized_datasets = small_dataset.map(tokenize_function, batched=True)

In [6]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir="./logs",
    logging_steps=1000,
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [7]:
# Fine-tune the model
trainer.train()

100%|██████████| 75/75 [16:55<00:00, 13.54s/it]

{'train_runtime': 1015.5293, 'train_samples_per_second': 0.295, 'train_steps_per_second': 0.074, 'train_loss': 4.186572265625, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=4.186572265625, metrics={'train_runtime': 1015.5293, 'train_samples_per_second': 0.295, 'train_steps_per_second': 0.074, 'train_loss': 4.186572265625, 'epoch': 3.0})

In [8]:
# Save the fine-tuned model
model.save_pretrained("./fine_tuned_t5_small")

In [9]:
# Example of generating summaries using the fine-tuned model
input_text = """Artificial intelligence (AI) is a field of computer science that aims to create systems capable of performing tasks that typically require human intelligence. 
The concept of AI dates back to ancient times, with early ideas emerging in Greek mythology and ancient Greek philosophy. However, the modern era of AI began in the mid-20th 
century with the development of computer technology and the advent of digital computing. In 1956, the term "artificial intelligence" was coined at the Dartmouth Conference,
 where researchers gathered to discuss the potential of creating machines that could mimic human cognitive abilities. Since then, AI has evolved rapidly, with significant 
 advancements in areas such as machine learning, natural language processing, computer vision, and robotics. AI technologies have been applied across various industries, 
 including healthcare, finance, transportation, and entertainment, revolutionizing the way we live and work. From virtual assistants like Siri and Alexa to self-driving cars
   and advanced medical diagnostic systems, AI has become an integral part of our daily lives. However, AI also raises ethical and societal concerns, including issues related
     to privacy, bias, job displacement, and the potential for misuse of AI-powered systems. Despite these challenges, the pursuit of artificial intelligence continues to drive
       innovation and shape the future of technology and society.
"""
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
generated_summary_ids = model.generate(input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
generated_summary = tokenizer.decode(generated_summary_ids[0], skip_special_tokens=True)
print("Generated Summary:")
print(generated_summary)

Generated Summary:
AI (AI) is a field of computer science that aims to create systems capable of performing tasks typically require human intelligence. The concept of AI dates back to ancient times, with early ideas emerging in Greek mythology and ancient Greek philosophy. However, the modern era of AI began in the mid-20th century with the development of computer technology.


In [10]:
from rouge_score import rouge_scorer
reference_summary = input_text

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores
scores = scorer.score(reference_summary, generated_summary)

# Print ROUGE scores
print("ROUGE-1 F1 Score:", scores['rouge1'].fmeasure)
print("ROUGE-2 F1 Score:", scores['rouge2'].fmeasure)
print("ROUGE-L F1 Score:", scores['rougeL'].fmeasure)

ROUGE-1 F1 Score: 0.4402985074626865
ROUGE-2 F1 Score: 0.4210526315789473
ROUGE-L F1 Score: 0.4328358208955224
