In [None]:
# Install Required Libraries


!pip install transformers
!pip install torch
!pip install datasets
!pip install rouge-score
#We start by installing the necessary libraries. transformers is for accessing pre-trained models, torch is the backend for these models, datasets is for loading and evaluating datasets, and rouge-score is for evaluating the quality of the summaries.




In [None]:
#Load the Model and Tokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The model and its tokenizer are loaded from the same checkpoint in Hugging
# Face's model repository. The t5-small variant is used here for simplicity.
checkpoint = 't5-small'
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = T5Tokenizer.from_pretrained(checkpoint)




In [None]:
# Load a Dataset
from datasets import load_dataset

# CNN/DailyMail (configuration "3.0.0") is commonly used for text
# summarization tasks. load_dataset fetches it and prepares it for use.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
# Explore the Dataset
#
# Printing the first record of the training split shows how each example is
# structured before we start working with it.
first_example = dataset['train'][0]
print(first_example)


In [None]:
# Prepare the Data for Summarization

# Take the first training record: 'article' is the text to summarize and
# 'highlights' is its corresponding reference summary.
sample = dataset['train'][0]
article, summary = sample['article'], sample['highlights']

# Tokenize the article behind the "summarize: " prefix. The encoded input is
# truncated to at most 512 tokens and returned as a PyTorch tensor.
inputs = tokenizer.encode(
    "summarize: " + article,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)


In [None]:
# Generate the Summary
#
# The options below control the length and quality of the output: beam search
# over 4 beams, a summary of 30 to 150 tokens, and a length penalty of 2.0.
generation_options = dict(
    max_length=150,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
summary_ids = model.generate(inputs, **generation_options)

# Only the first returned sequence is decoded; special tokens are dropped.
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(generated_summary)


In [None]:
# Create a Function for Summarization


def summarize_text(
    text: str,
    *,
    max_input_length: int = 512,
    max_length: int = 150,
    min_length: int = 30,
    length_penalty: float = 2.0,
    num_beams: int = 4,
    early_stopping: bool = True,
) -> str:
    """Return a summary of ``text`` produced by the notebook's T5 model.

    Uses the module-level ``tokenizer`` and ``model`` objects. Called with only
    ``text``, it applies the same settings as the single-article example
    earlier in the notebook.

    Args:
        text: The document to summarize.
        max_input_length: Token limit for the encoded input; longer inputs are
            truncated.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.
        length_penalty: Length penalty passed to ``model.generate``.
        num_beams: Number of beams used for beam search.
        early_stopping: Passed through to ``model.generate``.

    Returns:
        The decoded summary with special tokens removed, or an empty string
        when ``text`` is empty or contains only whitespace.
    """
    # Nothing to summarize: skip the model call rather than generating text
    # from the bare task prefix.
    if not text.strip():
        return ""

    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=early_stopping,
    )
    # Only the first returned sequence is decoded.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# We create a function summarize_text that takes a text input and returns its summary. This function encapsulates the summarization process, making it reusable.


In [None]:
# Processing Multiple Articles

articles = ["Article 1 text", "Article 2 text", "Article 3 text"]

# Each article is summarized in turn with summarize_text; this is a simple
# loop over the collection, not a single batched model call.
summaries = [summarize_text(article_text) for article_text in articles]

# The loop variable is deliberately not called `summary`, so the reference
# summary extracted in the data-preparation cell is not overwritten.
for position, generated in enumerate(summaries, start=1):
    print(f"Summary {position}: {generated}")



In [None]:
import json

# Store the generated summaries in a JSON file for future reference or
# further processing.
with open('summaries.json', 'w') as out_file:
    out_file.write(json.dumps(summaries))

# Load the summaries back from the file and print them.
with open('summaries.json', 'r') as in_file:
    loaded_summaries = json.loads(in_file.read())
print(loaded_summaries)
