# Installing required packages

In [None]:
%pip install sacremoses==0.0.53
%pip install datasets
%pip install transformers
%pip install torch torchvision torchaudio
%pip install datasets
%pip install nltk
%pip install rouge-score

# Importing required libraries

In [None]:
import nltk
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import pipeline
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.translate.meteor_score import meteor_score
from google.colab import drive
from datasets import load_metric
nltk.download('punkt')

# Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the CSV path defined later in the
# notebook lives under this mount point.
drive.mount('/content/drive')

# Define the path to the CSV file

In [142]:
# Location of the training CSV on the mounted Drive; passed to load_dataset
# as `data_files` in the following cell.
path_to_file = '/content/drive/MyDrive/sampletrain.csv'

# Load dataset from CSV

In [None]:
# Read the CSV with the generic 'csv' builder, naming its two columns
# explicitly and caching the prepared dataset under the given directory.
csv_load_options = {
    "data_files": path_to_file,
    "column_names": ["document", "summary"],
    "cache_dir": '/Documents/Huggin_Face/data',
}
xsum_dataset = load_dataset('csv', **csv_load_options)

# Select a sample of the dataset

In [None]:
# Keep ten rows of the train split (indices 1 through 10); row 0 is skipped.
# NOTE(review): presumably row 0 is the CSV header line read as data because
# `column_names` is supplied when loading -- confirm against the file.
sample_indices = range(1, 11)
xsum_sample = xsum_dataset["train"].select(sample_indices)

# Display the sample data

In [None]:
# Convert the sampled rows to a pandas DataFrame and render it.
sample_frame = xsum_sample.to_pandas()
display(sample_frame)

# Create a summarization pipeline using t5-small

In [144]:
# Summarization pipeline backed by the t5-small checkpoint, with truncation
# enabled and the model cache directory forwarded through model_kwargs.
model_cache_options = {"cache_dir": '/Documents/Huggin_Face/'}
summarizer = pipeline(
    task="summarization",
    model="t5-small",
    truncation=True,
    model_kwargs=model_cache_options,
)


# Load the tokenizer for the summarization model

In [None]:
# Tokenizer for the same "t5-small" checkpoint the pipeline uses; the next
# cell uses it to inspect how an input document is split into tokens.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Tokenize Input Text and Count Tokens in Example

In [None]:
# Choose which sampled example to inspect and fetch its source document.
example_index = 0
input_text = xsum_sample["document"][example_index]

# Split the document into tokenizer pieces and count them.
input_tokens = tokenizer.tokenize(input_text)
num_tokens = len(input_tokens)

# Show the pieces first, then the total.
for label, value in (
    ("Tokenized Input:", input_tokens),
    ("Number of Tokens:", num_tokens),
):
    print(label, value)


# Generate a Summary using T5-Small Model with Specified Length Constraints

In [146]:
# Source document for the chosen example.
input_text = xsum_sample["document"][example_index]

# Sampling is disabled and the generated length is bounded by
# min_length=20 and max_length=50.
generation_options = {"max_length": 50, "min_length": 20, "do_sample": False}
summary_outputs = summarizer(input_text, **generation_options)

# The pipeline returns a list with one result dict per input; take its text.
generated_summary = summary_outputs[0]['summary_text']

# Calculate ROUGE score

In [147]:
# Score ROUGE-1 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Reference summary for the example being evaluated.
reference_summary = xsum_sample["summary"][example_index]

# RougeScorer.score expects (target, prediction): the reference goes first and
# the generated text second, so precision and recall are attributed to the
# right side. (The F-measure read below is the same in either order.)
scores = scorer.score(reference_summary, generated_summary)

rouge1_scores = scores['rouge1'].fmeasure
rougeL_scores = scores['rougeL'].fmeasure

# Tokenize reference and generated summaries, then calculate METEOR score

In [148]:
# METEOR's default synonym matching reads the WordNet corpus, which the
# notebook does not otherwise download; fetch it so this cell works on a
# fresh kernel.
nltk.download('wordnet')
# NOTE(review): newer NLTK releases look up the 'punkt_tab' resource for
# sent_tokenize/word_tokenize instead of 'punkt'; nltk is unpinned, so fetch it
# as well -- confirm against the installed version.
nltk.download('punkt_tab')

# Sentence-split each summary, then word-tokenize every sentence.
reference_tokens = [word_tokenize(sent) for sent in sent_tokenize(xsum_sample["summary"][example_index])]
generated_tokens = [word_tokenize(sent) for sent in sent_tokenize(generated_summary)]

# meteor_score works on flat token lists, so collapse the per-sentence lists.
reference_tokens_flat = [token for sent in reference_tokens for token in sent]
generated_tokens_flat = [token for sent in generated_tokens for token in sent]

# First argument is a list of tokenized references, second the tokenized hypothesis.
meteor = meteor_score([reference_tokens_flat], generated_tokens_flat)

# Print Evaluation Results

Print the generated summary, ROUGE-1 F1 score, ROUGE-L F1 score, and METEOR score.

In [None]:
# Report the generated summary followed by each metric, one per line.
report_rows = (
    ("Generated Summary:", generated_summary),
    ("ROUGE-1 F1:", rouge1_scores),
    ("ROUGE-L F1:", rougeL_scores),
    ("METEOR Score:", meteor),
)
for label, value in report_rows:
    print(label, value)

# Conclusion

In brief, this notebook uses a pretrained sequence-to-sequence (seq2seq) model with an attention mechanism: it leverages the t5-small model to generate summaries for input documents. Evaluation metrics, including ROUGE and METEOR scores, help us assess the quality of the generated summaries against the reference summaries. This approach is a common technique in natural language processing tasks such as text summarization.

# By: Aditya Acharya