# Importing Libraries

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Importing Dataset and Preprocessing it

In [2]:
df=pd.read_csv('news_summary.csv',encoding='latin-1')
df=df.head(10)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   author     10 non-null     object
 1   date       10 non-null     object
 2   headlines  10 non-null     object
 3   read_more  10 non-null     object
 4   text       10 non-null     object
 5   ctext      10 non-null     object
dtypes: object(6)
memory usage: 612.0+ bytes


In [4]:
null_values = df.isnull().sum()
null_values

author       0
date         0
headlines    0
read_more    0
text         0
ctext        0
dtype: int64

In [5]:
columns_of_interest = ['text', 'headlines']
df = df[columns_of_interest]

In [6]:
df.head()

Unnamed: 0,text,headlines
0,The Administration of Union Territory Daman an...,Daman & Diu revokes mandatory Rakshabandhan in...
1,Malaika Arora slammed an Instagram user who tr...,Malaika slams user who trolled her for 'divorc...
2,The Indira Gandhi Institute of Medical Science...,'Virgin' now corrected to 'Unmarried' in IGIMS...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Aaj aapne pakad liya: LeT man Dujana before be...
4,Hotels in Maharashtra will train their staff t...,Hotel staff to get training to spot signs of s...


# Defining Functions 

In [7]:
class TextSummarizer:
    def __init__(self):
        self.model_name = "t5-small"
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
    
    def summarize(self, text, max_length=150, min_length=30):
        inputs = self.tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = self.model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

class CustomDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=512):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]

        inputs = self.tokenizer.encode_plus(
            "summarize: " + text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        labels = self.tokenizer.encode(
            summary,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": labels.flatten()
        }

def train(model, train_loader, optimizer, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}")


# Initializing Everthing

In [8]:
texts=df['text'].tolist()
summaries=df['headlines'].tolist()

In [9]:
# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Initialize dataset and dataloader
dataset = CustomDataset(texts, summaries, tokenizer)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

#  Training

In [11]:
# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
train(model, train_loader, optimizer, device)

Epoch 1/3, Loss: 15.802435684204102
Epoch 2/3, Loss: 12.76302890777588
Epoch 3/3, Loss: 9.553829669952393


# Saving Model

In [12]:
# Save the trained model
save_path = "./outputs/t5_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved successfully.")

Model saved successfully.


# User Input summarization test

In [26]:
# Summarize text
summarizer = TextSummarizer()
Original_text ="Amidst the vast expanse of the Mariana Trench, scientists have made a monumental breakthrough, uncovering a previously unknown species of deep-sea jellyfish, shedding light on the mysteries of the ocean depths and underscoring the critical need for continued exploration and conservation efforts. This remarkable discovery, occurring at a depth of over 10,000 meters below the ocean surface, represents a significant milestone in marine biology and highlights the resilience and adaptability of life in one of the Earth's most extreme environments.The newly discovered species, tentatively named Mariana Medusa, possesses unique characteristics that set it apart from any known jellyfish species. With its ethereal translucence and delicate tendrils extending gracefully from its bell-shaped body, the Mariana Medusa captivates the imagination and challenges conventional understanding of deep-sea life. Initial observations suggest that this enigmatic creature thrives in the extreme pressures and near-freezing temperatures of the deep ocean, carving out a niche in a habitat largely untouched by human presence.The significance of this discovery extends beyond its scientific value, offering a glimpse into the intricate web of life that exists beneath the ocean's surface. As researchers continue to study the Mariana Medusa and its habitat, they hope to unlock secrets that could inform our understanding of evolutionary biology, ecology, and even biomedical research. Furthermore, the discovery underscores the importance of preserving these pristine marine environments, which are increasingly threatened by human activities such as deep-sea mining, pollution, and climate change.In recent years, technological advancements have enabled scientists to explore the ocean depths with unprecedented precision and detail. From remotely operated vehicles (ROVs) equipped with high-definition cameras to autonomous underwater vehicles (AUVs) capable of mapping vast swaths of the seafloor, these tools have revolutionized our ability to study and document life in the deep sea. However, the Mariana Trench remains one of the least explored and understood ecosystems on the planet, presenting both challenges and opportunities for future research.Despite the inherent difficulties of working in such extreme environments, scientists remain undeterred in their quest to unlock the secrets of the deep. Collaborative efforts involving researchers from around the world are essential to advancing our knowledge of deep-sea ecosystems and informing conservation strategies aimed at protecting these fragile habitats. By combining cutting-edge technology with interdisciplinary approaches, scientists can continue to push the boundaries of exploration and make new discoveries that enrich our understanding of the natural world.As we celebrate this groundbreaking discovery, it serves as a reminder of the boundless wonders that await discovery beneath the ocean's surface. From the hidden depths of the Mariana Trench to the coral reefs teeming with life, the ocean remains a source of fascination and inspiration for scientists and explorers alike. By fostering curiosity, collaboration, and conservation efforts, we can ensure that future generations inherit a world where the mysteries of the deep continue to unfold."

summary = summarizer.summarize(Original_text)
print(Original_text)
print("\nSummary:", summary)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Amidst the vast expanse of the Mariana Trench, scientists have made a monumental breakthrough, uncovering a previously unknown species of deep-sea jellyfish, shedding light on the mysteries of the ocean depths and underscoring the critical need for continued exploration and conservation efforts. This remarkable discovery, occurring at a depth of over 10,000 meters below the ocean surface, represents a significant milestone in marine biology and highlights the resilience and adaptability of life in one of the Earth's most extreme environments.The newly discovered species, tentatively named Mariana Medusa, possesses unique characteristics that set it apart from any known jellyfish species. With its ethereal translucence and delicate tendrils extending gracefully from its bell-shaped body, the Mariana Medusa captivates the imagination and challenges conventional understanding of deep-sea life. Initial observations suggest that this enigmatic creature thrives in the extreme pressures and n

# Text Extractive Summarization using TF-IDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import bs4 as bs
import urllib.request
import re
import nltk

In [28]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
text=Original_text
ext = re.sub(r'\[\d+\]', '', text)

# Normalize spaces

text = re.sub(r'\s+', ' ', text).strip()

In [30]:
text = re.sub(r'\s*\[citation needed\]\s*', '', text)
text = re.sub(r'\s+', ' ',text).strip()
# Tokenizing sentences

sentence_list = nltk.sent_tokenize(text)

In [31]:
# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
# Generate the tf-idf vectors for the sentences

tfidf_matrix = tfidf_vectorizer.fit_transform(sentence_list)
# Calculate the sum of tf-idf scores for each sentence

sentence_scores = np.sum(tfidf_matrix.toarray(), axis = 1)
# Rank sentences by score

top_sentence_indices = np.argsort(sentence_scores)[::-1][:10]


In [35]:
# Extract top sentences in order
summary_sentences = [sentence_list[idx] for idx in top_sentence_indices[:3]]
# Joining the selected sentences
s = ' '.join(summary_sentences)
print(Original_text)
print("\nSummary :",s)

Amidst the vast expanse of the Mariana Trench, scientists have made a monumental breakthrough, uncovering a previously unknown species of deep-sea jellyfish, shedding light on the mysteries of the ocean depths and underscoring the critical need for continued exploration and conservation efforts. This remarkable discovery, occurring at a depth of over 10,000 meters below the ocean surface, represents a significant milestone in marine biology and highlights the resilience and adaptability of life in one of the Earth's most extreme environments.The newly discovered species, tentatively named Mariana Medusa, possesses unique characteristics that set it apart from any known jellyfish species. With its ethereal translucence and delicate tendrils extending gracefully from its bell-shaped body, the Mariana Medusa captivates the imagination and challenges conventional understanding of deep-sea life. Initial observations suggest that this enigmatic creature thrives in the extreme pressures and n