<a href="https://colab.research.google.com/github/atharva0300/BE-8th-Semester/blob/main/nlp_mini_project/news_article_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/211.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m204.8/211.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m13.2 MB/s[0m 

In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Tokenize each sentence into words, remove stopwords, and perform stemming
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [stemmer.stem(word) for word in words if word not in stop_words]
        preprocessed_sentences.append(" ".join(words))

    return preprocessed_sentences

def textrank_summarizer(text, num_sentences=3):
    # Preprocess the text
    preprocessed_sentences = preprocess_text(text)

    # Create TF-IDF matrix
    tfidf = TfidfVectorizer().fit_transform(preprocessed_sentences)

    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(tfidf, tfidf)

    # Create graph using similarity matrix
    graph = nx.from_numpy_array(similarity_matrix)

    # Calculate PageRank scores
    scores = nx.pagerank(graph)

    # Sort sentences by PageRank scores
    ranked_sentences = sorted(((scores[i], sentence) for i, sentence in enumerate(preprocessed_sentences)), reverse=True)

    # Get top N sentences for summary
    summary_sentences = [sentence for score, sentence in ranked_sentences[:num_sentences]]

    # Reorder the summary sentences based on their order in the original text
    summary = " ".join([preprocessed_sentences[preprocessed_sentences.index(sent)] for sent in summary_sentences])

    return summary



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-23.12.11-py3-none-any.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=breadability-0.1

In [4]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# Function to summarize a news article using LexRank
def summarize_news_article(text, language='english', sentences_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer()

    # Summarize the article
    summary = summarizer(parser.document, sentences_count)
    summary_text = ' '.join([str(sentence) for sentence in summary])

    return summary_text


In [5]:
pip install transformers




In [6]:
from transformers import pipeline

# Load the summarization pipeline
#summarizer = pipeline("summarization")

In [7]:
from transformers import BartTokenizer, BartForConditionalGeneration
import textwrap

# Load the tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [8]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
pip install reportlab

Collecting reportlab
  Downloading reportlab-4.1.0-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: reportlab
Successfully installed reportlab-4.1.0


#Test the model from here

In [13]:
url = input("Enter the url of the news article webpage : \n")

Enter the url of the news article webpage : 
https://indianexpress.com/article/cities/pune/maharashtra-madha-lok-sabha-seat-rebellion-9237820/


In [14]:
# obtain the news paper
from newspaper import Article

def extract_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text

article_text = extract_article(url)
print(article_text)

BJP leader Vijaysinh Mohite-Patil and Ramraje Naik Nimbalkar, who is with the Ajit Pawar-led NCP, have openly revolted against Ranjitsinh Naik Nimbalkar, the Mahayuti candidate for the Madha Lok Sabha seat.

In signs of impending trouble for the ruling Mahayuti alliance in Maharashtra, the opposition to the BJP candidate for the Madha constituency, Ranjitsinh Naik Nimbalkar, is growing ahead of the Lok Sabha elections. Two former aides of Sharad Pawar, one with the BJP and another with the Ajit Pawar-led Nationalist Congress Party (NCP), have openly revolted against Nimbalkar and are reportedly planning to rejoin the senior Pawar.

The Madha Lok Sabha constituency is spread across the districts of Solapur and Satara. It was represented by Sharad Pawar in 2009 and his associate Vijaysinh Mohite-Patil, who is now with the BJP, in 2014. However, in 2019, BJP’s Nimbalkar won the seat. This year, the BJP has once again given candidature to the sitting MP despite opposition from within the p

In [15]:
# Output of Textrank algorithm

textrank_summary = textrank_summarizer(article_text)
textrank_summary_list = textrank_summary.split(' ')
output_text = ""
for i in range(len(textrank_summary_list)) :
  if(i%30==0 and i!=0 ) :
    print('\n')
    output_text += "\n"
  else :
    output_text += textrank_summary_list[i] + " "
    print(textrank_summary_list[i] , end = " ")

textrank_output = output_text

bjp leader vijaysinh mohite-patil ramraj naik nimbalkar , ajit pawar-l ncp , openli revolt ranjitsinh naik nimbalkar , mahayuti candid madha lok sabha seat . sign impend troubl rule mahayuti 

maharashtra , opposit bjp candid madha constitu , ranjitsinh naik nimbalkar , grow ahead lok sabha elect . year , bjp given candidatur sit mp despit opposit within parti 

mohite-patil alli leader ramraj naik nimbalkar . 

In [16]:
# Summarize the news article
lexrank_summary = summarize_news_article(article_text)
lexrank_summary_list = lexrank_summary.split(' ')
output_text = ""
for i in range(len(lexrank_summary_list)) :
  if(i%30==0 and i!=0 ) :
    print('\n')
    output_text += "\n"
  else :
    output_text += lexrank_summary_list[i] + " "
    print(lexrank_summary_list[i] , end = " ")

lexrank_output = output_text

BJP leader Vijaysinh Mohite-Patil and Ramraje Naik Nimbalkar, who is with the Ajit Pawar-led NCP, have openly revolted against Ranjitsinh Naik Nimbalkar, the Mahayuti candidate for the Madha Lok Sabha 

In signs of impending trouble for the ruling Mahayuti alliance in Maharashtra, the opposition to the BJP candidate for the Madha constituency, Ranjitsinh Naik Nimbalkar, is growing ahead of 

Lok Sabha elections. However, in 2019, BJP’s Nimbalkar won the seat. 

In [17]:
# tokenize
# Wrap the formatted text to 1024 tokens per input (maximum input length for BART)
wrapped_text = textwrap.fill(article_text, width=1024)

# Tokenize the text
inputs = tokenizer(wrapped_text, return_tensors='pt', max_length=1024, truncation=True)

# BART Summary
summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
bart_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Output of BART Transformer

bart_summary_list = bart_summary.split(' ')
output_text = ""

for i in range(len(bart_summary_list)) :
  if(i%30==0 and i!=0 ) :
    print('\n')
    output_text += "\n"
  else :
    output_text += bart_summary_list[i] + " "
    print(bart_summary_list[i] , end = " ")

bart_output = output_text

Vijaysinh Mohite-Patil and Ramraje Naik Nimbalkar have openly revolted against Nimbalkar. the opposition to the BJP candidate for the Madha Lok Sabha seat, Ranjitsinh Naik Nimbalkar, is growing ahead of 

elections. 

In [18]:
# tokenize
# Tokenize the input text
inputs = tokenizer.encode("summarize: " + article_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the summary
summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
t5_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Output of T5 Transformer

t5_summary_list = t5_summary.split(' ')
output_text = ""
for i in range(len(t5_summary_list)) :
  if(i%30==0 and i!=0 ) :
    print('\n')
    output_text += "\n"
  else :
    output_text += t5_summary_list[i] + " "
    print(t5_summary_list[i] , end = " ")

t5_output = output_text


the opposition to the BJP candidate for the Madha constituency, Ranjitsinh Naik Nimbalkar, is growing ahead of the Lok Sabha elections. two former aides of Sharad Pawar, one with the 

and another with the Ajit Pawar-led NCP, have openly revolted against Nimbalkar. 

In [19]:
# Create a pdf and store the outputs in the pdf file

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from google.colab import files

# Create a PDF file
def create_pdf(file_name, text):
    custom_page_height = 1600  # Custom page height in points
    c = canvas.Canvas(file_name,  pagesize=(letter[0], custom_page_height))
    lines = text.split('\n')
    y_position = 1550  # Starting vertical position
    line_height = 12  # Height of each line
    max_width = 500  # Maximum width of the text

    for line in lines:
        # Calculate the height of the text
        text_width = c.stringWidth(line, "Helvetica", 12)

        # If the text exceeds the maximum width, create a new line
        if text_width > max_width:
            parts = [line[i:i+100] for i in range(0, len(line), 100)]  # Split line into parts
            for part in parts:
                c.drawString(20, y_position, part)
                y_position -= line_height  # Move to the next line
        else:
            c.drawString(20, y_position, line)
            y_position -= line_height  # Move to the next line

    c.save()


# Text to write in the PDF
text = ""
text = "                                                                News Summarizer Report\n\n\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += "Article\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += article_text
text += "\n\n\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += "TextRank Algorithm Output\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += textrank_output
text += "\n\n\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += "LexRank Algorithm Output\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += lexrank_output
text += "\n\n\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += "BART Output\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += bart_output
text += "\n\n\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += "T5 Output\n"
text += "------------------------------------------------------------------------------------------------------------------------\n"
text += t5_output
text += "\n\n\n"

# Create and save the PDF file
file_name = "News_Summarizer_Report.pdf"
create_pdf(file_name, text)

# Download the PDF file
files.download(file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# After the summary generation from all the 4 models

# 1. Summarize multiple news articles
# 2. Apply evaluation metrics
#  - ROUGE
#  - BLEU
#  - METEOR
