In [1]:
!pip install datasets nltk spacy scikit-learn transformers streamlit



In [2]:
# ✅ Step 2: Import Libraries
import nltk
import spacy
import numpy as np
import networkx as nx
import pandas as pd
from datasets import load_dataset
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline


In [3]:
!pip install pyngrok
from pyngrok import ngrok



In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import datasets
dataset = datasets.load_dataset("cnn_dailymail", "3.0.0") # extend the timeout period to 10 mins
articles = dataset["train"]["article"][:100]  # Using only 100 articles for processing
highlights = dataset["train"]["highlights"][:100]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using the latest cached version of the dataset since cnn_dailymail couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration '3.0.0' at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/0.0.0/96df5e686bee6baa90b8bee7c28b81fa3fa6223d (last modified on Sun Mar 30 10:25:27 2025).


In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
# ✅ Step 4: Preprocessing Function
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words("english")]
    return " ".join(tokens)

articles_cleaned = [preprocess_text(article) for article in articles]


In [8]:
# ✅ Step 6: POS Tagging & Chunking
nlp_doc = nlp(articles_cleaned[0])
chunks = [chunk.text for chunk in nlp_doc.noun_chunks]
print("Extracted Chunks:", chunks[:10])

Extracted Chunks: ['london england reuters harry potter star daniel radcliffe gains access', 'million million fortune', '18 monday', 'money', 'spell daniel radcliffe harry potter harry potter order phoenix disappointment gossip columnists', 'world young actor', 'fritter cash', 'fast cars', 'celebrity parties', 'one people']


In [9]:
# ✅ Step 7: Extractive Summarization using TextRank
def text_rank_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    sim_matrix = cosine_similarity(X, X)
    graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return " ".join([s[1] for s in ranked_sentences[:num_sentences]])

summary = text_rank_summary(articles[0])
print("Extractive Summary:\n", summary)

Extractive Summary:
 Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: "I just think I'm going to be more sort of fair game," he told Reuters.


In [10]:
# ✅ Step 8: Abstractive Summarization using BART
summarizer = pipeline("summarization")
abstractive_summary = summarizer(articles[0], max_length=100, min_length=30, do_sample=False)
print("Abstractive Summary:\n", abstractive_summary[0]['summary_text'])

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Abstractive Summary:
  Harry Potter star Daniel Radcliffe turns 18 on Monday, gaining access to a reported $41.1 million fortune . The young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties . Radcliffe's earnings from the first five Potter films have been held in a trust fund .


In [11]:
with open("app.py", "w", encoding="utf-8") as f:
    f.write('''
import streamlit as st
from transformers import pipeline

def summarize_text(text):
    summarizer = pipeline("summarization")
    summary = summarizer(text, max_length=100, min_length=30, do_sample=False)
    return summary[0]['summary_text']

st.title("NLP Text Summarization")
text_input = st.text_area("Enter the text you want to summarize:")

if st.button("Summarize"):
    summary = summarize_text(text_input)
    st.write("Generated Summary: " + summary)
''')


In [12]:
!streamlit run app.py &>/dev/null &
!ngrok config add-authtoken <your_correct_authtoken>
public_url = ngrok.connect(addr="8501")
print("Public URL:", public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Public URL: NgrokTunnel: "https://ba80-34-75-65-241.ngrok-free.app" -> "http://localhost:8501"
