In [1]:
!pip install --quiet lxml_html_clean newspaper3k scikit-learn nltk beautifulsoup4 lxml requests

In [3]:
import nltk, sys, traceback
# Fetch the NLTK data packages used by this notebook; quiet=True silences the
# download progress log. 'stopwords' backs stopwords.words('english') below;
# 'punkt' / 'punkt_tab' are presumably what sent_tokenize / word_tokenize need
# (TODO confirm against the installed NLTK version).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [5]:
from newspaper import Article
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup
import csv, time

In [7]:
# English stop words as a set, so the membership test in clean_text is a hash lookup.
english_stopwords = stopwords.words('english')
STOP = set(english_stopwords)

In [9]:
# Article pages to download, extract and summarise (one direct article link each).
urls = [
    "https://www.bbc.com/news/articles/cx2ljgrm78zo",
    "https://www.bbc.com/news/articles/c891p1pez42o",
    "https://www.bbc.com/news/articles/czxny96260qo",
]

In [11]:
# Persist the URL list to disk, one URL per line (no trailing newline).
url_file_body = "\n".join(urls)
with open("urls.txt", mode="w", encoding="utf-8") as url_file:
    url_file.write(url_file_body)
print(" urls.txt created with", len(urls), "entries")

 urls.txt created with 3 entries


In [13]:
def extract_with_newspaper(url):
    """Try to extract an article's title and body text from ``url`` via newspaper's ``Article``.

    Returns:
        ``(title, text)`` when the parsed text contains more than 30
        whitespace-separated words (``title`` is ``""`` when the parsed title
        is falsy); ``None`` when the text is missing or too short, or when any
        exception is raised while downloading or parsing.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        body = article.text
        # Guard clause: empty or very short text is treated as a failed extraction.
        if not body or len(body.split()) <= 30:
            return None
        return article.title or "", body
    except Exception:
        # Best-effort extractor: every failure is reported to the caller as None.
        return None

def extract_with_bs(url, timeout=12):
    """Fallback extractor: fetch ``url`` with requests and parse it with BeautifulSoup.

    The text of the first ``<article>`` element is preferred; otherwise the
    ``<p>`` elements with more than 5 words are joined with newlines. Either
    candidate is accepted only if it has more than 30 whitespace-separated
    words.

    Args:
        url: page to download.
        timeout: value forwarded as ``timeout`` to ``requests.get``.

    Returns:
        ``(title, text)`` on success, where ``title`` is the stripped
        ``<title>`` string or ``""`` when the page has no usable ``<title>``;
        ``None`` when too little text was found or when any exception was
        raised while fetching or parsing.
    """
    try:
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-content elements so their text does not leak into the result.
        for tag in soup(["script","style","noscript","header","footer","nav","aside"]):
            tag.decompose()

        # A page without <title> has soup.title == None. Dereferencing it
        # unguarded raised AttributeError, which the broad except below turned
        # into None, throwing away text that had been extracted successfully.
        title_tag = soup.title
        title = ""
        if title_tag is not None:
            title = (title_tag.string or "").strip()

        article_tag = soup.find("article")
        if article_tag is not None:
            text = article_tag.get_text(separator=" ", strip=True)
            if len(text.split()) > 30:
                return title, text

        # No usable <article>: fall back to the page's longer paragraphs.
        paragraphs = (p.get_text(separator=" ", strip=True) for p in soup.find_all("p"))
        text = "\n".join(p for p in paragraphs if p and len(p.split()) > 5)
        if len(text.split()) > 30:
            return title, text
        return None
    except Exception:
        # Deliberate best-effort: network/HTTP/parsing failures all mean
        # "no article", which the caller reports to the user.
        return None

def clean_text(s):
    """Normalise ``s`` for vectorisation.

    Lower-cases the text, tokenises it with ``word_tokenize`` and keeps only
    purely alphabetic tokens longer than two characters that are not in
    ``STOP``. The surviving tokens are returned joined by single spaces.
    """
    kept = []
    for token in word_tokenize(s.lower()):
        if not token.isalpha():
            continue
        if token in STOP or len(token) <= 2:
            continue
        kept.append(token)
    return " ".join(kept)

In [15]:
# Parallel lists: index i of each list describes the same successfully
# extracted article. They are rebuilt from scratch every time this cell runs.
collected_titles = []   # article titles ("" when the extractor found none)
collected_raws = []     # raw article text, later passed to summarize_article
collected_docs = []     # clean_text() output, later passed to the TF-IDF vectorizer

for u in urls:
    print("\n--- Processing URL:", u)

    # Primary extractor first; plain requests + BeautifulSoup as a fallback.
    out = extract_with_newspaper(u)
    if out:
        title, text = out
        # Note: the "..." suffix is appended to every non-empty title, not only truncated ones.
        print(" Extracted with newspaper3k; title:", (title[:80]+"...") if title else "(no title)")
    else:
        # The message previously contained the mojibake "â€”" (a mis-encoded em dash).
        print(" newspaper3k failed or returned little text for this URL — trying BeautifulSoup fallback...")
        out2 = extract_with_bs(u)
        if out2:
            title, text = out2
            print(" Extracted with BeautifulSoup fallback; title:", (title[:80]+"...") if title else "(no title)")
        else:
            print(" Failed to extract usable article text from this URL. Skipping.")
            continue
    # store
    collected_titles.append(title)
    collected_raws.append(text)
    collected_docs.append(clean_text(text))
    # Pause between fetches; URLs skipped via `continue` above do not reach this line.
    time.sleep(1.0)

print("\n Total collected documents:", len(collected_docs))


--- Processing URL: https://www.bbc.com/news/articles/cx2ljgrm78zo
 Extracted with newspaper3k; title: Trump raises tariffs on Canadian goods in response to Reagan advert...

--- Processing URL: https://www.bbc.com/news/articles/c891p1pez42o
 Extracted with newspaper3k; title: Mystery Trump supporter gives $130m to US military for paying troops during shut...

--- Processing URL: https://www.bbc.com/news/articles/czxny96260qo
 Extracted with newspaper3k; title: Lassie and Lost in Space actress June Lockhart dies aged 100...

 Total collected documents: 3


In [17]:
# Stop the notebook here when extraction produced nothing: the later cells
# all operate on collected_docs / collected_raws.
if not collected_docs:
    no_docs_help = (
        "\n No documents were collected. Common reasons:\n"
        " - URLs point to landing pages (not article pages)\n"
        " - The site uses JavaScript to render article text (newspaper/requests can't see it)\n"
        " - The site blocks scrapers or requires a login\n"
        "\nAdvice:\n"
        " - Replace URLs with direct article links from BBC/Reuters/NYTimes (public),\n"
        " - OR manually copy article text into a local .txt file and process that.\n"
    )
    print(no_docs_help)
    raise SystemExit("No documents to process.")

In [19]:
# Topic discovery: TF-IDF over the cleaned documents, then NMF; the 8
# highest-weighted terms of each NMF component are printed as one topic.
try:
    # A float max_df is a proportion of the corpus. With a single document,
    # 0.85 * 1 is below min_df=1 and TfidfVectorizer raises
    # "max_df corresponds to < documents than min_df", so the document-frequency
    # cap is only applied when there is more than one document.
    n_docs = len(collected_docs)
    max_df = 0.85 if n_docs > 1 else 1.0
    vect = TfidfVectorizer(max_df=max_df, min_df=1, max_features=5000)
    X = vect.fit_transform(collected_docs)
    n_topics = min(4, max(1, n_docs))  # safe topic count
    nmf = NMF(n_components=n_topics, random_state=0, init='nndsvda', max_iter=300)
    nmf.fit(X)
    H = nmf.components_
    feat = vect.get_feature_names_out()
    # argsort is ascending, so reverse it and keep the first 8 term indices per component.
    topics = [[feat[i] for i in row.argsort()[::-1][:8]] for row in H]
    print("\n Topics discovered:")
    for i,t in enumerate(topics,1):
        print(f"  Topic {i}: {', '.join(t)}")
except Exception as e:
    # Best-effort cell: the failure is reported but not re-raised. Note that
    # `vect` may then be unfitted, and later cells read `vect.vocabulary_`.
    print(" Topic discovery failed:", e)
    traceback.print_exc()


 Topics discovered:
  Topic 1: trade, canadian, said, trump, canada, tariffs, advert, goods
  Topic 2: lockhart, lassie, actress, space, award, lost, film, role
  Topic 3: military, trump, troops, donor, shutdown, make, pay, paid


In [21]:
def summarize_article(text, vect, top_n=3):
    """Build an extractive summary from the ``top_n`` highest-scoring sentences of ``text``.

    Each sentence is scored by the sum of its TF-IDF weights, computed by a new
    ``TfidfVectorizer`` restricted to ``vect.vocabulary_`` and fitted on this
    article's cleaned sentences. The selected sentences are joined with single
    spaces in their original order.

    Args:
        text: raw article text.
        vect: fitted vectorizer; only its ``vocabulary_`` attribute is read.
        top_n: maximum number of sentences to keep.

    Returns:
        The summary string. If the text has at most ``top_n`` sentences, all of
        them are returned. ``""`` is returned when ``top_n`` is zero or negative.
    """
    import numpy as np  # local import: the notebook's import cell does not import numpy

    if top_n <= 0:
        # Without this guard the slice [-0:] below selects *every* sentence.
        return ""
    sents = sent_tokenize(text)
    if len(sents) <= top_n:
        return " ".join(sents)
    tf = TfidfVectorizer(vocabulary=vect.vocabulary_)
    S = tf.fit_transform([clean_text(s) for s in sents])
    # np.asarray(...).ravel() flattens both np.matrix and ndarray row sums;
    # the previous `.A1` attribute exists only on np.matrix.
    scores = np.asarray(S.sum(axis=1)).ravel()
    # Take the top_n best-scoring indices, then sort them to restore reading order.
    top_idx = sorted(scores.argsort()[-top_n:])
    return " ".join(sents[i] for i in top_idx)

In [23]:
# Export one (title, summary) row per collected article to summaries.csv.
with open("summaries.csv", mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    header = ["title", "summary"]
    writer.writerow(header)
    for title, raw in zip(collected_titles, collected_raws):
        try:
            summ = summarize_article(raw, vect)
        except Exception:
            # Best-effort export: an article that cannot be summarised gets an empty cell.
            summ = ""
        row = [title, summ]
        writer.writerow(row)

print("\n Summaries saved to summaries.csv")


 Summaries saved to summaries.csv


In [27]:
# NOTE(review): an earlier cell already defines a function with this name; this
# later definition is the one in effect for the cells that follow.
def summarize_article(text, vect, top_n=3):
    """Build an extractive summary from the ``top_n`` highest-scoring sentences of ``text``.

    Each sentence is scored by the sum of its TF-IDF weights, computed by a new
    ``TfidfVectorizer`` restricted to ``vect.vocabulary_`` and fitted on this
    article's cleaned sentences. The selected sentences are joined with single
    spaces in their original order.

    Args:
        text: raw article text.
        vect: fitted vectorizer; only its ``vocabulary_`` attribute is read.
        top_n: maximum number of sentences to keep.

    Returns:
        The summary string. If the text has at most ``top_n`` sentences, all of
        them are returned. ``""`` is returned when ``top_n`` is zero or negative.
    """
    import numpy as np  # local import: the notebook's import cell does not import numpy

    if top_n <= 0:
        # Without this guard the slice [-0:] below selects *every* sentence.
        return ""
    sents = sent_tokenize(text)
    if len(sents) <= top_n:
        return " ".join(sents)
    tf = TfidfVectorizer(vocabulary=vect.vocabulary_)
    S = tf.fit_transform([clean_text(s) for s in sents])
    # np.asarray(...).ravel() flattens both np.matrix and ndarray row sums;
    # the previous `.A1` attribute exists only on np.matrix.
    scores = np.asarray(S.sum(axis=1)).ravel()
    # Take the top_n best-scoring indices, then sort them to restore reading order.
    top_idx = sorted(scores.argsort()[-top_n:])
    return " ".join(sents[i] for i in top_idx)

In [31]:
# Write a plain-text report: title, summary and an 80-dash rule per article.
separator = "-"*80 + "\n"
with open("article_summaries.txt", mode="w", encoding="utf-8") as f:
    for title, raw in zip(collected_titles, collected_raws):
        try:
            summ = summarize_article(raw, vect)
        except Exception:
            # Keep the report complete even when one article cannot be summarised.
            summ = "( Could not summarize this article.)"
        f.writelines([f" Title: {title}\n", f"Summary:\n{summ}\n", separator])

print("\nSummaries saved to 'article_summaries.txt'")


Summaries saved to 'article_summaries.txt'
