In [11]:
# Registry of the RSS feeds this notebook reads, keyed by a short source id.
# Each entry carries the feed URL ("rss") and the category labels
# ("categories") that are attached to every item fetched from that feed.
SOURCES = dict(
    economic_times=dict(
        rss="https://economictimes.indiatimes.com/rssfeedstopstories.cms",
        categories=["Business", "Technology"],
    ),
    times_of_india=dict(
        rss="https://timesofindia.indiatimes.com/rssfeedstopstories.cms",
        categories=["General News", "Business"],
    ),
    techcrunch=dict(
        rss="https://techcrunch.com/feed/",
        categories=["Technology", "Startups"],
    ),
)


# News Article Processing Pipeline

This notebook implements a pipeline for:
1. Fetching news articles from multiple RSS feeds
2. Extracting full article content
3. Generating article summaries using LLM
4. Detecting and removing duplicate articles

## Components
- RSS Feed Processing: feedparser
- Content Extraction: newspaper3k
- Summarization: Grok (`x-ai/grok-4-fast`) called through the OpenAI Python client via OpenRouter
- Deduplication: TF-IDF with cosine similarity

# 1. Configuration and Imports

In [12]:
# Basic imports
import feedparser
from datetime import datetime
import pandas as pd
import numpy as np
import time
import json

# Content extraction
from newspaper import Article

# ML/NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Environment and API configuration
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load environment variables via python-dotenv
# (presumably from a local .env file next to the notebook — confirm).
load_dotenv()

# Configure the OpenAI SDK client to talk to OpenRouter instead of the default
# OpenAI endpoint. `client` is the notebook-global handle that the
# summarization functions call.
# NOTE(review): os.getenv returns None when OPENROUTER_API_KEY is unset; how
# the OpenAI client treats api_key=None (error vs. falling back to another
# environment variable) should be confirmed against the SDK documentation.
client = OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
    default_headers={
        # Extra headers sent with every request; presumably used by OpenRouter
        # to attribute traffic to this project — confirm.
        "HTTP-Referer": "https://github.com/ajay-manwani/news_extraction",
        "X-Title": "News Extraction Project"
    }
)

# 2. News Sources Configuration

# News Article Processing Pipeline

This notebook implements a pipeline for:
1. Fetching news articles from multiple RSS feeds
2. Extracting full article content
3. Generating article summaries using LLM
4. Detecting and removing duplicate articles

The pipeline uses:
- feedparser for RSS feed processing
- newspaper3k for article content extraction
- the OpenAI Python client (Grok via OpenRouter) for article summarization
- TF-IDF and cosine similarity for deduplication

In [13]:
from dotenv import load_dotenv
import os
from openai import OpenAI
import json

# Load environment variables via python-dotenv
# (presumably from a local .env file next to the notebook — confirm).
load_dotenv()

# Configure the OpenAI SDK client to talk to OpenRouter.
# NOTE(review): this cell repeats the client setup from the imports /
# configuration cell earlier in the notebook with the same arguments;
# running it simply rebinds the notebook-global `client`.
# NOTE(review): os.getenv returns None when OPENROUTER_API_KEY is unset; how
# the OpenAI client treats api_key=None should be confirmed against the SDK
# documentation.
client = OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
    default_headers={
        # Extra headers sent with every request; presumably used by OpenRouter
        # to attribute traffic to this project — confirm.
        "HTTP-Referer": "https://github.com/ajay-manwani/news_extraction",
        "X-Title": "News Extraction Project"
    }
)

In [23]:
def summarize_article(text, max_tokens=300):
    """
    Summarize article text using the x-ai/grok-4-fast model via OpenRouter.

    Args:
        text (str): The article text to summarize.
        max_tokens (int): Intended cap on the summary length. It is accepted
            for backward compatibility but is currently NOT forwarded to the
            API: the request-side limit is disabled in the call below.

    Returns:
        str: The model's summary with surrounding whitespace stripped.
            Returns "" without contacting the API when `text` is not a string
            or is empty/blank. Returns "Error generating summary" when the
            request or the handling of its response raises.
    """
    # Nothing to summarize (e.g. the extraction step left full_text as ''):
    # skip the API round-trip instead of asking the model to summarize nothing.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        # Construct the prompt
        prompt = f"""Please provide a concise summary of the following article. 
        Focus on the main points and key information:

        {text}

        Summary:"""

        response = client.chat.completions.create(
            model="x-ai/grok-4-fast:free",
            messages=[
                {"role": "user", "content": prompt}
            ],
            # Request-side limit intentionally left disabled (see docstring).
            #max_tokens=max_tokens
        )

        # A missing/None content raises here and is reported by the handler
        # below rather than propagating to the caller.
        return response.choices[0].message.content.strip()

    except Exception as e:
        # Best-effort: report the failure and hand back a sentinel string so a
        # batch run over many articles keeps going.
        print(f"Error in summarization: {str(e)}")
        return "Error generating summary"

In [15]:
def fetch_rss_feed(source_name, source_info):
    """
    Download one RSS feed and flatten its entries into plain dictionaries.

    Args:
        source_name (str): Identifier stored in each item's 'source' field
        source_info (dict): Must provide 'rss' (feed URL) and 'categories'

    Returns:
        list: One dict per feed entry with the keys 'source', 'title',
              'link', 'published', 'summary' and 'categories'. Fields an
              entry does not provide are set to ''.
    """
    parsed_feed = feedparser.parse(source_info['rss'])

    # Every item references the same categories object held by source_info.
    return [
        {
            'source': source_name,
            'title': feed_entry.get('title', ''),
            'link': feed_entry.get('link', ''),
            'published': feed_entry.get('published', ''),
            'summary': feed_entry.get('summary', ''),
            'categories': source_info['categories'],
        }
        for feed_entry in parsed_feed.entries
    ]

# Test Article Summarization
Let's test our summarization function on a sample article and compare the original text with the summary.

In [16]:
# Smoke-test the RSS fetcher on a single source before wiring up the rest.
# Swap the key to one of the alternatives in the trailing comment to try
# another feed from SOURCES.
source_name =  "times_of_india" #"economic_times" #"techcrunch"
news_items = fetch_rss_feed(source_name, SOURCES[source_name])

# Convert to DataFrame for better visualization; the bare last expression
# renders the first rows as the cell output.
df = pd.DataFrame(news_items)
df.head()

Unnamed: 0,source,title,link,published,summary,categories
0,times_of_india,'GST Bachat Utsav': PM Modi writes open letter...,https://timesofindia.indiatimes.com/india/gst-...,"Mon, 22 Sep 2025 17:10:45 +0530",Prime Minister Modi announced the 'GST Bachat ...,"[General News, Business]"
1,times_of_india,"Appointed, ignored & unpaid: Meet India's 'inv...",https://timesofindia.indiatimes.com/india/appo...,"Mon, 22 Sep 2025 01:02:13 +0530","Across India, tens of thousands of teachers wo...","[General News, Business]"
2,times_of_india,'Main bhi Bharat hoon': Rajnath Singh says 'Po...,https://timesofindia.indiatimes.com/india/main...,"Mon, 22 Sep 2025 15:29:01 +0530",Defence Minister Rajnath Singh addressed the I...,"[General News, Business]"
3,times_of_india,Did Pak bomb a Khyber Pakhtunkhwa village and ...,https://timesofindia.indiatimes.com/world/paki...,"Mon, 22 Sep 2025 14:27:06 +0530",,"[General News, Business]"
4,times_of_india,Why are Pakistan players avoiding Indian media?,https://timesofindia.indiatimes.com/sports/cri...,"Mon, 22 Sep 2025 16:59:46 +0530",Pakistan cricket faces criticism for on-field ...,"[General News, Business]"


In [17]:
from newspaper import Article
import time

def _empty_article_content():
    """Return a fresh 'nothing extracted' result (new dict and list on every call)."""
    return {
        'full_text': '',
        'authors': [],
        'top_image': '',
        'article_date': None
    }


def extract_article_content(url, delay=1.0):
    """
    Extract article content using newspaper3k

    Args:
        url (str): URL of the article
        delay (float): Seconds to pause before the download, to be respectful
            to the servers. Defaults to 1.0 (the previously hard-coded value);
            0 or None disables the pause.

    Returns:
        dict: Dictionary with the keys 'full_text', 'authors', 'top_image'
              and 'article_date'. When `url` is empty/None, or when the
              download or parse raises, the placeholder values
              '', [], '' and None are returned instead.
    """
    # Feed entries without a link yield '' as the URL; there is nothing to
    # download, so skip the pause and the doomed request.
    if not url:
        print("Skipping content extraction: empty URL")
        return _empty_article_content()

    try:
        # Add a small delay to be respectful to the servers
        if delay and delay > 0:
            time.sleep(delay)

        article = Article(url)
        article.download()
        article.parse()

        return {
            'full_text': article.text,
            'authors': article.authors,
            'top_image': article.top_image,
            'article_date': article.publish_date
        }
    except Exception as e:
        # Best-effort: one unreachable or unparsable page must not abort a
        # batch run, so report it and return the placeholder result.
        print(f"Error processing {url}: {str(e)}")
        return _empty_article_content()

# Test Article Extraction
Let's try our enhanced news fetching with article content extraction. We'll start with a small sample to make sure everything works correctly.

In [18]:
# Test with a few articles from one source
source_name = "techcrunch"  # TechCrunch tends to have more consistent article structure
news_items = fetch_rss_feed(source_name, SOURCES[source_name])

# Take first 3 articles for testing
# (the slice is a new list, but it holds the same dict objects as news_items)
sample_news = news_items[:3]

# Add article content
# item.update() mutates each dict in place, adding the extracted
# full_text / authors / top_image / article_date fields to the RSS fields.
for item in sample_news:
    article_content = extract_article_content(item['link'])
    item.update(article_content)

# Convert to DataFrame
# `df_with_content` is read again by later cells in this notebook.
df_with_content = pd.DataFrame(sample_news)

# Display results
print("Number of articles processed:", len(df_with_content))
print("\nColumns available:", df_with_content.columns.tolist())
print("\nSample article details:")
# idx is the DataFrame's index label; the frame was just built from a list, so
# labels start at 0 and idx + 1 gives 1-based article numbers.
for idx, row in df_with_content.iterrows():
    print(f"\nArticle {idx + 1}:")
    print(f"Title: {row['title']}")
    print(f"Authors: {row['authors']}")
    print(f"Text length: {len(row['full_text'])} characters")
    print("-" * 50)

Number of articles processed: 3

Columns available: ['source', 'title', 'link', 'published', 'summary', 'categories', 'full_text', 'authors', 'top_image', 'article_date']

Sample article details:

Article 1:
Title: Commonwealth Fusion Systems books a $1B+ power deal for its future fusion reactor
Authors: ['Tim De Chant', 'Senior Reporter', 'Karyne Levy', 'Connie Loizos', 'Sarah Perez', 'Julie Bort', 'Maxwell Zeff', 'Ivan Mehta', '--C-Author-Card-Image-Size Align-Items Center Display Flex Gap Var', 'Media']
Text length: 4535 characters
--------------------------------------------------

Article 2:
Title: Powered by India’s small businesses, UK fintech Tide becomes a TPG-backed unicorn
Authors: ['Jagmeet Singh', 'Karyne Levy', 'Connie Loizos', 'Sarah Perez', 'Julie Bort', 'Maxwell Zeff', 'Ivan Mehta', '--C-Author-Card-Image-Size Align-Items Center Display Flex Gap Var', 'Media', 'Min-Width']
Text length: 4837 characters
--------------------------------------------------

Article 3:
Title

In [19]:
# Preview the enriched articles (RSS fields plus the extracted content columns).
df_with_content.head()

Unnamed: 0,source,title,link,published,summary,categories,full_text,authors,top_image,article_date
0,techcrunch,Commonwealth Fusion Systems books a $1B+ power...,https://techcrunch.com/2025/09/22/commonwealth...,"Mon, 22 Sep 2025 11:00:00 +0000",The fusion startup has inked a deal with Itali...,"[Technology, Startups]",Commonwealth Fusion Systems has agreed to sell...,"[Tim De Chant, Senior Reporter, Karyne Levy, C...",https://techcrunch.com/wp-content/uploads/2025...,2025-09-22
1,techcrunch,"Powered by India’s small businesses, UK fintec...",https://techcrunch.com/2025/09/21/powered-by-i...,"Mon, 22 Sep 2025 06:00:00 +0000",Tide serves over 1.6 million micro and small e...,"[Technology, Startups]",U.K.-based fintech Tide has entered the unicor...,"[Jagmeet Singh, Karyne Levy, Connie Loizos, Sa...",https://techcrunch.com/wp-content/uploads/2025...,2025-09-21
2,techcrunch,"VCs are still hiring MBAs, but firms are start...",https://techcrunch.com/2025/09/21/vcs-are-stil...,"Sun, 21 Sep 2025 22:23:40 +0000",The MBA-to-VC pipeline remains a very real thi...,"[Technology, Startups]",In Brief\n\nThe MBA-to-VC pipeline remains a v...,"[Connie Loizos, Techcrunch Events, Dominic-Mad...",https://techcrunch.com/wp-content/uploads/2015...,2025-09-21


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_duplicates(articles_df, text_column='full_text', title_column='title',
                    similarity_threshold=0.85):
    """
    Find duplicate articles using TF-IDF and cosine similarity

    Each article is represented by its lower-cased title (repeated twice so it
    weighs more) followed by its lower-cased text. Every pair of articles
    whose cosine similarity is strictly greater than the threshold is reported.

    Args:
        articles_df (pd.DataFrame): DataFrame containing articles
        text_column (str): Name of the column containing article text
        title_column (str): Name of the column containing article titles
        similarity_threshold (float): Threshold for considering articles as duplicates

    Returns:
        pd.DataFrame: One row per duplicate pair with the columns
            'article1_idx', 'article2_idx' (row positions, article1 < article2),
            'similarity_score', 'article1_title' and 'article2_title'.
            The frame is empty (but still has these columns) when no pair
            qualifies, when fewer than two articles are given, or when no
            article contains any usable term.
    """
    result_columns = ['article1_idx', 'article2_idx', 'similarity_score',
                      'article1_title', 'article2_title']

    # Fewer than two articles cannot form a pair; return early so an empty
    # frame never reaches the vectorizer.
    n_articles = len(articles_df)
    if n_articles < 2:
        return pd.DataFrame(columns=result_columns)

    # Missing titles/bodies (None/NaN) would turn the whole concatenated
    # string into NaN and crash the vectorizer, so treat them as empty text.
    titles = articles_df[title_column].fillna('').astype(str).str.lower()
    bodies = articles_df[text_column].fillna('').astype(str).str.lower()

    # Combine title and text with more weight on title
    combined_text = titles + " " + titles + " " + bodies

    # Create TF-IDF vectors for the articles
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    try:
        tfidf_matrix = tfidf.fit_transform(combined_text)
    except ValueError:
        # Raised when the vocabulary ends up empty (every document blank or
        # made only of stop words): there is nothing to compare.
        return pd.DataFrame(columns=result_columns)

    # Calculate cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Find duplicates: visit each unordered pair once (upper triangle)
    duplicates = []
    for i in range(n_articles):
        for j in range(i + 1, n_articles):
            score = cosine_sim[i][j]
            if score > similarity_threshold:
                duplicates.append({
                    'article1_idx': i,
                    'article2_idx': j,
                    'similarity_score': score,
                    'article1_title': articles_df.iloc[i][title_column],
                    'article2_title': articles_df.iloc[j][title_column]
                })

    return pd.DataFrame(duplicates, columns=result_columns)

# Batch Summarization and Meta-Summary Generation
Let's add functionality to:
1. Generate summaries for all articles
2. Create a meta-summary combining key points from all articles

In [None]:
def process_article_summaries(articles_df, text_column='full_text',
                              summary_column='summary'):
    """
    Generate summaries for all articles in the DataFrame

    Args:
        articles_df (pd.DataFrame): DataFrame containing the articles
        text_column (str): Column holding the text to summarize
            (default 'full_text')
        summary_column (str): Column that receives the generated summaries
            (default 'summary'). If the column already exists -- e.g. the
            'summary' field taken from the RSS feed -- its values are
            replaced in the returned copy; pass another name to keep both.

    Returns:
        pd.DataFrame: Copy of `articles_df` with `summary_column` set to the
            output of summarize_article() for each row. The input frame is
            left untouched.
    """
    print("Generating summaries for all articles...")

    # Create a copy to avoid modifying the original
    df = articles_df.copy()
    total = len(df)

    # Generate summaries. Progress is counted by position, not by index
    # label, so it stays correct for filtered or non-integer indexes.
    summaries = []
    for position, text in enumerate(df[text_column], start=1):
        print(f"Processing article {position}/{total}")
        summaries.append(summarize_article(text))

    # Add summaries to DataFrame
    df[summary_column] = summaries

    return df

def generate_meta_summary(articles_df, summary_column='summary', max_tokens=500):
    """
    Generate a meta-summary of all article summaries

    Summaries that are not strings, are blank, or equal the
    "Error generating summary" sentinel returned by summarize_article() are
    left out of the prompt so failed articles do not pollute the overview.

    Args:
        articles_df (pd.DataFrame): DataFrame containing articles with summaries
        summary_column (str): Name of the column containing summaries
        max_tokens (int): Token limit passed to the chat-completion request
            (default 500, the previously hard-coded value)

    Returns:
        str: Meta-summary text with surrounding whitespace stripped.
            Returns "" without contacting the API when no usable summary is
            available. Returns "Error generating meta-summary" when anything
            in the process raises (including a missing `summary_column`).
    """
    try:
        # Keep only real summaries (see docstring for what is dropped)
        usable_summaries = [
            summary
            for summary in articles_df[summary_column].tolist()
            if isinstance(summary, str)
            and summary.strip()
            and summary.strip() != "Error generating summary"
        ]

        if not usable_summaries:
            print("No usable article summaries to combine")
            return ""

        # Combine all summaries
        all_summaries = "\n\n".join(usable_summaries)

        # Create prompt for meta-summary
        prompt = f"""Below are summaries of multiple news articles. 
        Please create a comprehensive meta-summary that:
        1. Identifies major themes and trends
        2. Highlights key developments across articles
        3. Notes any contrasting viewpoints or developments
        4. Provides a high-level overview of the news landscape

        Article Summaries:
        {all_summaries}

        Meta-Summary:"""

        response = client.chat.completions.create(
            model="x-ai/grok-4-fast:free",
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens  # Longer for meta-summary
        )

        return response.choices[0].message.content.strip()

    except Exception as e:
        # Best-effort: report the failure and return a sentinel string.
        print(f"Error generating meta-summary: {str(e)}")
        return "Error generating meta-summary"

In [None]:
# Test batch summarization and meta-summary
# NOTE(review): `df_all` is assigned by the "Fetch articles from multiple
# sources" cell that appears further down in this notebook, so that cell has
# to be executed before this one.
if len(df_all) > 0:
    # Process all articles to get summaries
    # (one summarize_article() call per row)
    df_with_summaries = process_article_summaries(df_all)
    
    # Display some sample summaries
    print("\nSample Article Summaries:")
    # idx is the index label; idx + 1 is a 1-based number only while the
    # frame still has its default 0..n-1 index.
    for idx, row in df_with_summaries.head(2).iterrows():
        print(f"\nArticle {idx + 1}:")
        print(f"Title: {row['title']}")
        print(f"Summary length: {len(row['summary'])} characters")
        print("-" * 80)
        print(row['summary'])
        print("-" * 80)
    
    # Generate and display meta-summary
    print("\nGenerating meta-summary...")
    meta_summary = generate_meta_summary(df_with_summaries)
    
    print("\nMeta-Summary of All Articles:")
    print("-" * 80)
    print(meta_summary)
    print("-" * 80)
    
    # Add summaries to our main DataFrame
    # Rebinds the notebook-global df_all to the summarized copy; from here on
    # its 'summary' column holds the generated summaries.
    df_all = df_with_summaries

# Test Deduplication
Let's test our deduplication function with articles from multiple sources. We'll:
1. Fetch articles from different sources
2. Extract their content
3. Run the deduplication algorithm
4. Analyze the results

In [21]:
# Fetch articles from multiple sources
all_articles = []

# Get articles from each source
# (note: the loop variables source_name / news_items / item rebind the
# notebook-global names of the same name used by earlier test cells)
for source_name, source_info in SOURCES.items():
    print(f"Fetching articles from {source_name}...")
    
    # Get RSS feed items
    news_items = fetch_rss_feed(source_name, source_info)
    
    # Take first 5 articles from each source
    for item in news_items[:5]:
        # Extract full content and merge it into the RSS item in place
        article_content = extract_article_content(item['link'])
        item.update(article_content)
        all_articles.append(item)

# Create DataFrame with all articles
# `df_all` is the frame the batch-summarization test cell operates on.
df_all = pd.DataFrame(all_articles)

# Find duplicates (default columns and similarity threshold)
print("\nChecking for duplicates...")
duplicates_df = find_duplicates(df_all)

# Display results: one block per pair reported by find_duplicates
print(f"\nFound {len(duplicates_df)} potential duplicate pairs:")
if not duplicates_df.empty:
    for _, row in duplicates_df.iterrows():
        print(f"\nSimilarity Score: {row['similarity_score']:.3f}")
        print(f"Article 1: {row['article1_title']}")
        print(f"Article 2: {row['article2_title']}")
        print("-" * 80)

Fetching articles from economic_times...
Fetching articles from times_of_india...
Fetching articles from times_of_india...
Fetching articles from techcrunch...
Fetching articles from techcrunch...

Checking for duplicates...

Found 0 potential duplicate pairs:

Checking for duplicates...

Found 0 potential duplicate pairs:


In [24]:
# Test summarization with a sample article
# Relies on `df_with_content` built by the article-extraction test cell.
if len(df_with_content) > 0:
    # Take the first article as a test
    sample_article = df_with_content.iloc[0]
    
    # Show the source text so it can be compared with the summary below
    print("Original Article:")
    print("Title:", sample_article['title'])
    print("Length:", len(sample_article['full_text']), "characters")
    print("-" * 80)
    print(sample_article['full_text'][:500], "...\n")  # Show first 500 characters
    
    # Generate summary
    print("\nGenerating summary...")
    summary = summarize_article(sample_article['full_text'])
    
    print("\nSummary:")
    print("-" * 80)
    print(summary)
    print("\nSummary length:", len(summary), "characters")

Original Article:
Title: Commonwealth Fusion Systems books a $1B+ power deal for its future fusion reactor
Length: 4535 characters
--------------------------------------------------------------------------------
Commonwealth Fusion Systems has agreed to sell Italian energy company Eni more than $1 billion worth of power from its first fusion reactor.

The power plant will be built outside of Richmond, Virginia, close to some of the highest densities of data centers in the country. The 400-megawatt fusion reactor, called Arc, is expected to open in the early 2030s, CEO Bob Mumgaard said.

The Eni agreement is the second such deal for Commonwealth Fusion Systems (CFS). In June, Google said that it would b ...


Generating summary...

Summary:
--------------------------------------------------------------------------------
### Summary of the Article

Commonwealth Fusion Systems (CFS), a leading fusion energy company, has signed a deal to sell Italian energy giant Eni over $1 billion worth