In [1]:
# Registry of the news feeds this notebook polls, keyed by a short source id.
# Each row is (source id, RSS feed URL, category labels attached to every
# article fetched from that feed). Row order is the dict's iteration order.
SOURCES = {
    source_id: {"rss": feed_url, "categories": labels}
    for source_id, feed_url, labels in (
        (
            "economic_times",
            "https://economictimes.indiatimes.com/rssfeedstopstories.cms",
            ["Business", "Technology"],
        ),
        (
            "times_of_india",
            "https://timesofindia.indiatimes.com/rssfeedstopstories.cms",
            ["General News", "Business"],
        ),
        (
            "techcrunch",
            "https://techcrunch.com/feed/",
            ["Technology", "Startups"],
        ),
    )
}


In [2]:
import feedparser
from datetime import datetime
import pandas as pd

In [3]:
def fetch_rss_feed(source_name, source_info):
    """
    Fetch and parse the RSS feed of a single source.

    Args:
        source_name (str): Name of the source; stored on every item as 'source'
        source_info (dict): Must provide 'rss' (the feed URL) and 'categories'
            (the category labels to attach to every item from this feed)

    Returns:
        list: One dict per feed entry with the keys 'source', 'title', 'link',
            'published', 'summary' and 'categories'. Text fields an entry does
            not provide default to ''. Each item gets its own copy of the
            categories list.

    Raises:
        KeyError: If source_info lacks 'rss' or 'categories'
    """
    feed = feedparser.parse(source_info['rss'])
    categories = source_info['categories']

    return [
        {
            'source': source_name,
            'title': entry.get('title', ''),
            'link': entry.get('link', ''),
            'published': entry.get('published', ''),
            'summary': entry.get('summary', ''),
            # Copy per item: sharing one list object would make an edit to a
            # single article's categories show up on every other article and
            # in the caller's source_info as well.
            'categories': list(categories),
        }
        for entry in feed.entries
    ]

In [4]:
# Smoke-test the RSS fetcher on one source before looping over all of them.
# Other keys available in SOURCES: "economic_times", "techcrunch".
source_name =  "times_of_india"
news_items = fetch_rss_feed(source_name, SOURCES[source_name])

# One row per feed entry; the columns are the keys of the dicts built by
# fetch_rss_feed (source, title, link, published, summary, categories).
df = pd.DataFrame(news_items)
# Bare last expression so the notebook renders the first rows as a table.
df.head()

Unnamed: 0,source,title,link,published,summary,categories
0,times_of_india,Air India crash: SC questions preliminary repo...,https://timesofindia.indiatimes.com/india/air-...,"Mon, 22 Sep 2025 12:21:21 +0530",The Supreme Court has expressed serious concer...,"[General News, Business]"
1,times_of_india,'Family or career?': H-1B workers face difficu...,https://timesofindia.indiatimes.com/nri/us-can...,"Mon, 22 Sep 2025 09:42:42 +0530",,"[General News, Business]"
2,times_of_india,"UP man strangles girlfriend, dumps body in Yam...",https://timesofindia.indiatimes.com/city/kanpu...,"Mon, 22 Sep 2025 10:14:28 +0530","Kanpur police have apprehended two men, Suraj ...","[General News, Business]"
3,times_of_india,BJP shares video of Tirumala temple 'loot'; cl...,https://timesofindia.indiatimes.com/india/inve...,"Mon, 22 Sep 2025 12:25:55 +0530",,"[General News, Business]"
4,times_of_india,Asia Cup scenarios: How Pak can still qualify ...,https://timesofindia.indiatimes.com/sports/cri...,"Mon, 22 Sep 2025 12:23:19 +0530",India and Bangladesh have commenced their Supe...,"[General News, Business]"


In [5]:
from newspaper import Article
import time

def extract_article_content(url, delay=1):
    """
    Extract article content using newspaper3k

    Best-effort: any failure while downloading or parsing is reported with a
    printed message and an empty result is returned instead of raising, so a
    single bad article does not abort a batch run.

    Args:
        url (str): URL of the article
        delay (float): Seconds to wait before the request, to be respectful to
            the servers. Defaults to 1; pass 0 to disable the pause.

    Returns:
        dict: Dictionary with the keys 'full_text' (str), 'authors' (list),
            'top_image' (str) and 'article_date' (publish date or None).
            On an empty URL or any error the values are '', [], '' and None.
    """
    # Built fresh on every call so callers can safely mutate what they get back.
    empty_content = {
        'full_text': '',
        'authors': [],
        'top_image': '',
        'article_date': None
    }

    # A missing link cannot be downloaded; skip the pause and the request.
    if not url:
        print(f"Error processing {url!r}: empty URL")
        return empty_content

    try:
        # Add a small delay to be respectful to the servers
        if delay > 0:
            time.sleep(delay)

        article = Article(url)
        article.download()
        article.parse()

        return {
            'full_text': article.text,
            'authors': article.authors,
            'top_image': article.top_image,
            'article_date': article.publish_date
        }
    except Exception as e:
        # Deliberately broad: scraping failures are expected and non-fatal here.
        print(f"Error processing {url}: {str(e)}")
        return empty_content

  _digits = re.compile('\d')
  search_str = re.sub('[bB][yY][\:\s]|[fF]rom[\:\s]', '', search_str)
  name_tokens = re.split("[^\w\'\-\.]", search_str)
  kwargs = {'attr': 'type', 'value': 'application\/rss\+xml'}
  'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'


# Test Article Extraction
Let's try our enhanced news fetching with article content extraction. We'll start with a small sample to make sure everything works correctly.

In [6]:
# Test with a few articles from one source
source_name = "techcrunch"  # TechCrunch tends to have more consistent article structure
news_items = fetch_rss_feed(source_name, SOURCES[source_name])

# Take first 3 articles for testing (extract_article_content pauses before
# each download, so a small sample keeps this cell quick)
sample_news = news_items[:3]

# Add article content.
# The slice above copies the list but not the dicts, so update() also
# enriches the corresponding entries of news_items in place.
for item in sample_news:
    article_content = extract_article_content(item['link'])
    item.update(article_content)

# Convert to DataFrame: RSS columns plus full_text, authors, top_image, article_date
df_with_content = pd.DataFrame(sample_news)

# Display results
print("Number of articles processed:", len(df_with_content))
print("\nColumns available:", df_with_content.columns.tolist())
print("\nSample article details:")
# idx is the default 0-based row label, hence the +1 for human-friendly numbering
for idx, row in df_with_content.iterrows():
    print(f"\nArticle {idx + 1}:")
    print(f"Title: {row['title']}")
    print(f"Authors: {row['authors']}")
    # A length of 0 means extraction failed and the empty fallback was returned
    print(f"Text length: {len(row['full_text'])} characters")
    print("-" * 50)

Number of articles processed: 3

Columns available: ['source', 'title', 'link', 'published', 'summary', 'categories', 'full_text', 'authors', 'top_image', 'article_date']

Sample article details:

Article 1:
Title: Powered by India’s small businesses, UK fintech Tide becomes a TPG-backed unicorn
Authors: ['Jagmeet Singh', 'Connie Loizos', 'Sarah Perez', 'Julie Bort', 'Maxwell Zeff', 'Ivan Mehta', '--C-Author-Card-Image-Size Align-Items Center Display Flex Gap Var', 'Media', 'Min-Width', '--C-Author-Card-Image-Size']
Text length: 4837 characters
--------------------------------------------------

Article 2:
Title: VCs are still hiring MBAs, but firms are starting to need other experience more
Authors: ['Connie Loizos', 'Techcrunch Events', 'Dominic-Madori Davis', '.Post-Authors-List__Authors --Font-Size Var', 'Align-Items Center Display Flex Gap Var', '.Post-Authors-List__Authors .Post-Authors-List__Author-Thumbs Display Flex Flex-Shrink Margin Padding .Post-Authors-List__Authors .Post-

In [7]:
df_with_content.head()

Unnamed: 0,source,title,link,published,summary,categories,full_text,authors,top_image,article_date
0,techcrunch,"Powered by India’s small businesses, UK fintec...",https://techcrunch.com/2025/09/21/powered-by-i...,"Mon, 22 Sep 2025 06:00:00 +0000",Tide serves over 1.6 million micro and small e...,"[Technology, Startups]",U.K.-based fintech Tide has entered the unicor...,"[Jagmeet Singh, Connie Loizos, Sarah Perez, Ju...",https://techcrunch.com/wp-content/uploads/2025...,2025-09-21
1,techcrunch,"VCs are still hiring MBAs, but firms are start...",https://techcrunch.com/2025/09/21/vcs-are-stil...,"Sun, 21 Sep 2025 22:23:40 +0000",The MBA-to-VC pipeline remains a very real thi...,"[Technology, Startups]",In Brief\n\nThe MBA-to-VC pipeline remains a v...,"[Connie Loizos, Techcrunch Events, Dominic-Mad...",https://techcrunch.com/wp-content/uploads/2015...,2025-09-21
2,techcrunch,Trump says Lachlan and Rupert Murdoch might in...,https://techcrunch.com/2025/09/21/white-house-...,"Sun, 21 Sep 2025 19:43:34 +0000",The Trump administration has been talking up a...,"[Technology, Startups]",The Trump administration has been talking up a...,"[Anthony Ha, Connie Loizos, Sarah Perez, Julie...",https://techcrunch.com/wp-content/uploads/2025...,2025-09-21


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_duplicates(articles_df, text_column='full_text', title_column='title', 
                   similarity_threshold=0.85):
    """
    Find duplicate articles using TF-IDF and cosine similarity

    Each article is represented by its title (repeated twice so title words
    weigh more) followed by its body text. Every unordered pair of articles
    whose cosine similarity is strictly greater than the threshold is reported.

    Args:
        articles_df (pd.DataFrame): DataFrame containing articles
        text_column (str): Name of the column containing article text
        title_column (str): Name of the column containing article titles
        similarity_threshold (float): Threshold for considering articles as duplicates

    Returns:
        pd.DataFrame: One row per duplicate pair with the columns
            'article1_idx', 'article2_idx' (0-based row positions, idx1 < idx2),
            'similarity_score', 'article1_title' and 'article2_title'.
            The columns are present even when no pair is found.
    """
    result_columns = ['article1_idx', 'article2_idx', 'similarity_score',
                      'article1_title', 'article2_title']

    n_articles = len(articles_df)
    # Fewer than two articles cannot form a pair; skip vectorisation, which
    # would otherwise fail on an empty corpus.
    if n_articles < 2:
        return pd.DataFrame(columns=result_columns)

    # Missing titles/text (NaN/None) would turn the whole combined string into
    # NaN and break the vectoriser, so treat them as empty strings.
    titles = articles_df[title_column].fillna('').astype(str)
    texts = articles_df[text_column].fillna('').astype(str)

    # Combine title and text with more weight on title
    combined_text = (titles + " " + titles + " " + texts).str.lower()

    # Create TF-IDF vectors for the articles
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(combined_text)

    # Calculate cosine similarity (n_articles x n_articles)
    cosine_sim = np.asarray(cosine_similarity(tfidf_matrix))

    # Strict upper triangle = every unordered pair once, in the same order as
    # a nested "for i / for j > i" loop.
    first_idx, second_idx = np.triu_indices(n_articles, k=1)
    pair_scores = cosine_sim[first_idx, second_idx]
    is_duplicate = pair_scores > similarity_threshold

    # Report the titles exactly as stored in the input frame
    original_titles = articles_df[title_column].to_numpy()
    duplicates = [
        {
            'article1_idx': int(i),
            'article2_idx': int(j),
            'similarity_score': float(score),
            'article1_title': original_titles[i],
            'article2_title': original_titles[j]
        }
        for i, j, score in zip(first_idx[is_duplicate],
                               second_idx[is_duplicate],
                               pair_scores[is_duplicate])
    ]

    return pd.DataFrame(duplicates, columns=result_columns)

# Test Deduplication
Let's test our deduplication function with articles from multiple sources. We'll:
1. Fetch articles from different sources
2. Extract their content
3. Run the deduplication algorithm
4. Analyze the results

In [9]:
# Fetch articles from multiple sources
all_articles = []

# Get articles from each source.
# Note: this loop rebinds the notebook-level source_name / news_items used by
# earlier cells.
for source_name, source_info in SOURCES.items():
    print(f"Fetching articles from {source_name}...")
    
    # Get RSS feed items
    news_items = fetch_rss_feed(source_name, source_info)
    
    # Take first 5 articles from each source (fewer if the feed is shorter);
    # every article costs one download, so the cap bounds the run time
    for item in news_items[:5]:
        # Extract full content and merge it into the RSS record in place
        article_content = extract_article_content(item['link'])
        item.update(article_content)
        all_articles.append(item)

# Create DataFrame with all articles (one row per article, all sources mixed)
df_all = pd.DataFrame(all_articles)

# Find duplicates using the default title/full_text columns and threshold
print("\nChecking for duplicates...")
duplicates_df = find_duplicates(df_all)

# Display results: one block per pair scoring above the threshold
print(f"\nFound {len(duplicates_df)} potential duplicate pairs:")
if not duplicates_df.empty:
    for _, row in duplicates_df.iterrows():
        print(f"\nSimilarity Score: {row['similarity_score']:.3f}")
        print(f"Article 1: {row['article1_title']}")
        print(f"Article 2: {row['article2_title']}")
        print("-" * 80)

Fetching articles from economic_times...
Fetching articles from times_of_india...
Fetching articles from times_of_india...
Fetching articles from techcrunch...
Fetching articles from techcrunch...

Checking for duplicates...

Found 0 potential duplicate pairs:

Checking for duplicates...

Found 0 potential duplicate pairs:
