In [15]:
# Generated Dataset Link : https://raw.githubusercontent.com/a-anuj/fods-case-study/refs/heads/main/webscraping_dataset.csv

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

rss_feeds = {
    # Times of India
    "TOI India": "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",
    "TOI World": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
    "TOI Business": "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms",
    "TOI Sports": "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms",

    # The Hindu
    "Hindu National": "https://www.thehindu.com/news/national/feeder/default.rss",
    "Hindu International": "https://www.thehindu.com/news/international/feeder/default.rss",
    "Hindu Business": "https://www.thehindu.com/business/feeder/default.rss",
    "Hindu SciTech": "https://www.thehindu.com/sci-tech/feeder/default.rss",

    # BBC News
    "BBC India": "http://feeds.bbci.co.uk/news/world/asia/india/rss.xml",
    "BBC Asia": "http://feeds.bbci.co.uk/news/world/asia/rss.xml",
    "BBC Business": "http://feeds.bbci.co.uk/news/business/rss.xml",
    "BBC Tech": "http://feeds.bbci.co.uk/news/technology/rss.xml"
}

news_data = []
sno = 1

for site, url in rss_feeds.items():
    print(f"Scraping {site}...")
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, features="xml")
        items = soup.find_all("item")
        for item in items:
            title = item.title.text.strip()
            link = item.link.text.strip()
            category = site.split()[1] if len(site.split()) > 1 else site
            pub_date = item.pubDate.text.strip() if item.pubDate else None
            # Convert to datetime if exists
            if pub_date:
                try:
                    pub_date = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z")
                except:
                    pass
            news_data.append([sno, title, site.split()[0], link, category, pub_date])
            sno += 1

news_df = pd.DataFrame(news_data, columns=["S.No", "Headline", "Source", "Link", "Category", "Timestamp"])
news_df.to_csv("webscraping_dataset.csv", index=False, encoding="utf-8-sig")
print(f"News scraping done. Collected {len(news_df)} articles.")



Scraping TOI India...
Scraping TOI World...
Scraping TOI Business...
Scraping TOI Sports...
Scraping Hindu National...
Scraping Hindu International...
Scraping Hindu Business...
Scraping Hindu SciTech...
Scraping BBC India...
Scraping BBC Asia...
Scraping BBC Business...
Scraping BBC Tech...
News scraping done. Collected 637 articles.


In [16]:
news_df.head()

Unnamed: 0,S.No,Headline,Source,Link,Category,Timestamp
0,1,'If Operation Sindoor was ongoing ... ': Congr...,TOI,https://timesofindia.indiatimes.com/india/if-o...,India,2025-09-29T10:30:49+05:30
1,2,Karur stampede: Rahul Gandhi dials Vijay; offe...,TOI,https://timesofindia.indiatimes.com/india/karu...,India,2025-09-29T10:24:38+05:30
2,3,HIV jab for 115 nations rests on Indian regula...,TOI,https://timesofindia.indiatimes.com/india/hiv-...,India,2025-09-29T05:17:52+05:30
3,4,Illegal betting: ED may attach assets of celebs,TOI,https://timesofindia.indiatimes.com/india/ille...,India,2025-09-29T04:19:59+05:30
4,5,"Cancer cases in India up by 26%, deaths 21% si...",TOI,https://timesofindia.indiatimes.com/india/canc...,India,2025-09-29T04:12:26+05:30


In [17]:
print("Number of rows : ",news_df.shape[0])
print("Number of features : ",news_df.shape[1])

Number of rows :  637
Number of features :  6
