# News Feed Reader
Source: https://timesofindia.indiatimes.com/rss.cms

In [14]:
#Imports
import feedparser
import requests
from bs4 import BeautifulSoup
import hashlib
from datetime import date, datetime

import os
import shutil
import re
import time 
import html

## Read Selected News feeds and extract detail content

Below is the list of news feeds available from the Times of India. Some of the topics have been disabled to keep the news content size under control and to help train the SLM faster.

In [15]:
# Base URL of the Times of India site. It already ends with "/", so the feed
# paths below are written without a leading "/" to avoid "//" in the final URL.
rss_feed_base_url = "https://timesofindia.indiatimes.com/"

# Topic name -> RSS feed URL. Commented-out topics are disabled; uncomment an
# entry to include that topic in the download.
rss_feed_urls = {
    "India": f"{rss_feed_base_url}rssfeeds/-2128936835.cms",
    "World": f"{rss_feed_base_url}rssfeeds/296589292.cms",
    #"NRI": f"{rss_feed_base_url}rssfeeds/7098551.cms",
    # NOTE(review): Business uses the same feed id as NRI - verify before enabling.
    #"Business": f"{rss_feed_base_url}rssfeeds/7098551.cms",
    #"US": f"{rss_feed_base_url}rssfeeds_us/72258322.cms",
    #"Cricket": f"{rss_feed_base_url}rssfeeds/54829575.cms",
    #"Sports": f"{rss_feed_base_url}rssfeeds/4719148.cms",
    #"Science": f"{rss_feed_base_url}rssfeeds/-2128672765.cms",
    "Environment": f"{rss_feed_base_url}rssfeeds/2647163.cms",
    "Tech": f"{rss_feed_base_url}rssfeeds/66949542.cms",
    "Education": f"{rss_feed_base_url}rssfeeds/913168846.cms",
    #"Entertainment": f"{rss_feed_base_url}rssfeeds/1081479906.cms",
    #"Life & style": f"{rss_feed_base_url}rssfeeds/2886704.cms",
    # NOTE(review): Auto uses the same feed id as Life & style - verify before enabling.
    #"Auto": f"{rss_feed_base_url}rssfeeds/2886704.cms",
}

In [16]:

# Function to fetch and parse RSS feed
def fetch_rss_feed(feed_url):
    """Parse the RSS feed at ``feed_url`` and return the parsed feed's ``entries``."""
    return feedparser.parse(feed_url).entries

# Function to download content from a URL
def download_content(url):
    """Download ``url`` and return the text of the page with markup removed.

    The request uses a 10 second timeout. Any exception raised while
    requesting or parsing the page is printed and ``None`` is returned.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        # Extract main content (this may vary depending on the website structure)
        page_text = BeautifulSoup(page.content, 'html.parser').get_text()
    except Exception as err:
        print(f"Failed to fetch content from {url}: {err}")
        return None
    return page_text
    
# Function to create news content folders
def get_or_create_news_collecion_folder(topic="unk", base_folder_path="../data/news_content"):
    """Make sure today's news folder and its aggregate files exist.

    The folder is ``<base_folder_path>/<YYYYMMDD>`` for today's date. Inside it
    two empty files are created when missing: the all-topics aggregate
    ``_<YYYYMMDD>_all_news.txt`` and the per-topic aggregate
    ``_<YYYYMMDD>_<topic>_news.txt``. Existing files are never truncated.

    Args:
        topic: Topic name used in the per-topic aggregate file name.
        base_folder_path: Directory under which the dated folder is created.

    Returns:
        Tuple ``(todays_news_folder, todays_news_topic_file, todays_news_file)``
        of path strings.
    """
    today_date_string = date.today().strftime("%Y%m%d")
    todays_news_folder = f"{base_folder_path}/{today_date_string}"
    print(todays_news_folder)

    todays_news_file = f"{todays_news_folder}/_{today_date_string}_all_news.txt"
    todays_news_topic_file = f"{todays_news_folder}/_{today_date_string}_{topic}_news.txt"

    # exist_ok avoids a crash when the folder appears between check and creation.
    os.makedirs(todays_news_folder, exist_ok=True)

    # Create both aggregate files independently of whether the folder was new:
    # a pre-existing folder must not leave the all-news file missing.
    # Append mode creates a missing file and leaves existing content untouched.
    for aggregate_path in (todays_news_file, todays_news_topic_file):
        with open(aggregate_path, "a", encoding="utf-8"):
            pass

    return todays_news_folder, todays_news_topic_file, todays_news_file


In [17]:
import sys
import json

def _write_sections(path, mode, sections):
    """Write every section to ``path`` (UTF-8), each followed by a blank line."""
    with open(path, mode, encoding="utf-8") as file:
        for section in sections:
            file.write(f"{section}\n\n")


def read_feed_and_download_content(topic, rss_url, rss_feed_count):
    """Download the articles of one RSS feed and save their text for today.

    For every feed entry the linked page is downloaded and trimmed so that it
    starts at the last occurrence of the "start of article" (SOA) text - the
    first ten words of the entry's summary, or of its title when the summary
    has no words - and ends just before the "End of Article" marker when that
    marker is present. Entries without a link, entries whose page could not be
    downloaded, and entries whose SOA text is not found on the page are
    skipped.

    Each kept article is written to its own file in today's folder and is
    appended to the per-topic and all-news aggregate files.

    Args:
        topic: Topic name; used in file names and in the MCQ instruction text.
        rss_url: URL of the RSS feed to read.
        rss_feed_count: One-element list; ``rss_feed_count[0]`` is incremented
            once for every article saved.

    Returns:
        List of dicts, one per saved article, with the keys ``link``,
        ``topic``, ``title``, ``instruction``, ``input`` and ``output``.
    """
    news_items = fetch_rss_feed(rss_url)

    todays_news_folder, todays_news_topic_file, todays_news_file = get_or_create_news_collecion_folder(topic)

    mcq_generation_instructions = []
    for item in news_items:
        title = item.get("title", "No Title")

        summary = html.unescape(item.get("description", "No Description"))
        # get_text()'s first positional argument is the separator placed between
        # text nodes, so the summary must not be passed to it.
        summary = BeautifulSoup(summary, 'html.parser').get_text()

        link = item.get("link", None)
        print(f"Fetching: {title} => {link}")
        print(f"Description/Summary=>{summary}")
        if not link:
            print("No link found for this item.")
            continue

        content = download_content(link)
        if not content:
            # download_content has already reported the failure.
            continue

        # Start of article: first ten words of the summary, falling back to the
        # title when the summary is empty or whitespace only.
        soa = " ".join((summary.split() or title.split())[:10])
        print("SOA=>", soa)
        # Escape the text so punctuation such as "(", "?" or "$" is matched
        # literally instead of being interpreted as regex syntax. An empty SOA
        # would match everywhere, so it is treated as "not found".
        soa_matches = list(re.finditer(re.escape(soa), content)) if soa else []
        print("###SOA Matches:", len(soa_matches), ":", soa_matches)
        if not soa_matches:
            print("$$$$ Not able to process SOA for ", soa)
            print("=== Ignoring the news content as of now ===")
            continue
        # The last occurrence is used as the start of the article body.
        content = content[soa_matches[-1].start():]

        # Drop everything from the end-of-article marker onwards, if present.
        content = content.partition("End of Article")[0]

        # Save content to a file (optional)
        short_title = title[:50].replace(' ', '_')
        short_title = re.sub(r'[^A-Za-z0-9 _-]', "_", short_title).strip("_")
        filename = f"{todays_news_folder}/{topic}_{short_title}.txt"

        # Write news content to individual file
        _write_sections(filename, "w", (link, title, summary, content))
        # Aggregate news content topicwise
        _write_sections(todays_news_topic_file, "a", (title, summary, content))
        # Aggregate news content in single file
        _write_sections(todays_news_file, "a", (title, summary, content))

        print(f"Saved: {filename}")
        rss_feed_count[0] = rss_feed_count[0] + 1

        mcq_generation_instructions.append({
            "link": f"{link}",
            "topic": f"{topic}",
            "title": f"{title}",
            "instruction": f"Generate a Multiple Choice Question with answer on the below {topic} input context.",
            "input": f"{content}",
            "output": "Question: \nOptions: \n[A]  \n[B]  \n[C] \n[D] \nCorrect Answer: ",
        })

    return mcq_generation_instructions



In [18]:
# Number of feed URLs processed, and the number of articles saved (kept in a
# one-element list so read_feed_and_download_content can update it in place).
rss_feed_urls_count = 0
rss_feed_count = [0]

mcq_generation_instructions = []
for topic, rss_feed_url in rss_feed_urls.items():
    print(topic, "=>", rss_feed_url)
    mcq_generation_instructions_ = read_feed_and_download_content(topic, rss_feed_url, rss_feed_count)
    mcq_generation_instructions.extend(mcq_generation_instructions_)
    rss_feed_urls_count += 1

base_folder_path = "../data/news_content"
today_date = date.today()
today_date_string = today_date.strftime("%Y%m%d")
todays_news_folder = f"{base_folder_path}/{today_date_string}"
print(todays_news_folder)

# The folder is normally created while the feeds are read; make sure it exists
# even when no feed was processed.
os.makedirs(todays_news_folder, exist_ok=True)

mcq_generation_instructions_file = f"{todays_news_folder}/__{today_date_string}_ca_mcqs_generation_instructions.json"
# Open with "w": appending a second JSON array on a re-run would leave the file
# with two concatenated documents, which is not valid JSON.
with open(mcq_generation_instructions_file, "w", encoding="utf-8") as f:
    json.dump(mcq_generation_instructions, f, indent=4)  # indent=4 makes it pretty

India => https://timesofindia.indiatimes.com//rssfeeds/-2128936835.cms
../data/news_content/20250823
Fetching: J&K: Hizbul Mujahideen sleeper cell busted; OGW arrested, linked to cross-border terror handler => https://timesofindia.indiatimes.com/india/jk-hizb-ul-mujahideen-sleeper-cell-busted-ogw-arrested-linked-to-cross-border-terror-handler/articleshow/123470196.cms
Description/Summary=>Jammu and Kashmir Police's SIA made a significant breakthrough by apprehending Altaf Hussain Wagay, an over-ground worker linked to Hizbul Mujahideen. Wagay, a resident of Shopian, was operating as a sleeper cell, facilitating terror activities in the region. Investigations revealed his direct links with a Hizb handler operating from across the border, orchestrating anti-national activities.
SOA=> Jammu and Kashmir Police's SIA made a significant breakthrough by
###SOA Matches: 1 : [<re.Match object; span=(1381, 1446), match="Jammu and Kashmir Police's SIA made a significant>]
Saved: ../data/news_cont

In [19]:
# rss_feed_count is a one-element list used as a mutable counter; print the
# number it holds rather than the list itself.
print(f"From {rss_feed_base_url} Successfull downloaded {rss_feed_count[0]} news feeds from {rss_feed_urls_count} URLs.")

From https://timesofindia.indiatimes.com/ Successfull downloaded [74] news feeds from 5 URLs.
