In [7]:
import feedparser, os, re
import sqlite3

In [2]:
# Feed URLs handed to feedparser.parse(); each URL is also stored as the
# "source" of every article fetched from it.
RSS_FEEDS = [
    "https://www.marketbeat.com/feed/", 
    "https://economictimes.indiatimes.com/markets/rssfeeds/1977021501.cms", 
    "https://www.etnownews.com/feeds/gns-etn-companies.xml", 
    "https://www.cnbctv18.com/commonfeeds/v1/cne/rss/business.xml", 
    # Disabled feed (commented out, so it is never fetched):
    #"https://www.financeasia.com/rss/latest"
]

In [3]:
# Exploration: parse a single feed (index 2 of RSS_FEEDS) and display its first
# entry to see which fields feedparser exposes.
# NOTE: indexing entries[0] raises IndexError if the feed returned no entries.
parsed = feedparser.parse(RSS_FEEDS[2])
parsed.entries[0]

{'id': '153220552',
 'guidislink': False,
 'published': 'Sat, 29 Nov 2025 08:58:56 +0530',
 'published_parsed': time.struct_time(tm_year=2025, tm_mon=11, tm_mday=29, tm_hour=3, tm_min=28, tm_sec=56, tm_wday=5, tm_yday=333, tm_isdst=0),
 'title': 'Bandhan Bank NPA sale of Rs 7000 crore approved by board',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://www.etnownews.com/feeds/gns-etn-companies.xml',
  'value': 'Bandhan Bank NPA sale of Rs 7000 crore approved by board'},
 'summary': "Bandhan Bank NPA sale: Bandhan Bank plans to sell NPAs and written-off loans worth Rs 6,931.31 crore using the Swiss Challenge and auctions. Share's long-term returns remain poor.",
 'summary_detail': {'type': 'text/html',
  'language': None,
  'base': 'https://www.etnownews.com/feeds/gns-etn-companies.xml',
  'value': "Bandhan Bank NPA sale: Bandhan Bank plans to sell NPAs and written-off loans worth Rs 6,931.31 crore using the Swiss Challenge and auctions. Share's long-term r

In [4]:
def fetch_rss_feeds(feeds=None):
    """Download RSS feeds and flatten their entries into plain dicts.

    Args:
        feeds: Optional iterable of feed URLs. When omitted, the module-level
            ``RSS_FEEDS`` list is used.

    Returns:
        list[dict]: One dict per feed entry with the keys ``source`` (the feed
        URL), ``url``, ``title``, ``content`` and ``published_at``. A missing
        title or summary becomes ``""``; a missing link or publication date
        becomes ``None``.
    """
    if feeds is None:
        feeds = RSS_FEEDS

    articles = []
    empty_feeds = []
    for feed in feeds:
        parsed = feedparser.parse(feed)
        if not parsed.entries:
            # Remember feeds that produced nothing so the final status
            # message does not claim success for them.
            empty_feeds.append(feed)
            continue
        for entry in parsed.entries:
            # Entry fields are optional in RSS/Atom; plain attribute access
            # raises AttributeError when one is absent, so read them all
            # defensively.
            articles.append({
                "source": feed,
                "url": getattr(entry, "link", None),
                "title": getattr(entry, "title", ""),
                "content": getattr(entry, "summary", ""),
                "published_at": getattr(entry, "published", None)
            })

    if empty_feeds:
        print(f"Warning: no entries returned by {len(empty_feeds)} feed(s): {empty_feeds}")
    else:
        print("Fetching News was successful!")
    return articles

# Fetch every configured feed once; later cells read this `articles` list.
articles = fetch_rss_feeds()
print(f"---\nTotal articles fetched: {len(articles)}")
# Spot-check a single record.
# NOTE: index 23 raises IndexError when fewer than 24 articles were fetched.
articles[23]

Fetching News was successful!
---
Total articles fetched: 400


{'source': 'https://www.marketbeat.com/feed/',
 'url': 'https://www.marketbeat.com/originals/from-science-project-to-solvent-werides-761-revenue-surge/',
 'title': 'From Science Project to Solvent: WeRide’s 761% Revenue Surge',
 'content': 'WeRide stock rallied significantly as the company reported a massive surge in robotaxi revenue and successfully expanded its operations in the Middle East.',
 'published_at': 'Tue, 25 Nov 2025 16:48:00 GMT'}

In [11]:
def clean_text(text):
    """Collapse whitespace runs into single spaces and trim both ends.

    Args:
        text: The string to normalize. ``None`` is accepted and treated as
            empty text, so a missing field does not raise ``TypeError``.

    Returns:
        str: The normalized text (``""`` for ``None`` or whitespace-only input).
    """
    if text is None:
        return ""
    # \s+ also matches newlines and tabs, so multi-line text becomes one line.
    return re.sub(r"\s+", " ", text).strip()

def standardize_article(a):
    """Return a copy of article ``a`` with its free-text fields cleaned.

    ``title`` and ``content`` are passed through ``clean_text``; ``source``,
    ``url`` and ``published_at`` are copied unchanged. The input dict is not
    modified, and a missing key raises ``KeyError``.
    """
    text_fields = ("title", "content")
    record = {}
    for field in ("source", "url", "title", "content", "published_at"):
        value = a[field]
        if field in text_fields:
            value = clean_text(value)
        record[field] = value
    return record

# Build a new list of cleaned records; the `articles` list is not modified.
standardized_articles = [standardize_article(a) for a in articles]
print("All articles have been standardized")
# Spot-check the same index that was previewed for the raw `articles` list.
# NOTE: raises IndexError when fewer than 24 articles are available.
standardized_articles[23]

All articles have been standardized


{'source': 'https://www.marketbeat.com/feed/',
 'url': 'https://www.marketbeat.com/originals/from-science-project-to-solvent-werides-761-revenue-surge/',
 'title': 'From Science Project to Solvent: WeRide’s 761% Revenue Surge',
 'content': 'WeRide stock rallied significantly as the company reported a massive surge in robotaxi revenue and successfully expanded its operations in the Middle East.',
 'published_at': 'Tue, 25 Nov 2025 16:48:00 GMT'}

In [12]:
# Create the SQLite database file (and its parent directory) and make sure the
# raw_news table exists. Safe to re-run: both the directory creation and the
# CREATE TABLE statement are no-ops when their target already exists.
os.makedirs("../data", exist_ok=True)

conn = sqlite3.connect("../data/financial_news.db")
print("Connected!")

# try/finally guarantees the cursor and connection are released even if the
# DDL statement raises, so a failed run does not leave the file handle open.
try:
    cur = conn.cursor()
    try:
        cur.execute("""
CREATE TABLE IF NOT EXISTS raw_news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT,
            url TEXT,
            title TEXT,
            content TEXT,
            published_at TEXT
            )
""")
        conn.commit()
    finally:
        cur.close()
finally:
    conn.close()

Connected!


In [13]:
def save_articles_to_db(article, db_path="../data/financial_news.db"):
    """Insert one article into the ``raw_news`` table, skipping known URLs.

    Args:
        article: Mapping with the keys ``source``, ``url``, ``title``,
            ``content`` and ``published_at``.
        db_path: Path of the SQLite database file. Defaults to the notebook's
            database, so existing single-argument calls keep working.

    Raises:
        KeyError: If ``article`` lacks one of the required keys.
        sqlite3.Error: If the insert fails (for example, the table is missing).
            The transaction is rolled back and the connection is still closed.
    """
    params = (
        article["source"],
        article["url"],
        article["title"],
        article["content"],
        article["published_at"],
        # Bound a second time for the duplicate check in the WHERE clause.
        article["url"],
    )

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back if execute() raises.
        with conn:
            # INSERT ... SELECT ... WHERE NOT EXISTS makes re-running the
            # notebook idempotent: a URL that is already stored is not inserted
            # again. A NULL url never compares equal, so such rows are always
            # inserted.
            conn.execute("""
                INSERT INTO raw_news (source, url, title, content, published_at)
                SELECT ?, ?, ?, ?, ?
                WHERE NOT EXISTS (SELECT 1 FROM raw_news WHERE url = ?)
                """, params)
    finally:
        # The transaction context above does not close the connection.
        conn.close()

# Persist the standardized articles by calling save_articles_to_db once per
# article.
for article in standardized_articles:
    save_articles_to_db(article)

print("All articles saved to db.")

All articles saved to db.
