In [1]:
import feedparser, os, re
import sqlite3

In [2]:
# Feed URLs polled for financial news. Order is significant: another cell
# picks a feed by position (RSS_FEEDS[2]).
RSS_FEEDS = [
    "https://www.marketbeat.com/feed/",
    "https://economictimes.indiatimes.com/markets/rssfeeds/1977021501.cms",
    "https://www.etnownews.com/feeds/gns-etn-companies.xml",
    "https://www.cnbctv18.com/commonfeeds/v1/cne/rss/business.xml",
    # Disabled feed, kept for reference:
    # "https://www.financeasia.com/rss/latest",
]

In [3]:
# Parse a single feed (position 2 in RSS_FEEDS) and display its first entry
# to inspect which fields are available on an entry.
parsed = feedparser.parse(RSS_FEEDS[2])
# NOTE(review): this raises IndexError if the parsed feed has no entries.
parsed.entries[0]

{'id': '153225966',
 'guidislink': False,
 'published': 'Sun, 30 Nov 2025 14:46:04 +0530',
 'published_parsed': time.struct_time(tm_year=2025, tm_mon=11, tm_mday=30, tm_hour=9, tm_min=16, tm_sec=4, tm_wday=6, tm_yday=334, tm_isdst=0),
 'title': 'Big legal win for THIS Tata Group company! NCLAT rejects insolvency plea, upholds NCLT order – Details',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://www.etnownews.com/feeds/gns-etn-companies.xml',
  'value': 'Big legal win for THIS Tata Group company! NCLAT rejects insolvency plea, upholds NCLT order – Details'},
 'summary': 'NCLAT upheld the findings of NCLT, saying it "has committed no mistake" in finding that the email chain between appellant Air Wave Technocrafts and Voltas "reflects ongoing disputes" regarding work certification, amounts, and supporting documentation.',
 'summary_detail': {'type': 'text/html',
  'language': None,
  'base': 'https://www.etnownews.com/feeds/gns-etn-companies.xml',
  'value'

In [4]:
def fetch_rss_feeds(feeds=None):
    """Parse each RSS feed and flatten its entries into plain dicts.

    Args:
        feeds: Optional iterable of feed URLs. When omitted, the module-level
            ``RSS_FEEDS`` list is used.

    Returns:
        list[dict]: One dict per feed entry with the keys ``source`` (the feed
        URL the entry came from), ``url``, ``title``, ``content`` and
        ``published_at``. ``title`` and ``content`` fall back to ``""`` and
        ``url`` / ``published_at`` fall back to ``None`` when the entry does
        not carry the corresponding field.
    """
    if feeds is None:
        feeds = RSS_FEEDS

    articles = []
    empty_feeds = []
    for feed in feeds:
        parsed = feedparser.parse(feed)
        # A feed that could not be fetched or parsed typically comes back with
        # no entries instead of raising, so track those explicitly.
        if not parsed.entries:
            empty_feeds.append(feed)
            continue
        for entry in parsed.entries:
            # Not every feed populates every field; read them all defensively
            # so one incomplete entry cannot abort the whole fetch.
            articles.append({
                "source": feed,
                "url": getattr(entry, "link", None),
                "title": getattr(entry, "title", ""),
                "content": getattr(entry, "summary", ""),
                "published_at": getattr(entry, "published", None)
            })

    if empty_feeds:
        # Do not claim success when some feeds produced nothing.
        print(f"Fetching News finished, but {len(empty_feeds)} feed(s) returned no entries: {empty_feeds}")
    else:
        print("Fetching News was successful!")
    return articles

# Fetch all feeds once and keep the result in `articles`.
articles = fetch_rss_feeds()
print(f"---\nTotal articles fetched: {len(articles)}")
# Display one record as a spot check.
# NOTE(review): the hard-coded index 23 raises IndexError when fewer than
# 24 articles were fetched.
articles[23]

Fetching News was successful!
---
Total articles fetched: 400


{'source': 'https://www.marketbeat.com/feed/',
 'url': 'https://www.marketbeat.com/stock-ideas/these-2-energy-titans-just-scored-major-wins-to-close-out-november/',
 'title': 'These 2 Energy Titans Just Scored Major Wins to Close Out November',
 'content': 'Constellation Energy and GE Vernova are two AI enablers that have soared in 2025. The companies received more positive news in November.',
 'published_at': 'Wed, 26 Nov 2025 13:21:00 GMT'}

In [5]:
def clean_text(text):
    """Return ``text`` with each whitespace run collapsed to one space and both ends trimmed."""
    return re.sub(r"\s+", " ", text).strip()

def standardize_article(a):
    """Build a new article dict whose ``title`` and ``content`` are whitespace-cleaned.

    ``source``, ``url`` and ``published_at`` are copied over unchanged; the
    input dict ``a`` is not modified.
    """
    # Pass-through fields first, so the key order matches the input schema.
    standardized = {key: a[key] for key in ("source", "url")}
    standardized["title"] = clean_text(a["title"])
    standardized["content"] = clean_text(a["content"])
    standardized["published_at"] = a["published_at"]
    return standardized

# Run every fetched article through standardize_article, collecting the
# results in a new list.
standardized_articles = list(map(standardize_article, articles))
print("All articles have been standardized")
standardized_articles[23]

All articles have been standardized


{'source': 'https://www.marketbeat.com/feed/',
 'url': 'https://www.marketbeat.com/stock-ideas/these-2-energy-titans-just-scored-major-wins-to-close-out-november/',
 'title': 'These 2 Energy Titans Just Scored Major Wins to Close Out November',
 'content': 'Constellation Energy and GE Vernova are two AI enablers that have soared in 2025. The companies received more positive news in November.',
 'published_at': 'Wed, 26 Nov 2025 13:21:00 GMT'}

In [6]:
# Ensure the target directory exists before SQLite tries to create the file.
os.makedirs("../data", exist_ok=True)

conn = sqlite3.connect("../data/financial_news.db")
print("Connected!")

# try/finally guarantees the connection is released even if the DDL fails.
try:
    cur = conn.cursor()
    # IF NOT EXISTS makes this cell safe to re-run against an existing database.
    cur.execute("""
CREATE TABLE IF NOT EXISTS raw_news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT,
            url TEXT,
            title TEXT,
            content TEXT,
            published_at TEXT
            )
""")

    conn.commit()
    cur.close()
finally:
    conn.close()

Connected!


In [7]:
def save_articles_to_db(article, db_path="../data/financial_news.db"):
    """Insert one article, or a batch of articles, into the ``raw_news`` table.

    Args:
        article: A mapping with the keys ``source``, ``url``, ``title``,
            ``content`` and ``published_at``. A list or tuple of such mappings
            is also accepted and is written in a single transaction.
        db_path: SQLite database file to write to. Defaults to the path this
            function has always used.

    Returns:
        int: Number of rows passed to SQLite for insertion.

    Raises:
        KeyError: If a dict article lacks one of the required keys. This is
            raised before the database is opened.
        sqlite3.Error: If the insert fails; the transaction is rolled back and
            the connection is still closed.

    Note:
        No de-duplication is done here; each call issues a plain INSERT.
    """
    # Anything that is not a list/tuple is treated as a single article, which
    # keeps the original one-article calling convention working.
    batch = article if isinstance(article, (list, tuple)) else [article]
    rows = [
        (
            item["source"],
            item["url"],
            item["title"],
            item["content"],
            item["published_at"],
        )
        for item in batch
    ]

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on error. It does not
        # close the connection, hence the surrounding try/finally.
        with conn:
            conn.executemany("""
        INSERT INTO raw_news (source, url, title, content, published_at)
        VALUES ( ?, ?, ?, ?, ?)
        """, rows)
    finally:
        conn.close()
    return len(rows)

# Persist the standardized articles one at a time; each call opens its own
# connection, inserts one row and commits.
for article in standardized_articles:
    save_articles_to_db(article)

print("All articles saved to db.")

All articles saved to db.
