<a href="https://colab.research.google.com/github/atharvavyas1/Finance-N8N-project/blob/main/YahooFinanceRSS_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ============================================
# FULL ARTICLE CONTENT EXTRACTION
# Extends RSS scraper to fetch complete article text
# ============================================

import feedparser
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

class YahooFinanceFullArticleScraper:
    """
    Extracts full article content from Yahoo Finance RSS feeds.

    The RSS feed for a ticker is used for article discovery; each article URL
    found in the feed is then downloaded and its main text is extracted from
    the HTML.

    Args:
        timeout (float): Seconds to wait for each article HTTP request.
            Defaults to 10, the value that was previously hard-coded.
    """

    # CSS selectors tried in order, from most to least specific. 'main' is the
    # last-resort fallback when none of the article-like selectors match.
    _CONTENT_SELECTORS = (
        'article',
        '[data-module="ArticleBody"]',
        '.caas-body',
        '.article-body',
        '[class*="article"]',
        '[class*="content"]',
        'main',
    )

    # Elements stripped from the chosen container before its text is read.
    _NOISE_TAGS = ("script", "style", "nav", "footer", "header")

    def __init__(self, timeout=10):
        self.base_rss_url = "https://feeds.finance.yahoo.com/rss/2.0/headline?s={}&region=US&lang=en-US"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        self.timeout = timeout

    def _extract_article_text(self, url):
        """
        Extract full article text from a Yahoo Finance article URL.

        Args:
            url (str): Article URL to download.

        Returns:
            str | None: Whitespace-normalized article text, or None when the
            URL is empty, the request fails, or no candidate container yields
            any text.
        """
        if not url:
            return None

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            for selector in self._CONTENT_SELECTORS:
                container = soup.select_one(selector)
                if container is None:
                    continue

                # Remove non-content elements so they do not pollute the text
                for tag in container(list(self._NOISE_TAGS)):
                    tag.decompose()

                # Extract text and collapse runs of whitespace
                text = container.get_text(separator=' ', strip=True)
                text = re.sub(r'\s+', ' ', text).strip()

                # An empty match (e.g. a placeholder element) is not a result:
                # keep trying the remaining selectors instead of giving up.
                if text:
                    return text

            return None

        except Exception:
            # Deliberate best-effort boundary: a failure on one article
            # (network error, HTTP error status, parse problem) must not
            # abort the whole scraping run; the caller records the article
            # with has_full_text=False.
            return None

    @staticmethod
    def _parse_published(entry):
        """Return a datetime built from entry.published_parsed, or None.

        The result is a naive datetime made from the first six fields of the
        parsed time tuple. NOTE(review): feedparser documents this tuple as
        UTC -- confirm before comparing against local times.
        """
        parsed = getattr(entry, 'published_parsed', None)
        if not parsed:
            return None
        try:
            return datetime(*parsed[:6])
        except (TypeError, ValueError, OverflowError):
            return None

    def get_full_articles_for_ticker(self, ticker, max_articles=10, verbose=True):
        """
        Get full article content for a ticker symbol

        Args:
            ticker (str): Stock ticker symbol (surrounding whitespace is
                ignored; the symbol is upper-cased)
            max_articles (int | None): Maximum number of articles to fetch.
                Zero or a negative value fetches nothing; None means no limit.
            verbose (bool): Print progress messages

        Returns:
            list: One dict per processed feed entry with the keys 'ticker',
            'title', 'link', 'rss_description', 'published',
            'published_datetime', 'guid', 'full_text', 'word_count' and
            'has_full_text'. Entries without a link are skipped. If an error
            occurs, the articles collected so far are returned.
        """
        articles = []

        if verbose:
            print(f"Fetching RSS feed for {ticker}...")

        # A negative limit used as a slice bound would select "all but the
        # last N" entries, so non-positive limits are handled explicitly.
        if max_articles is not None and max_articles <= 0:
            return articles

        try:
            symbol = ticker.strip().upper()

            # Get RSS feed
            feed = feedparser.parse(self.base_rss_url.format(symbol))

            if not feed.entries:
                if verbose:
                    print(f"  ⚠ No articles found for {ticker}")
                return articles

            # Slicing with None keeps every entry (no limit)
            entries = feed.entries[:max_articles]
            total = len(entries)

            if verbose:
                print(f"Found {len(feed.entries)} articles in RSS feed")
                print(f"Fetching full content for up to {max_articles} articles...\n")

            for i, entry in enumerate(entries, 1):
                article_url = entry.get('link', '').strip()
                title = entry.get('title', '').strip()

                if not article_url:
                    continue

                if verbose:
                    title_short = title[:60] + "..." if len(title) > 60 else title
                    print(f"[{i}/{total}] Fetching: {title_short}...")

                # Extract full article text (None when extraction failed)
                full_text = self._extract_article_text(article_url)
                word_count = len(full_text.split()) if full_text else 0

                articles.append({
                    'ticker': symbol,
                    'title': title,
                    'link': article_url,
                    'rss_description': entry.get('summary', '').strip(),
                    'published': entry.get('published', ''),
                    'published_datetime': self._parse_published(entry),
                    'guid': entry.get('guid', ''),
                    'full_text': full_text or '',
                    'word_count': word_count,
                    'has_full_text': bool(full_text),
                })

                if verbose and full_text:
                    print(f"    ✓ Retrieved {word_count} words")
                elif verbose:
                    print(f"    ⚠ Could not extract content")

            return articles

        except Exception as e:
            # Best-effort: report the problem and return what was collected
            if verbose:
                print(f"  ✗ Error: {e}")
            return articles

# ============================================
# USAGE EXAMPLE
# ============================================

# Build a scraper and pull the first three AAPL articles from the feed
scraper = YahooFinanceFullArticleScraper()
articles = scraper.get_full_articles_for_ticker('AAPL', max_articles=3)

# Banner for the results section
print("\n" + "="*80, "FULL ARTICLE CONTENT", "="*80, sep="\n")

# One summary per article, with a preview capped at 200 characters
for i, article in enumerate(articles, start=1):
    print(
        f"\nArticle {i}: {article['title']}",
        f"Published: {article['published']}",
        f"Word Count: {article['word_count']}",
        f"Has Full Text: {article['has_full_text']}",
        sep="\n",
    )
    if article['full_text']:
        if len(article['full_text']) > 200:
            preview = article['full_text'][:200] + "..."
        else:
            preview = article['full_text']
        print(f"\nPreview:\n{preview}")
    print("-"*80)


Fetching RSS feed for AAPL...
Found 20 articles in RSS feed
Fetching full content for up to 3 articles...

[1/3] Fetching: Nvidia’s results ease concerns over AI boom...
    ✓ Retrieved 618 words
[2/3] Fetching: Apple Stock Has Made Investors Rich for 20 Years — What Happ......
    ✓ Retrieved 622 words
[3/3] Fetching: Nvidia's earnings attest to its leadership in the AI race. B......
    ✓ Retrieved 419 words

FULL ARTICLE CONTENT

Article 1: Nvidia’s results ease concerns over AI boom
Published: Wed, 19 Nov 2025 22:23:41 +0000
Word Count: 618
Has Full Text: True

Preview:
Nvidia’s results ease concerns over AI boom Michael Liedtke, Associated Pres Wed 19 November 2025 at 5:23 pm GMT-5 3 min read NVDA AAPL Nvidia’s sales of the computing chips powering artificial intell...
--------------------------------------------------------------------------------

Article 2: Apple Stock Has Made Investors Rich for 20 Years — What Happens Next?
Published: Wed, 19 Nov 2025 21:55:08 +0000
Word Coun

In [23]:
# ============================================
# QUICK USAGE: Extract Full Article Content
# ============================================

# Create a fresh instance of the scraper class defined in an earlier cell
scraper = YahooFinanceFullArticleScraper()

# Fetch up to ten NVDA articles with their full text
articles = scraper.get_full_articles_for_ticker('NVDA', max_articles=10)

# Print each article's metadata followed by at most 500 characters of text
for article in articles:
    print(f"\n{'='*80}")
    print(f"Title: {article['title']}")
    print(f"Word Count: {article['word_count']}")
    print(f"Link: {article['link']}")
    if len(article['full_text']) > 500:
        print(f"\nFull Text:\n{article['full_text'][:500]}...")
    else:
        print(f"\nFull Text:\n{article['full_text']}")
    print(f"{'='*80}")

Fetching RSS feed for NVDA...
Found 20 articles in RSS feed
Fetching full content for up to 10 articles...

[1/10] Fetching: Circle, Bitcoin Treasuries Lead Crypto Stock Losses Amid Bit......
    ✓ Retrieved 749 words
[2/10] Fetching: Dow Jones Futures Rise As AI Giant Nvidia Jumps On Earnings;......
    ✓ Retrieved 99 words
[3/10] Fetching: Global Equities Poised for a Complex and Uncertain 2026, Acc......
    ✓ Retrieved 468 words
[4/10] Fetching: Nvidia CEO Huang Says There's No Diversion of Chips Overseas...
    ✓ Retrieved 91 words
[5/10] Fetching: Trump Plans ‘Genesis Mission’ to Boost US AI Development...
    ✓ Retrieved 810 words
[6/10] Fetching: Nvidia CEO Huang on Blackwell Sales, Vera Rubin and China...
    ✓ Retrieved 97 words
[7/10] Fetching: Nvidia rescues Bitcoin after blockbuster earnings...
    ⚠ Could not extract content
[8/10] Fetching: JGBs Fall, Tracking Declines in U.S. Treasurys...
    ✓ Retrieved 71 words
[9/10] Fetching: Nvidia Stock in an AI Bubble? The AI Gia

In [25]:
# Inspect the complete extracted text of the article at index 8 (the ninth
# entry) of the current `articles` list. Left as a bare expression so the
# notebook shows its value as the cell output.
articles[8]['full_text']

'Shares of Nvidia ( NVDA +2.85% ) are up 4.5% in Wednesday\'s after-hours trading as of 5:57 p.m. ET, following the artificial intelligence (AI) tech leader\'s release of its report for its third quarter of fiscal 2026 (ended Oct. 26, 2025). Investors\' positive reaction is attributable to Q3 revenue and adjusted earnings per share both beating Wall Street\'s estimates, and Q4 guidance for both the top and bottom lines also coming in higher than analysts had expected. The strong guidance was probably the most significant catalyst for Nvidia stock\'s upward move. It\'s an indication that the outlook for the AI market in general remains robust. In recent weeks, Nvidia and other AI stocks have been struggling. Investors have become increasingly concerned about AI stock valuations, fearing that an AI stock bubble may be forming. These struggles were in part caused by the Nov. 4 revelation that Michael Burry, now a former hedge fund manager, took bearish positions (by buying put options) on