<a href="https://colab.research.google.com/github/atharvavyas1/Finance-N8N-project/blob/main/YahooFinanceRSS_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# ============================================
# FULL ARTICLE CONTENT EXTRACTION
# Extends RSS scraper to fetch complete article text
# ============================================

import logging
import re
from datetime import datetime

import feedparser
import requests
from bs4 import BeautifulSoup

class YahooFinanceFullArticleScraper:
    """
    Extracts full article content from Yahoo Finance RSS feeds.

    The RSS feed for a ticker is used for article discovery; every article
    URL found there is then downloaded and its main text is extracted with
    BeautifulSoup.

    Attributes:
        base_rss_url (str): RSS URL template; ``{}`` receives the ticker.
        headers (dict): HTTP headers sent when downloading article pages.
    """

    # Status glyphs written as escapes so the source stays pure ASCII and
    # cannot be corrupted by an editor/export using the wrong encoding.
    _WARN = "\u26a0"        # warning sign
    _OK = "\u2713"          # check mark
    _FAIL = "\u2717"        # ballot X
    _CHART = "\U0001F4CA"   # bar chart

    # CSS selectors tried in order; the first one that matches wins.
    # 'main' is the last-resort fallback.
    _CONTENT_SELECTORS = (
        'article',
        '[data-module="ArticleBody"]',
        '.caas-body',
        '.article-body',
        '[class*="article"]',
        '[class*="content"]',
        'main',
    )

    # Elements removed from the matched container before text extraction.
    _NOISE_TAGS = ("script", "style", "nav", "footer", "header")

    def __init__(self):
        self.base_rss_url = "https://feeds.finance.yahoo.com/rss/2.0/headline?s={}&region=US&lang=en-US"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    def _extract_article_text(self, url):
        """
        Extract full article text from an article URL.

        Args:
            url (str): Address of the article page.

        Returns:
            str | None: Whitespace-normalized text of the first matching
            content container, or None when the page could not be fetched,
            parsed, or no container matched.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            article_content = None
            for selector in self._CONTENT_SELECTORS:
                article_content = soup.select_one(selector)
                if article_content:
                    break

            if not article_content:
                return None

            # Drop non-content elements so they do not pollute the text
            for noise in article_content.find_all(list(self._NOISE_TAGS)):
                noise.decompose()

            text = article_content.get_text(separator=' ', strip=True)
            # Collapse runs of whitespace into single spaces
            return re.sub(r'\s+', ' ', text).strip()

        except Exception as exc:
            # Deliberately best-effort: one bad article must not abort the
            # whole run. The reason is logged instead of being discarded.
            logging.getLogger(__name__).debug(
                "Could not extract article text from %s: %s", url, exc
            )
            return None

    def _fetch_feed_entries(self, ticker):
        """Return the RSS entries for an already-normalized ticker symbol."""
        feed = feedparser.parse(self.base_rss_url.format(ticker))
        return feed.entries

    @staticmethod
    def _parse_published_datetime(entry):
        """
        Convert an entry's ``published_parsed`` value into a datetime.

        Returns:
            datetime | None: Naive datetime built from the first six fields
            (feedparser documents parsed dates as UTC -- confirm before
            mixing with local times), or None when missing or invalid.
        """
        parsed = entry.get('published_parsed')
        if not parsed:
            return None
        try:
            return datetime(*parsed[:6])
        except (TypeError, ValueError):
            return None

    def get_full_articles_for_ticker(self, ticker, max_articles=10, verbose=True, min_words=150):
        """
        Get full article content for a ticker symbol.

        Entries are processed in feed order until ``max_articles`` articles
        with strictly more than ``min_words`` words have been collected.

        Args:
            ticker (str): Stock ticker symbol
            max_articles (int): Maximum number of articles to return
            verbose (bool): Print progress messages
            min_words (int): Articles with this many words or fewer are
                skipped (default 150)

        Returns:
            list: One dict per kept article with the keys ``ticker``,
            ``title``, ``link``, ``rss_description``, ``published``,
            ``published_datetime``, ``guid``, ``full_text``, ``word_count``
            and ``has_full_text``. On an unexpected error the articles
            collected so far are returned.
        """
        articles = []

        if verbose:
            print(f"Fetching RSS feed for {ticker}...")

        try:
            symbol = ticker.strip().upper()
            entries = self._fetch_feed_entries(symbol)

            if not entries:
                if verbose:
                    print(f"  {self._WARN} No articles found for {ticker}")
                return articles

            if verbose:
                print(f"Found {len(entries)} articles in RSS feed")
                print(f"Fetching full content (filtering for articles >{min_words} words)...\n")

            articles_processed = 0
            articles_skipped = 0
            articles_failed = 0

            for entry in entries:
                # Stop if we have enough articles
                if len(articles) >= max_articles:
                    break

                articles_processed += 1
                # `or ''` guards against fields that are present but None
                article_url = (entry.get('link') or '').strip()
                title = (entry.get('title') or '').strip()

                if not article_url:
                    continue

                if verbose:
                    title_short = title[:60] + "..." if len(title) > 60 else title
                    print(f"[{articles_processed}] Fetching: {title_short}...")

                full_text = self._extract_article_text(article_url)

                # A failed download is reported as such rather than being
                # disguised as a "0 words" article.
                if not full_text:
                    articles_failed += 1
                    if verbose:
                        print(f"    {self._WARN} Could not extract content")
                    continue

                word_count = len(full_text.split())

                # Filter: only keep articles with more than min_words words
                if word_count <= min_words:
                    articles_skipped += 1
                    if verbose:
                        print(f"    {self._WARN} Skipped: {word_count} words (minimum {min_words} required)")
                    continue

                articles.append({
                    'ticker': symbol,
                    'title': title,
                    'link': article_url,
                    'rss_description': (entry.get('summary') or '').strip(),
                    'published': entry.get('published', ''),
                    'published_datetime': self._parse_published_datetime(entry),
                    'guid': entry.get('guid', ''),
                    'full_text': full_text,
                    'word_count': word_count,
                    # Kept for backward compatibility; always True here
                    'has_full_text': bool(full_text),
                })

                if verbose:
                    print(f"    {self._OK} Retrieved {word_count} words")

            # Summary
            if verbose:
                print(f"\n{self._CHART} Summary:")
                print(f"   Articles processed: {articles_processed}")
                print(f"   Articles skipped (<{min_words} words): {articles_skipped}")
                if articles_failed:
                    print(f"   Articles without extractable content: {articles_failed}")
                print(f"   Articles returned: {len(articles)}")

            return articles

        except Exception as e:
            # Top-level boundary: report the problem and hand back whatever
            # was collected before it occurred.
            if verbose:
                print(f"  {self._FAIL} Error: {e}")
            return articles

# ============================================
# USAGE EXAMPLE
# ============================================

# Build a scraper instance
scraper = YahooFinanceFullArticleScraper()

# Pull up to five full-length articles for AAPL
articles = scraper.get_full_articles_for_ticker('AAPL', max_articles=5)

# Print a header, then one summary block per article
banner = "=" * 80
print("\n" + banner)
print("FULL ARTICLE CONTENT")
print(banner)

for position, item in enumerate(articles, start=1):
    print(f"\nArticle {position}: {item['title']}")
    print(f"Published: {item['published']}")
    print(f"Word Count: {item['word_count']}")
    print(f"Has Full Text: {item['has_full_text']}")
    body = item['full_text']
    if body:
        # Show at most the first 200 characters of the article body
        if len(body) > 200:
            preview = body[:200] + "..."
        else:
            preview = body
        print(f"\nPreview:\n{preview}")
    print("-" * 80)

Fetching RSS feed for AAPL...
Found 20 articles in RSS feed
Fetching full content (filtering for articles >150 words)...

[1] Fetching: Nvidia’s results ease concerns over AI boom...
    ✓ Retrieved 618 words
[2] Fetching: Apple Stock Has Made Investors Rich for 20 Years — What Happ......
    ✓ Retrieved 622 words
[3] Fetching: Nvidia's earnings attest to its leadership in the AI race. B......
    ✓ Retrieved 419 words
[4] Fetching: Stocks steadier before key Nvidia results as oil slides...
    ✓ Retrieved 655 words
[5] Fetching: Alphabet Stock Rises, but Falls Short of Passing Microsoft i......
    ⚠ Skipped: 148 words (minimum 150 required)
[6] Fetching: Apple Is Making Huge China Market Share Wins. Does That Make......
    ✓ Retrieved 722 words

📊 Summary:
   Articles processed: 6
   Articles skipped (<150 words): 1
   Articles returned: 5

FULL ARTICLE CONTENT

Article 1: Nvidia’s results ease concerns over AI boom
Published: Wed, 19 Nov 2025 22:23:41 +0000
Wor

In [28]:
# ============================================
# QUICK USAGE: Extract Full Article Content
# ============================================

# Create a scraper (the class is defined in an earlier cell)
scraper = YahooFinanceFullArticleScraper()

# Fetch up to ten full-length articles for NVDA
articles = scraper.get_full_articles_for_ticker('NVDA', max_articles=10)

# Print each article, limiting the body to its first 500 characters
for article in articles:
    print(f"\n{'='*80}")
    print(f"Title: {article['title']}")
    print(f"Word Count: {article['word_count']}")
    print(f"Link: {article['link']}")
    if len(article['full_text']) > 500:
        body_block = f"\nFull Text:\n{article['full_text'][:500]}..."
    else:
        body_block = f"\nFull Text:\n{article['full_text']}"
    print(body_block)
    print(f"{'='*80}")

Fetching RSS feed for NVDA...
Found 20 articles in RSS feed
Fetching full content (filtering for articles >150 words)...

[1] Fetching: Asian Stocks Get a Lift From Record Nvidia Sales...
    ⚠ Skipped: 84 words (minimum 150 required)
[2] Fetching: Dow Jones Futures Rise As AI Giant Nvidia Jumps On Earnings;......
    ⚠ Skipped: 96 words (minimum 150 required)
[3] Fetching: Circle, Bitcoin Treasuries Lead Crypto Stock Losses Amid Bit......
    ✓ Retrieved 749 words
[4] Fetching: Global Equities Poised for a Complex and Uncertain 2026, Acc......
    ✓ Retrieved 468 words
[5] Fetching: Nvidia CEO Huang Says There's No Diversion of Chips Overseas...
    ⚠ Skipped: 91 words (minimum 150 required)
[6] Fetching: Trump Plans ‘Genesis Mission’ to Boost US AI Development...
    ✓ Retrieved 810 words
[7] Fetching: Nvidia CEO Huang on Blackwell Sales, Vera Rubin and China...
    ⚠ Skipped: 97 words (minimum 150 required)
[8] Fetching: Nvidia rescues Bitcoin after blockbuster ear

In [29]:
# Display the complete extracted text of the ninth returned article (index 8)
ninth_article = articles[8]
ninth_article['full_text']

'Nvidia earnings: Wall Street sighs with relief after AI wave doesn’t crash Edward Helmore Wed, November 19, 2025 at 6:47 PM EST 4 min read NVDA PLTR 9984.T MSFT AMZN Jensen Huang interacts with fans during TSMC’s annual sports day in Hsinchu, Taiwan, on 8 November. Photograph: Ann Wang/Reuters Markets expectations around Wednesday’s quarterly earnings report by the most valuable publicly traded company in the world had risen to a fever pitch. Anxiety over billions in investment in artificial intelligence pervaded, in part because the US has been starved of reliable economic data by the recent government shutdown. Investors hoped that both questions would be in part answered by Nvidia’s earnings and by a jobs report due on Thursday morning. “This is a ‘So goes Nvidia, so goes the market’ kind of report,” Scott Martin, chief investment officer at Kingsview Wealth Management, told Bloomberg in a concise summary of market sentiment. The prospect of a market mood swing had 