<a href="https://colab.research.google.com/github/atharvavyas1/Finance-N8N-project/blob/main/YahooFinanceRSS_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
pip install feedparser pandas



In [21]:
import feedparser
import pandas as pd
from datetime import datetime
import json
import time

class YahooFinanceRSSScraper:
    """
    Scrape Yahoo Finance news headlines through the public per-ticker RSS feed.

    The feed URL is built from ``base_rss_url`` and requires no authentication.
    Every article is a plain dict with the keys 'ticker', 'title', 'link',
    'description', 'guid', 'published', 'published_datetime',
    'published_formatted' and 'scraped_at'.
    """

    # Timestamp layout shared by 'published_formatted', 'scraped_at' and JSON output
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self):
        # '{}' is replaced with the upper-cased ticker symbol
        self.base_rss_url = "https://feeds.finance.yahoo.com/rss/2.0/headline?s={}&region=US&lang=en-US"

    @staticmethod
    def _entry_to_article(entry, ticker, scraped_at):
        """
        Convert one parsed feed entry into an article dict.

        Args:
            entry: Mapping-like feed entry (anything supporting ``.get``)
            ticker (str): Ticker symbol the feed was requested for
            scraped_at (str): Pre-formatted timestamp of the fetch

        Returns:
            dict: Article with the keys listed in the class docstring
        """
        published = entry.get('published', '')
        article = {
            'ticker': ticker.upper(),
            'title': entry.get('title', '').strip(),
            'link': entry.get('link', '').strip(),
            'description': entry.get('summary', '').strip(),
            'guid': entry.get('guid', '').strip(),
            'published': published,
            # Defaults used when the entry has no usable parsed date
            'published_datetime': None,
            'published_formatted': published,
        }

        published_parsed = entry.get('published_parsed')
        if published_parsed:
            try:
                # feedparser normalizes parsed dates to UTC, so this is a
                # naive datetime expressed in UTC (not local time).
                parsed = datetime(*published_parsed[:6])
                formatted = parsed.strftime(YahooFinanceRSSScraper.TIMESTAMP_FORMAT)
            except (TypeError, ValueError, OverflowError):
                # Malformed date tuple: keep the raw 'published' string instead
                pass
            else:
                article['published_datetime'] = parsed
                article['published_formatted'] = formatted

        # Scraping metadata goes last to keep a stable key/column order
        article['scraped_at'] = scraped_at
        return article

    def get_news_for_ticker(self, ticker, verbose=True):
        """
        Get news for a single ticker symbol

        Args:
            ticker (str): Stock ticker symbol (e.g., 'AAPL')
            verbose (bool): Print status messages

        Returns:
            list: List of article dicts; empty when the feed has no entries
                or fetching fails (errors are printed, never raised)
        """
        articles = []

        if verbose:
            print(f"Fetching RSS feed for {ticker}...")

        try:
            # Construct the RSS feed URL and parse it
            feed_url = self.base_rss_url.format(ticker.upper())
            feed = feedparser.parse(feed_url)

            # bozo is set when the feed was not well-formed; entries may
            # still be present, so this is only a warning
            if feed.bozo:
                print(f"  Warning: Feed parsing issues for {ticker} - {feed.bozo_exception}")

            if feed.entries:
                # One timestamp per fetch: it is invariant across the entries
                scraped_at = datetime.now().strftime(self.TIMESTAMP_FORMAT)

                # Append one by one so articles converted before an
                # unexpected failure are still returned
                for entry in feed.entries:
                    articles.append(self._entry_to_article(entry, ticker, scraped_at))

                if verbose:
                    print(f"  ✓ Found {len(feed.entries)} articles for {ticker}")
            else:
                if verbose:
                    print(f"  ⚠ No articles found for {ticker}")

        except Exception as e:
            # Best-effort boundary: report the problem and return what we have
            print(f"  ✗ Error fetching RSS for {ticker}: {e}")

        return articles

    def get_news_for_multiple_tickers(self, tickers, delay=0.5, verbose=True):
        """
        Get news for multiple ticker symbols

        Args:
            tickers (list | str): List of ticker symbols (a single string is
                treated as a one-element list)
            delay (float): Delay in seconds between requests
            verbose (bool): Print per-ticker status messages

        Returns:
            tuple: ``(all_news, all_articles)`` where ``all_news`` is a dict
                mapping upper-cased tickers to their article lists and
                ``all_articles`` is one flat list of every article
        """
        all_news = {}
        all_articles = []

        # Convert single ticker to list
        if isinstance(tickers, str):
            tickers = [tickers]

        print(f"\nFetching news for {len(tickers)} tickers...")
        print("=" * 50)

        for i, ticker in enumerate(tickers, 1):
            # Get news for this ticker
            articles = self.get_news_for_ticker(ticker, verbose)

            # Store results
            all_news[ticker.upper()] = articles
            all_articles.extend(articles)

            # Add delay between requests (except for last ticker)
            if i < len(tickers) and delay > 0:
                time.sleep(delay)

        # Print summary
        print("=" * 50)
        print("\nSummary:")
        print(f"  Total tickers processed: {len(tickers)}")
        print(f"  Total articles found: {len(all_articles)}")

        for ticker, articles in all_news.items():
            print(f"  {ticker}: {len(articles)} articles")

        return all_news, all_articles

    def save_to_csv(self, articles, filename='yahoo_rss_news.csv'):
        """
        Save articles to CSV file

        Args:
            articles (list): List of article dicts
            filename (str): Destination path of the CSV file

        Returns:
            pandas.DataFrame | None: The saved frame, or None when there was
                nothing to save
        """
        if not articles:
            print("No articles to save")
            return None

        # Convert to DataFrame
        df = pd.DataFrame(articles)

        # Reorder columns for better readability
        columns_order = ['ticker', 'title', 'description', 'published_formatted',
                         'link', 'published', 'published_datetime', 'guid', 'scraped_at']

        # Only include columns that exist
        columns_to_save = [col for col in columns_order if col in df.columns]
        df = df[columns_to_save]

        # Save to CSV
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"\n✓ Saved {len(articles)} articles to {filename}")

        return df

    def save_to_json(self, data, filename='yahoo_rss_news.json'):
        """
        Save articles to JSON file

        Args:
            data (list | dict): Either a flat list of articles or a dict
                mapping tickers to article lists
            filename (str): Destination path of the JSON file
        """
        if not data:
            print("No data to save")
            return

        # Convert datetime objects to strings for JSON serialization;
        # anything else json cannot encode falls back to str()
        def json_serializer(obj):
            if isinstance(obj, datetime):
                return obj.strftime(self.TIMESTAMP_FORMAT)
            return str(obj)

        # Save to JSON
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=json_serializer)

        # Only a dict holds per-ticker lists; every other container is
        # counted directly so tuples and similar inputs do not crash here
        if isinstance(data, dict):
            article_count = sum(len(v) for v in data.values())
        else:
            article_count = len(data)
        print(f"✓ Saved {article_count} articles to {filename}")

    def display_articles(self, articles, max_display=5, truncate_desc=True, max_desc_length=None):
        """
        Display articles in a readable format

        Args:
            articles (list): List of articles to display
            max_display (int): Maximum number of articles to show
            truncate_desc (bool): Whether to truncate descriptions; also
                controls shortening of links longer than 80 characters
            max_desc_length (int): Maximum description length (None for no limit)
        """
        if not articles:
            print("No articles to display")
            return

        print(f"\nDisplaying {min(len(articles), max_display)} of {len(articles)} articles:")
        print("=" * 80)

        for i, article in enumerate(articles[:max_display], 1):
            print(f"\n{i}. {article['title']}")
            print(f"   Ticker: {article['ticker']}")
            print(f"   Published: {article.get('published_formatted', article.get('published', 'Unknown'))}")

            # Handle description display (skipped entirely when empty)
            desc = article.get('description', '')
            if desc:
                if truncate_desc and max_desc_length and len(desc) > max_desc_length:
                    desc = desc[:max_desc_length] + "..."
                print(f"   Summary: {desc}")

            # Handle link display
            link = article['link']
            if truncate_desc and len(link) > 80:
                print(f"   Link: {link[:80]}...")
            else:
                print(f"   Link: {link}")

        print("=" * 80)


# ====================
# EXAMPLE USAGE
# ====================

def main():
    """
    Main function to demonstrate usage

    Runs four examples against the live RSS feed (single ticker, multiple
    tickers with CSV/JSON export, a last-24-hours filter, and a title keyword
    search), then prints summary statistics and attempts a best-effort Excel
    export to 'yahoo_news_summary.xlsx'.
    """
    # Only this demo needs these names, so they are imported locally
    from datetime import timedelta, timezone

    def section(title):
        # Banner printed before every example
        print("\n" + "=" * 50)
        print(title)
        print("=" * 50)

    # Initialize the scraper
    scraper = YahooFinanceRSSScraper()

    # Example 1: Single ticker
    section("EXAMPLE 1: Single Ticker (AAPL)")

    apple_news = scraper.get_news_for_ticker('AAPL')
    # Display with full descriptions (no truncation)
    scraper.display_articles(apple_news, max_display=3, truncate_desc=False)

    # Example 2: Multiple tickers
    section("EXAMPLE 2: Multiple Tickers")

    tickers = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN', 'META', 'NVDA']
    all_news, all_articles = scraper.get_news_for_multiple_tickers(tickers)

    # Save results
    print("\nSaving results...")
    scraper.save_to_csv(all_articles)
    scraper.save_to_json(all_news)

    # Example 3: Filter by date
    section("EXAMPLE 3: Recent Articles (Last 24 hours)")

    # 'published_datetime' is built from feedparser's parsed date tuple, which
    # feedparser normalizes to UTC, so the cutoff must be naive UTC as well.
    # Comparing against local time would skew the window by the UTC offset.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    cutoff_date = now_utc - timedelta(days=1)

    recent_articles = [
        article for article in all_articles
        if article.get('published_datetime') and article['published_datetime'] > cutoff_date
    ]

    print(f"Found {len(recent_articles)} articles from the last 24 hours")
    # Display recent articles with full content
    scraper.display_articles(recent_articles, max_display=3, truncate_desc=False)

    # Example 4: Search for specific keywords in titles
    section("EXAMPLE 4: Keyword Search")

    keyword = "earnings"
    needle = keyword.lower()  # lower-case once instead of once per article
    matching_articles = [
        article for article in all_articles
        if needle in article['title'].lower()
    ]

    print(f"Found {len(matching_articles)} articles containing '{keyword}'")
    # Display matching articles with partial truncation
    scraper.display_articles(matching_articles, max_display=3, max_desc_length=300)

    # Create a summary DataFrame
    section("Creating Summary Report...")

    if all_articles:
        df = pd.DataFrame(all_articles)

        # Summary statistics
        print("\nNews Summary Statistics:")
        print(f"  Total articles: {len(df)}")
        print(f"  Unique tickers: {df['ticker'].nunique()}")
        print("\nArticles per ticker:")
        print(df['ticker'].value_counts())

        # Save summary to Excel (optional)
        try:
            with pd.ExcelWriter('yahoo_news_summary.xlsx') as writer:
                df.to_excel(writer, sheet_name='All News', index=False)
                df['ticker'].value_counts().to_excel(writer, sheet_name='Summary')
            print("\n✓ Saved Excel summary to yahoo_news_summary.xlsx")
        except Exception as exc:
            # The export stays best-effort (e.g. no Excel engine installed),
            # but the reason is reported instead of silently discarded
            print(f"\n⚠ Skipped Excel summary: {exc}")


if __name__ == "__main__":
    # Walk through every example first
    main()

    # Then print a copy-paste snippet for people who want to use the class
    # from their own code
    rule = "=" * 50
    quick_start = """
# Minimal code to get started:
from yahoo_rss_scraper import YahooFinanceRSSScraper

scraper = YahooFinanceRSSScraper()

# Single ticker
news = scraper.get_news_for_ticker('AAPL')

# Multiple tickers
tickers = ['AAPL', 'GOOGL', 'MSFT']
all_news, all_articles = scraper.get_news_for_multiple_tickers(tickers)

# Save to files
scraper.save_to_csv(all_articles)
scraper.save_to_json(all_news)
    """

    print("\n" + rule)
    print("QUICK START CODE:")
    print(rule)
    print(quick_start)


EXAMPLE 1: Single Ticker (AAPL)
Fetching RSS feed for AAPL...
  ✓ Found 20 articles for AAPL

Displaying 3 of 20 articles:

1. Huge AI Deals Keep Markets at Record Highs
   Ticker: AAPL
   Published: 2025-09-22 22:18:00
   Summary: NVIDIA is investing $100 billion into ChatGPT-parent OpenAI's data centers.
   Link: https://finance.yahoo.com/news/huge-ai-deals-keep-markets-221800916.html?.tsrc=rss

2. Tigress Financial Partners Raises Its Price Target on Apple Inc. (AAPL) to $305
   Ticker: AAPL
   Published: 2025-09-22 22:06:04
   Summary: Apple Inc. (NASDAQ:AAPL) is one of the 13 Best Virtual Reality Stocks to Buy Right Now. On September 17, 2025, Tigress Financial Partners raised its price target on Apple Inc. (NASDAQ:AAPL) to $305, keeping a ‘Strong Buy’ rating. The bullish stance stems from the company’s accelerating services growth, aggressive AI innovation, and stronger U.S. supply […]
   Link: https://finance.yahoo.com/news/tigress-financial-partners-raises-price-220604882.html

In [23]:
# Import the required libraries
import feedparser
import pandas as pd
from datetime import datetime
import json
import time

# Copy the YahooFinanceRSSScraper class from the file
# (or save the file as yahoo_rss_scraper.py and import it)

# Build a scraper and pull the current AAPL headlines from the RSS feed
scraper = YahooFinanceRSSScraper()
apple_news = scraper.get_news_for_ticker('AAPL')

# Built-in pretty printer: at most ten articles, nothing truncated
scraper.display_articles(apple_news, max_display=10, truncate_desc=False)

# Hand-rolled listing, useful as a template for custom layouts
heavy_rule = "=" * 80
light_rule = "-" * 80

print(f"\nFound {len(apple_news)} articles for AAPL\n")
print(heavy_rule)

for position, article in enumerate(apple_news, start=1):
    print(f"\nArticle {position}:")
    print(f"Title: {article['title']}")
    # Prefer the formatted date, falling back to the raw feed value
    published = article.get('published_formatted', article['published'])
    print(f"Published: {published}")
    print(f"Description: {article['description']}")
    print(f"Link: {article['link']}")
    print(light_rule)

# Persist the results: CSV for spreadsheets, JSON for programs
scraper.save_to_csv(apple_news, 'aapl_news.csv')
scraper.save_to_json(apple_news, 'aapl_news.json')

# The raw data is just a list of dicts. Each article carries the keys
# 'ticker', 'title', 'link', 'description', 'guid',
# 'published', 'published_datetime', 'published_formatted', 'scraped_at'
for article in apple_news:
    print(article['title'])

Fetching RSS feed for AAPL...
  ✓ Found 20 articles for AAPL

Displaying 10 of 20 articles:

1. Huge AI Deals Keep Markets at Record Highs
   Ticker: AAPL
   Published: 2025-09-22 22:18:00
   Summary: NVIDIA is investing $100 billion into ChatGPT-parent OpenAI's data centers.
   Link: https://finance.yahoo.com/news/huge-ai-deals-keep-markets-221800916.html?.tsrc=rss

2. Tigress Financial Partners Raises Its Price Target on Apple Inc. (AAPL) to $305
   Ticker: AAPL
   Published: 2025-09-22 22:06:04
   Summary: Apple Inc. (NASDAQ:AAPL) is one of the 13 Best Virtual Reality Stocks to Buy Right Now. On September 17, 2025, Tigress Financial Partners raised its price target on Apple Inc. (NASDAQ:AAPL) to $305, keeping a ‘Strong Buy’ rating. The bullish stance stems from the company’s accelerating services growth, aggressive AI innovation, and stronger U.S. supply […]
   Link: https://finance.yahoo.com/news/tigress-financial-partners-raises-price-220604882.html?.tsrc=rss

3. UBS Reiterates Ne


Fetching RSS feed for AAPL...
Found 20 articles in RSS feed
Fetching full content for up to 3 articles...

[1/3] Fetching: Huge AI Deals Keep Markets at Record Highs...
    ✓ Retrieved 671 words
[2/3] Fetching: Tigress Financial Partners Raises Its Price Target on Apple ...
    ✓ Retrieved 213 words
[3/3] Fetching: UBS Reiterates Neutral on Apple (AAPL), Sees Mixed iPhone 17...
    ✓ Retrieved 249 words

FULL ARTICLE CONTENT

Article 1: Huge AI Deals Keep Markets at Record Highs
Published: Mon, 22 Sep 2025 22:18:00 +0000
Word Count: 671
Has Full Text: True

Preview:
Monday, September 22, 2025We started out this opening trading session for the week looking one way — pending reports on home sales, services PMI, durable goods, etc. — and ended it looking quite the other: tech, AI investment and iPhone sales growth. The more things change, the more they stay the same…The Dow grew a modest +66 points, +0.14% today, bested by the S&P 500’s +29 points, +0.44%, the Nasdaq +155 points, +0.69%,

In [27]:
# NOTE: YahooFinanceFullArticleScraper is not defined in the visible cells;
# it must already exist in the notebook session before this cell runs.
scraper = YahooFinanceFullArticleScraper()

# Request full articles for NVDA, capped at 10 articles
articles = scraper.get_full_articles_for_ticker('NVDA', max_articles=10)

# Print the title, the complete article body and its word count, in that order
for article in articles:
    for label, key in (
        ("Title", 'title'),
        ("Full Text", 'full_text'),  # <-- the full article body
        ("Word Count", 'word_count'),
    ):
        print(f"{label}: {article[key]}")


Fetching RSS feed for NVDA...
Found 20 articles in RSS feed
Fetching full content for up to 10 articles...

[1/10] Fetching: Dow Jones Futures: Nvidia, Tesla, Apple Are Big Winners; Fed...
    ✓ Retrieved 20 words
[2/10] Fetching: How major US stock indexes fared Monday, 9/22/2025...
    ✓ Retrieved 187 words
[3/10] Fetching: Nvidia Says All Customers Will Be ‘Priority’ Despite OpenAI ...
    ✓ Retrieved 389 words
[4/10] Fetching: Stock market today: Dow, S&P 500, Nasdaq futures waver after...
    ✓ Retrieved 283 words
[5/10] Fetching: Nvidia Commits $100 Billion to OpenAI in Historic AI Infrast...
    ✓ Retrieved 498 words
[6/10] Fetching: Super Micro Computer Inc. Began Shipping Systems Powered by ...
    ✓ Retrieved 222 words
[7/10] Fetching: Huge AI Deals Keep Markets at Record Highs...
    ✓ Retrieved 671 words
[8/10] Fetching: Nvidia, Abu Dhabi Unveil First AI Robotics Lab...
    ✓ Retrieved 228 words
[9/10] Fetching: Iren Bets $670 Million On Nvidia, AMD Chips For AI Cloud...
 