In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import Dict, List, Optional

class YahooFinanceAnalysisScraper:
    """Scraper for the analyst tables on a Yahoo Finance "analysis" page.

    One ``requests.Session`` carrying browser-like headers is created per
    instance and reused for every request the instance makes.
    """

    # Maps the ``data-testid`` of each <section> that is looked up onto the
    # key under which its table is returned. Insertion order is lookup order.
    _SECTION_KEYS = {
        'earningsEstimate': 'earnings_estimate',
        'earningsHistory': 'earnings_history',
        'revenueEstimate': 'revenueEstimate',
        'epsRevisions': 'epsRevisions',
        'epsTrend': 'epsTrend',
        'growthEstimate': 'growthEstimate',
    }

    def __init__(self):
        """Create the HTTP session and install the browser-like headers."""
        # Set up headers to mimic a real browser request
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def scrape_ticker_analysis(self, ticker: str) -> Dict[str, pd.DataFrame]:
        """
        Scrapes analysis data for a given stock ticker.

        Every section listed in ``_SECTION_KEYS`` is looked up on the page;
        sections that are missing, or whose table cannot be extracted, are
        left out of the result.

        Args:
            ticker (str): Stock ticker symbol (e.g., 'PG', 'AAPL')

        Returns:
            Dict[str, pd.DataFrame]: Dictionary containing scraped tables.
            An empty dictionary is returned (and the error printed) when the
            request fails or the page cannot be processed.
        """
        url = f"https://uk.finance.yahoo.com/quote/{ticker}/analysis/"

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            tables_data = {}
            for test_id, key in self._SECTION_KEYS.items():
                section = soup.find('section', {'data-testid': test_id})
                if section is None:
                    continue
                table = self._extract_table_from_section(section)
                if table is not None:
                    tables_data[key] = table

            return tables_data

        except requests.RequestException as e:
            print(f"Error fetching data for {ticker}: {e}")
            return {}
        except Exception as e:
            # Best-effort boundary: a parsing problem on one ticker must not
            # abort a multi-ticker run.
            print(f"Error processing data for {ticker}: {e}")
            return {}

    def _extract_table_from_section(self, section) -> Optional[pd.DataFrame]:
        """
        Extracts table data from a section element.

        Column names come from the first row of the table's ``<thead>`` and
        data from the ``<tbody>`` rows that contain at least one ``<td>``.
        Rows shorter than the widest row are padded with ``None``; if a row is
        wider than the header, the surplus columns are named ``column_<n>``
        (1-based position) so no cell is dropped.

        Args:
            section: BeautifulSoup element containing the section

        Returns:
            pd.DataFrame or None: Extracted table data, or None when the
            section has no table, no header cells, no data rows, or when
            extraction raises (the error is printed).
        """
        try:
            table = section.find('table')
            if table is None:
                return None

            # Header cells of the first <thead> row.
            headers = []
            thead = table.find('thead')
            header_row = thead.find('tr') if thead is not None else None
            if header_row is not None:
                headers = [
                    cell.get_text(strip=True)
                    for cell in header_row.find_all(['th', 'td'])
                ]

            # Data rows. A row label marked up as <th> is kept so the row
            # stays aligned with the header; rows without any <td> are
            # skipped.
            rows_data = []
            tbody = table.find('tbody')
            if tbody is not None:
                for row in tbody.find_all('tr'):
                    cells = row.find_all(['td', 'th'])
                    if not any(cell.name == 'td' for cell in cells):
                        continue
                    rows_data.append([cell.get_text(strip=True) for cell in cells])

            if not headers or not rows_data:
                return None

            # Normalise ragged input: without this, a single row whose length
            # differs from the header makes the DataFrame constructor raise
            # and the whole table is lost.
            width = max(len(headers), max(len(row) for row in rows_data))
            columns = headers + [
                f"column_{position}"
                for position in range(len(headers) + 1, width + 1)
            ]
            padded_rows = [row + [None] * (width - len(row)) for row in rows_data]

            return pd.DataFrame(padded_rows, columns=columns)

        except Exception as e:
            print(f"Error extracting table: {e}")
            return None

    def scrape_multiple_tickers(self, tickers: List[str], delay: float = 1.0) -> Dict[str, Dict[str, pd.DataFrame]]:
        """
        Scrapes analysis data for multiple tickers.

        Args:
            tickers (List[str]): List of ticker symbols
            delay (float): Delay between requests in seconds

        Returns:
            Dict[str, Dict[str, pd.DataFrame]]: Nested dictionary with ticker -> table_name -> DataFrame.
            Tickers for which nothing was scraped are omitted.
        """
        all_data = {}

        for index, ticker in enumerate(tickers):
            # Be respectful to the server: pause between consecutive
            # requests, but not before the first or after the last one.
            if index:
                time.sleep(delay)

            print(f"Scraping data for {ticker}...")
            ticker_data = self.scrape_ticker_analysis(ticker)
            if ticker_data:
                all_data[ticker] = ticker_data
                print(f"Successfully scraped {len(ticker_data)} tables for {ticker}")
            else:
                print(f"No data found for {ticker}")

        return all_data

    def save_to_csv(self, data: Dict[str, Dict[str, pd.DataFrame]], output_dir: str = "scraped_data"):
        """
        Saves scraped data to CSV files.

        One file named ``<ticker>_<table_name>.csv`` is written per table,
        without the DataFrame index.

        Args:
            data: Dictionary containing scraped data
            output_dir: Directory to save CSV files
        """
        import os

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        for ticker, tables in data.items():
            for table_name, df in tables.items():
                filename = f"{ticker}_{table_name}.csv"
                filepath = os.path.join(output_dir, filename)
                df.to_csv(filepath, index=False)
                print(f"Saved {filename}")




In [None]:
import requests
from typing import Dict, List, Optional
import pandas as pd

        # NOTE(review): the statements below are indented but are not enclosed
        # by any ``def`` or ``class`` in this cell, and the ``try:`` has no
        # matching ``except``/``finally`` here. ``BeautifulSoup`` is not
        # imported in this cell, and ``self``/``ticker`` are not bound at this
        # level. This looks like a partial paste of the body of a
        # ``scrape_ticker_analysis`` method -- confirm whether the draft
        # should be completed or removed.
        """
        Args:
            ticker (str): Stock ticker symbol (e.g., 'PG', 'AAPL')
        
        Returns:
            Dict[str, pd.DataFrame]: Dictionary containing scraped tables
        """
        # Page URL for the requested ticker's analysis view.
        url = f"https://uk.finance.yahoo.com/quote/{ticker}/analysis/"
        
        try:
            # Make the request; raise_for_status() raises on an HTTP error status
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            # Parse the HTML
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Dictionary to store all scraped tables
            tables_data = {}

class YahooFinanceAnalysisScraper:
    def __init__(self):
        # Browser-like request headers; the same mapping is kept on the
        # instance and installed on the session.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive'
        }
        self.headers = browser_headers

        # One session per instance, carrying the headers above.
        self.session = requests.Session()
        self.session.headers.update(browser_headers)

    def scrape_ticker_analysis(self, ticker: str) -> Dict[str, pd.DataFrame]:
        """
        Scrape the analysis data for one stock ticker.

        NOTE: this draft has no implementation yet; calling it returns None
        rather than the annotated dictionary.

        Args:
            ticker (str): Stock ticker symbol (e.g. 'PG', 'AAPL')

        Returns:
            Dict[str, pd.DataFrame]
        """
        # Placeholder: nothing is fetched or parsed here.
        return None

In [None]:

# Build one scraper instance for this demo.
scraper = YahooFinanceAnalysisScraper()

# Fetch every analysis table available for a single ticker (PG).
print("=== Scraping single ticker (PG) ===")
pg_data = scraper.scrape_ticker_analysis('PG')

# Print each table in full, under an upper-cased heading, without the index.
for table_name in pg_data:
    df = pg_data[table_name]
    print(f"\n{table_name.upper()} TABLE:")
    print(df.to_string(index=False))

# Last expression of the cell: shows which tables were found.
pg_data.keys()

dict_keys(['earnings_estimate', 'earnings_history', 'revenueEstimate', 'epsRevisions', 'epsTrend', 'growthEstimate'])