In [None]:
import os

from google.colab import drive

# Mount Google Drive so files written by later cells persist beyond this Colab session.
drive.mount('/content/drive')

output_dir = '/content/drive/MyDrive/Speeches'
# NOTE(review): the "ADDITIONAL CLEANING" cell further down reads its input from
# /content/drive/MyDrive/FedComs/Speeches/fed_speeches, while relative paths written after
# the chdir below land under /content/drive/MyDrive/Speeches. Confirm which is intended.

# exist_ok=True creates the directory when it is missing and is a no-op otherwise,
# without the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Relative paths used by later cells resolve inside output_dir from here on.
os.chdir(output_dir)

# Verify the current working directory
print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Speeches


In [None]:
pip install requests pandas PyPDF2 pdfplumber pytesseract pillow PyMuPDF beautifulsoup4

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m48.5/48.5 kB[0m

In [None]:
# @title ALTERNATIVE SCRAPER USING FRB-BOARD WEBSITE- use this one
#!/usr/bin/env python3
"""
Federal Reserve Speech Scraper (Robust Version)

This script scrapes speeches from Federal Reserve officials and saves them
into separate CSV files for each official.

Requirements:
pip install requests beautifulsoup4 pandas lxml

Usage:
python fed_speech_scraper.py
"""

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
import os
from datetime import datetime
from urllib.parse import urljoin
import logging

# Set up logging
# Message-only format; records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO, format='%(message)s')
# NOTE(review): `logger` is not referenced anywhere in the visible part of this cell;
# the scraper reports progress with print(). Confirm whether it is still needed.
logger = logging.getLogger(__name__)

class FedSpeechScraper:
    def __init__(self, years=None):
        """Prepare scraper state and an HTTP session with browser-like headers.

        Args:
            years: Years whose speech index pages will be scraped. A falsy value
                (``None`` or an empty list) selects ``[2024, 2025]``.
        """
        # Headers sent with every request so the session looks like a real browser.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

        self.base_url = "https://www.federalreserve.gov"
        self.years = years if years else [2024, 2025]  # Default to recent years
        self.speeches = []  # one dict per successfully extracted speech

        http_session = requests.Session()
        http_session.headers.update(browser_headers)
        self.session = http_session

    def test_single_url(self, url):
        """Test a single URL to debug issues"""
        try:
            print(f"Testing URL: {url}")
            response = self.session.get(url, timeout=30)
            print(f"Status code: {response.status_code}")
            print(f"Content length: {len(response.content)}")
            print(f"Content type: {response.headers.get('content-type', 'Unknown')}")

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                print(f"Title tag: {soup.find('title')}")
                print(f"H1 tags found: {len(soup.find_all('h1'))}")
                print(f"Paragraph tags found: {len(soup.find_all('p'))}")

                # Print first few paragraphs for debugging
                paragraphs = soup.find_all('p')[:3]
                for i, p in enumerate(paragraphs):
                    print(f"P{i+1}: {p.get_text()[:100]}...")

            return response.status_code == 200
        except Exception as e:
            print(f"Test failed: {e}")
            return False

    def get_speech_urls_for_year(self, year):
        """Collect the URLs of the individual speech pages listed for one year.

        The yearly index page is fetched and every ``.htm`` link that looks like a
        speech page is kept. When the index page answers 404 and ``year`` is before
        2009, two directory-style index URLs are tried instead.

        Args:
            year: Calendar year (int) whose index page should be read.

        Returns:
            list[str]: Absolute speech URLs, de-duplicated, in the order in which
            they first appear on the page. Empty when nothing could be fetched or
            parsed.
        """
        # Different URL patterns for different year ranges
        if year >= 2009:
            # Modern format: /newsevents/speech/YYYY-speeches.htm
            url = f"{self.base_url}/newsevents/speech/{year}-speeches.htm"
        else:
            # Older format: /newsevents/speech/YYYYspeech.htm
            url = f"{self.base_url}/newsevents/speech/{year}speech.htm"

        print(f"Fetching speeches for {year} from: {url}")

        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Debug: print page structure
            print(f"Page title: {soup.find('title')}")
            all_links = soup.find_all('a', href=True)
            print(f"Total links found: {len(all_links)}")

            # Links to the index pages themselves; 'speeches.htm' also covers
            # '<year>-speeches.htm'.
            index_markers = ('speeches.htm', f'{year}speech.htm')
            legacy_keywords = ('speech', 'remarks', 'testimony')
            legacy_skips = ('index', 'speeches.htm', f'{year}speech.htm')

            speech_urls = []
            for link in all_links:
                href = link.get('href', '')
                if not href.endswith('.htm'):
                    continue

                if '/newsevents/speech/' in href:
                    # Modern format: /newsevents/speech/[name][date].htm
                    if not any(marker in href for marker in index_markers):
                        # Resolve against the page URL so relative links stay in
                        # the page's directory instead of landing on the site root.
                        speech_urls.append(urljoin(url, href))
                elif year < 2009:
                    # Older pages: accept any speech-related link that is not a
                    # navigation or index page.
                    href_lower = href.lower()
                    if (any(keyword in href_lower for keyword in legacy_keywords) and
                            not any(skip in href_lower for skip in legacy_skips)):
                        speech_urls.append(urljoin(url, href))

            # Remove duplicates while keeping first-seen order, so repeated runs
            # process the speeches in the same sequence.
            speech_urls = list(dict.fromkeys(speech_urls))

            print(f"Found {len(speech_urls)} speeches for {year}")

            # Show first few URLs for debugging
            print("Sample URLs:")
            for sample_url in speech_urls[:3]:
                print(f"  {sample_url}")

            return speech_urls

        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            if status != 404:
                # Previously swallowed silently; surface it so gaps can be explained.
                print(f"HTTP error fetching {year} speeches: {e}")
                return []

            print(f"No speech page found for {year} (404 error)")
            # Try alternative URL patterns for very old years
            if year < 2009:
                alt_urls = [
                    f"{self.base_url}/boarddocs/speeches/{year}/",
                    f"{self.base_url}/newsevents/speech/{year}/",
                ]
                for alt_url in alt_urls:
                    try:
                        print(f"  Trying alternative URL: {alt_url}")
                        response = self.session.get(alt_url, timeout=30)
                        if response.status_code != 200:
                            continue

                        print(f"  Success with alternative URL!")
                        soup = BeautifulSoup(response.content, 'html.parser')
                        speech_urls = []
                        for link in soup.find_all('a', href=True):
                            href = link.get('href', '')
                            if href.endswith('.htm') and 'speech' in href.lower():
                                speech_urls.append(urljoin(alt_url, href))
                        return list(dict.fromkeys(speech_urls))
                    except Exception as alt_error:
                        # Best effort: report the failure and try the next candidate.
                        # Exception (not a bare except) lets Ctrl-C through.
                        print(f"  Alternative URL failed: {alt_error}")
                        continue
            return []
        except Exception as e:
            print(f"Error fetching {year} speeches: {e}")
            return []

    def safe_get_text(self, element, default=""):
        """Safely get text from an element"""
        try:
            if element is None:
                return default
            return element.get_text(strip=True)
        except:
            return default

    def safe_get_attribute(self, element, attribute, default=""):
        """Safely get an attribute from an element"""
        try:
            if element is None:
                return default
            return element.get(attribute, default)
        except:
            return default

    def extract_speech_details(self, url):
        """Extract speech details from a speech page"""
        try:
            print(f"  Fetching: {url}")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()

            # Check if we got actual content
            if len(response.content) < 1000:
                print(f"  ‚úó Response too short ({len(response.content)} bytes)")
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Debug: Check if we have content
            title_tag = soup.find('title')
            if not title_tag:
                print(f"  ‚úó No title tag found - possible blocked content")
                return None

            print(f"  Page title: {self.safe_get_text(title_tag)[:60]}...")

            # Extract date from URL pattern (e.g., bowman20250809a.htm)
            url_match = re.search(r'/([a-z]+)(\d{8})[a-z]?\.htm', url)
            if not url_match:
                print(f"  Could not parse date from URL: {url}")
                return None

            speaker_code = url_match.group(1)
            date_str = url_match.group(2)
            date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"

            # Extract title - try multiple approaches
            title = ""

            # Method 1: Look for h1 tag
            title_elem = soup.find('h1')
            if title_elem:
                title = self.safe_get_text(title_elem)

            # Method 2: Look in page title
            if not title:
                page_title = soup.find('title')
                if page_title:
                    title_text = self.safe_get_text(page_title)
                    # Clean up the title
                    if ' - Federal Reserve Board' in title_text:
                        title = title_text.replace(' - Federal Reserve Board', '').strip()
                    elif 'Speech by' in title_text:
                        # Extract just the speech part
                        parts = title_text.split(' - ')
                        if len(parts) > 1:
                            title = parts[0].replace('Speech by', '').strip()

            # Comprehensive speaker mapping with roles and service years
            speaker_mapping = {
                # Current Era (2018-Present)
                'powell': {'name': 'Jerome H. Powell', 'role': 'Chair', 'years': '2012-present'},
                'bowman': {'name': 'Michelle W. Bowman', 'role': 'Governor/Vice Chair for Supervision', 'years': '2018-present'},
                'waller': {'name': 'Christopher J. Waller', 'role': 'Governor', 'years': '2020-present'},
                'cook': {'name': 'Lisa D. Cook', 'role': 'Governor', 'years': '2022-present'},
                'kugler': {'name': 'Adriana D. Kugler', 'role': 'Governor', 'years': '2023-present'},
                'jefferson': {'name': 'Philip N. Jefferson', 'role': 'Governor/Vice Chair', 'years': '2022-present'},
                'barr': {'name': 'Michael S. Barr', 'role': 'Governor/Vice Chair for Supervision', 'years': '2022-present'},

                # Recent Era (2014-2023)
                'brainard': {'name': 'Lael Brainard', 'role': 'Governor/Vice Chair', 'years': '2014-2023'},
                'clarida': {'name': 'Richard H. Clarida', 'role': 'Vice Chair', 'years': '2018-2022'},
                'quarles': {'name': 'Randal K. Quarles', 'role': 'Governor/Vice Chair for Supervision', 'years': '2017-2021'},
                'fischer': {'name': 'Stanley Fischer', 'role': 'Vice Chair', 'years': '2014-2017'},

                # Obama Era (2009-2017)
                'yellen': {'name': 'Janet L. Yellen', 'role': 'Governor/Vice Chair/Chair', 'years': '2004-2018'},
                'tarullo': {'name': 'Daniel K. Tarullo', 'role': 'Governor', 'years': '2009-2017'},
                'raskin': {'name': 'Sarah Bloom Raskin', 'role': 'Governor', 'years': '2010-2014'},
                'stein': {'name': 'Jeremy C. Stein', 'role': 'Governor', 'years': '2012-2014'},
                'duke': {'name': 'Elizabeth A. Duke', 'role': 'Governor', 'years': '2008-2013'},

                # Financial Crisis Era (2006-2014)
                'bernanke': {'name': 'Ben S. Bernanke', 'role': 'Chair', 'years': '2006-2014'},
                'kohn': {'name': 'Donald L. Kohn', 'role': 'Governor/Vice Chair', 'years': '2002-2010'},
                'kroszner': {'name': 'Randall S. Kroszner', 'role': 'Governor', 'years': '2006-2009'},
                'warsh': {'name': 'Kevin M. Warsh', 'role': 'Governor', 'years': '2006-2011'},
                'mishkin': {'name': 'Frederic S. Mishkin', 'role': 'Governor', 'years': '2006-2008'},

                # Pre-Crisis Era (1987-2006)
                'greenspan': {'name': 'Alan Greenspan', 'role': 'Chair', 'years': '1987-2006'},
                'ferguson': {'name': 'Roger W. Ferguson Jr.', 'role': 'Governor/Vice Chair', 'years': '1997-2006'},
                'bies': {'name': 'Susan Schmidt Bies', 'role': 'Governor', 'years': '2001-2007'},
                'olson': {'name': 'Mark W. Olson', 'role': 'Governor', 'years': '2001-2006'},
                'gramlich': {'name': 'Edward M. Gramlich', 'role': 'Governor', 'years': '1997-2005'},
                'meyer': {'name': 'Laurence H. Meyer', 'role': 'Governor', 'years': '1996-2002'},

                # Alternative name patterns for same officials
                'jerome': {'name': 'Jerome H. Powell', 'role': 'Chair', 'years': '2012-present'},
                'michelle': {'name': 'Michelle W. Bowman', 'role': 'Governor/Vice Chair for Supervision', 'years': '2018-present'},
                'christopher': {'name': 'Christopher J. Waller', 'role': 'Governor', 'years': '2020-present'},
                'lisa': {'name': 'Lisa D. Cook', 'role': 'Governor', 'years': '2022-present'},
                'adriana': {'name': 'Adriana D. Kugler', 'role': 'Governor', 'years': '2023-present'},
                'philip': {'name': 'Philip N. Jefferson', 'role': 'Governor/Vice Chair', 'years': '2022-present'},
                'michael': {'name': 'Michael S. Barr', 'role': 'Governor/Vice Chair for Supervision', 'years': '2022-present'},
                'lael': {'name': 'Lael Brainard', 'role': 'Governor/Vice Chair', 'years': '2014-2023'},
                'richard': {'name': 'Richard H. Clarida', 'role': 'Vice Chair', 'years': '2018-2022'},
                'randal': {'name': 'Randal K. Quarles', 'role': 'Governor/Vice Chair for Supervision', 'years': '2017-2021'},
                'stanley': {'name': 'Stanley Fischer', 'role': 'Vice Chair', 'years': '2014-2017'},
                'janet': {'name': 'Janet L. Yellen', 'role': 'Governor/Vice Chair/Chair', 'years': '2004-2018'},
                'daniel': {'name': 'Daniel K. Tarullo', 'role': 'Governor', 'years': '2009-2017'},
                'sarah': {'name': 'Sarah Bloom Raskin', 'role': 'Governor', 'years': '2010-2014'},
                'jeremy': {'name': 'Jeremy C. Stein', 'role': 'Governor', 'years': '2012-2014'},
                'elizabeth': {'name': 'Elizabeth A. Duke', 'role': 'Governor', 'years': '2008-2013'},
                'ben': {'name': 'Ben S. Bernanke', 'role': 'Chair', 'years': '2006-2014'},
                'donald': {'name': 'Donald L. Kohn', 'role': 'Governor/Vice Chair', 'years': '2002-2010'},
                'randall': {'name': 'Randall S. Kroszner', 'role': 'Governor', 'years': '2006-2009'},
                'kevin': {'name': 'Kevin M. Warsh', 'role': 'Governor', 'years': '2006-2011'},
                'frederic': {'name': 'Frederic S. Mishkin', 'role': 'Governor', 'years': '2006-2008'},
                'alan': {'name': 'Alan Greenspan', 'role': 'Chair', 'years': '1987-2006'},
                'roger': {'name': 'Roger W. Ferguson Jr.', 'role': 'Governor/Vice Chair', 'years': '1997-2006'},
                'susan': {'name': 'Susan Schmidt Bies', 'role': 'Governor', 'years': '2001-2007'},
                'mark': {'name': 'Mark W. Olson', 'role': 'Governor', 'years': '2001-2006'},
                'edward': {'name': 'Edward M. Gramlich', 'role': 'Governor', 'years': '1997-2005'},
                'laurence': {'name': 'Laurence H. Meyer', 'role': 'Governor', 'years': '1996-2002'}
            }

            speaker_info = speaker_mapping.get(speaker_code, {
                'name': speaker_code.title(),
                'role': 'Federal Reserve Official',
                'years': 'TBD'
            })

            speaker = speaker_info['name']
            role = speaker_info['role']
            year_served = speaker_info['years']

            # Extract the main speech text - improved formatting preservation
            text_content = ""

            # Try different content extraction strategies
            content_found = False

            # Strategy 1: Look for specific Fed content containers
            content_selectors = [
                'div[class*="col-xs-12"]',
                'div.content',
                'main',
                'article',
                'div#content'
            ]

            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove unwanted elements
                    for unwanted in content_div.select('nav, header, footer, .footnote, sup, .return-to-text, script, style'):
                        unwanted.decompose()

                    # Get text with better formatting preservation
                    text_parts = []

                    # Process all text-containing elements (p, h1, h2, h3, etc.)
                    text_elements = content_div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

                    for element in text_elements:
                        # Get text and preserve some formatting
                        raw_text = element.get_text(separator=' ', strip=True)

                        # Clean up extra whitespace but preserve sentence structure
                        cleaned_text = re.sub(r'\s+', ' ', raw_text).strip()

                        # Filter out unwanted content
                        if (cleaned_text and
                            len(cleaned_text) > 30 and
                            not any(skip_phrase in cleaned_text.lower() for skip_phrase in [
                                'return to text', 'board of governors', 'federal reserve board',
                                'share', 'pdf', 'accessible version', 'last update', 'back to top',
                                'skip to main content', 'official website'
                            ])):

                            # Add extra spacing for headers
                            if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                                text_parts.append(f"\n{cleaned_text}\n")
                            else:
                                text_parts.append(cleaned_text)

                    if len(text_parts) > 3:  # Ensure we have enough content
                        # Join with proper paragraph spacing
                        text_content = '\n\n'.join(text_parts)
                        # Clean up excessive newlines
                        text_content = re.sub(r'\n{3,}', '\n\n', text_content)
                        content_found = True
                        break

            # Strategy 2: If that didn't work, get all paragraphs from the page
            if not content_found:
                all_text_elements = soup.find_all(['p', 'div'])
                text_parts = []

                for element in all_text_elements:
                    # Get text with better spacing
                    raw_text = element.get_text(separator=' ', strip=True)
                    cleaned_text = re.sub(r'\s+', ' ', raw_text).strip()

                    if (cleaned_text and
                        len(cleaned_text) > 50 and
                        not any(skip_phrase in cleaned_text.lower() for skip_phrase in [
                            'return to text', 'board of governors', 'federal reserve board',
                            'share', 'pdf', 'accessible version', 'last update', 'navigation',
                            'skip to main content', 'official website', '.gov', 'https'
                        ])):
                        text_parts.append(cleaned_text)

                if len(text_parts) > 3:
                    text_content = '\n\n'.join(text_parts)
                    # Clean up excessive newlines
                    text_content = re.sub(r'\n{3,}', '\n\n', text_content)
                    content_found = True

            # Create ID
            names = speaker.split()
            first_initial = names[0][0].lower()
            last_name = names[-1].lower().replace('.', '')
            speech_id = f"{first_initial}{last_name}_{date_str}"

            result = {
                'date': date,
                'speaker': speaker,
                'role': role,
                'year_served': year_served,
                'title': title or f"Speech by {speaker}",
                'text': text_content,
                'url': url,
                'id': speech_id
            }

            if content_found and len(text_content) > 500:
                print(f"  ‚úì SUCCESS: {speaker} - {len(text_content)} chars")
                return result
            else:
                print(f"  ‚úó FAILED: Insufficient content ({len(text_content)} chars)")
                return None

        except requests.exceptions.RequestException as e:
            print(f"  ‚úó Request error: {e}")
            return None
        except Exception as e:
            print(f"  ‚úó Parse error: {e}")
            return None

    def scrape_all_speeches(self, test_first=True):
        """Scrape all speeches for the specified years"""

        # First, test a few URLs to make sure our extraction works
        if test_first:
            print("\n=== TESTING EXTRACTION ===")
            test_urls = [
                "https://www.federalreserve.gov/newsevents/speech/powell20250822a.htm",
                "https://www.federalreserve.gov/newsevents/speech/bowman20250809a.htm"
            ]

            for test_url in test_urls:
                if self.test_single_url(test_url):
                    print("Test passed - proceeding with full scrape")
                    break
            else:
                print("All tests failed - there may be blocking or connectivity issues")
                return

        all_urls = []

        # Collect all URLs first
        for year in self.years:
            urls = self.get_speech_urls_for_year(year)
            all_urls.extend(urls)

        print(f"\nTotal speeches to process: {len(all_urls)}")

        if not all_urls:
            print("No URLs found to process!")
            return

        # Process each speech
        successful = 0
        failed = 0

        for i, url in enumerate(all_urls, 1):
            print(f"\nProcessing speech {i}/{len(all_urls)}:")

            speech_data = self.extract_speech_details(url)
            if speech_data:
                self.speeches.append(speech_data)
                successful += 1
            else:
                failed += 1

            # Be nice to the server
            time.sleep(2)

            # Progress update every 10 speeches
            if i % 10 == 0:
                print(f"\n--- Progress: {i}/{len(all_urls)} processed, {successful} successful, {failed} failed ---")

        print(f"\nFinal Results: {successful} successful, {failed} failed out of {len(all_urls)} total")

    def organize_by_official(self):
        """Organize speeches by official"""
        officials = {}

        for speech in self.speeches:
            speaker = speech['speaker']
            if speaker not in officials:
                officials[speaker] = []
            officials[speaker].append(speech)

        return officials

    def save_to_csv(self):
        """Save speeches to CSV files organized by official"""
        if not self.speeches:
            print("No speeches to save!")
            return

        officials = self.organize_by_official()

        if not os.path.exists('fed_speeches'):
            os.makedirs('fed_speeches')

        for speaker, speeches in officials.items():
            # Create filename
            names = speaker.replace('.', '').split()
            if len(names) >= 2:
                first_name = names[0].lower()
                last_name = names[-1].lower()
                filename = f"{first_name}_{last_name}_speeches.csv"
            else:
                filename = f"{speaker.lower().replace(' ', '_')}_speeches.csv"

            # Sort speeches by date
            speeches_sorted = sorted(speeches, key=lambda x: x['date'], reverse=True)

            # Create DataFrame
            df = pd.DataFrame(speeches_sorted)

            # Save to CSV
            filepath = os.path.join('fed_speeches', filename)
            df.to_csv(filepath, index=False, encoding='utf-8')

            print(f"Saved {len(speeches)} speeches for {speaker} to {filepath}")

        # Also save a combined file - REMOVED per request
        # all_speeches_sorted = sorted(self.speeches, key=lambda x: x['date'], reverse=True)
        # all_df = pd.DataFrame(all_speeches_sorted)
        # all_filepath = os.path.join('fed_speeches', 'all_speeches_combined.csv')
        # all_df.to_csv(all_filepath, index=False, encoding='utf-8')
        # print(f"Saved combined file with {len(self.speeches)} speeches to {all_filepath}")

def main():
    """Run the scraper for 2006 through 2025 and write the per-official CSV files.

    Prints a banner, scrapes every configured year (after a connectivity probe),
    then either saves the results or lists likely causes when nothing was scraped.
    """
    # Specify years to scrape
    years_to_scrape = list(range(2006, 2026))  # 2006 through 2025
    first_year, last_year = years_to_scrape[0], years_to_scrape[-1]

    print("Federal Reserve Speech Scraper (Historical Coverage)")
    print("=" * 55)
    print(f"Scraping speeches for years: {first_year}-{last_year}")

    # Create the scraper and collect every speech
    scraper = FedSpeechScraper(years=years_to_scrape)
    scraper.scrape_all_speeches(test_first=True)

    # Nothing collected: explain the likely causes and stop.
    if not scraper.speeches:
        failure_report = (
            "\n‚ùå No speeches were successfully scraped.",
            "This could be due to:",
            "- Network connectivity issues",
            "- Website blocking/rate limiting",
            "- Changes in website structure",
            "- Firewall or proxy restrictions",
        )
        for report_line in failure_report:
            print(report_line)
        return

    # Save to CSV files
    scraper.save_to_csv()
    print(f"\nüéâ SUCCESS! Processed {len(scraper.speeches)} speeches.")
    print("Check the 'fed_speeches' directory for CSV files.")

if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Processing speech 173/1128:
  Fetching: https://www.federalreserve.gov/newsevents/speech/kroszner20081203a.htm
  Page title: The Community Reinvestment Act and the Recent Mortgage Crisi...
  ‚úì SUCCESS: Randall S. Kroszner - 16101 chars

Processing speech 174/1128:
  Fetching: https://www.federalreserve.gov/newsevents/speech/kohn20081015a.htm
  Page title: Economic Outlook - Federal Reserve Board...
  ‚úì SUCCESS: Donald L. Kohn - 18151 chars

Processing speech 175/1128:
  Fetching: https://www.federalreserve.gov/newsevents/speech/kroszner20080901a.htm
  Page title: The United States in the International Financial System:  A ...
  ‚úì SUCCESS: Randall S. Kroszner - 18563 chars

Processing speech 176/1128:
  Fetching: https://www.federalreserve.gov/newsevents/speech/bernanke20081014a.htm
  Page title: Remarks - Federal Reserve Board...
  ‚úì SUCCESS: Ben S. Bernanke - 3326 chars

Processing speech 177/1128:
  Fetching: h

In [1]:
# @title ADDITIONAL CLEANING
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import os
import pandas as pd
import numpy as np
import re
import json
import random
import time
from glob import glob

# Set random seed for reproducibility.
# A clock-derived seed (int(time.time())) differs on every run, so results could not be
# reproduced; a fixed value makes every Restart-and-Run-All draw the same random numbers.
seed = 42
random.seed(seed)
np.random.seed(seed)
print(f"Random seed: {seed}")

# Directory paths
dict_dir = '/content/drive/MyDrive/FedComs/Dictionaries'
input_dir = '/content/drive/MyDrive/FedComs/Speeches/fed_speeches'
cleaned_output_dir = '/content/drive/MyDrive/FedComs/Speeches/fed_speeches_clean'
validation_output_dir = '/content/drive/MyDrive/FedComs/Validation_Sets'
summary_output_dir = '/content/drive/MyDrive/FedComs/Speeches'

# Create output directories if they don't exist.
# exist_ok=True is a no-op for existing directories and avoids a check-then-create race.
for directory in [cleaned_output_dir, validation_output_dir, summary_output_dir]:
    os.makedirs(directory, exist_ok=True)

# Relative paths used below resolve inside summary_output_dir.
os.chdir(summary_output_dir)
print(f"Current working directory: {os.getcwd()}")

# ============================================================================
# STEP 1: CLEAN SPEECHES
# ============================================================================

def remove_video_controls_text(text):
    """Strip the video player's keyboard-shortcut help text from ``text``.

    Two case-insensitive passes are made: the first removes the exact,
    unspaced wording; the second removes anything from "[Space Bar]" through
    "caption on/off." (plus an optional ";" and trailing whitespace), which
    covers differently spaced or formatted variants.
    """
    exact_wording = ''.join((
        r'\[Space Bar\] toggles play/pause;',
        r'\[Right/Left Arrows\] seeks the video forwards and back \(5 sec \);',
        r'\[Up/Down Arrows\] increase/decrease volume;',
        r'\[F\] toggles fullscreen on/off \(Except IE 11\);',
        r'The \[Tab\] key may be used in combination with the \[Enter/Return\] key ',
        r'to navigate and activate control buttons, such as caption on/off\.;',
    ))
    loose_wording = r'\[Space Bar\].*?caption on/off\.;?\s*'

    # Pass 1: the literal block. Pass 2: variations (DOTALL lets it span line breaks).
    without_exact = re.compile(exact_wording, re.IGNORECASE).sub('', text)
    return re.compile(loose_wording, re.IGNORECASE | re.DOTALL).sub('', without_exact)

def remove_references_section(text):
    """
    Cut the text at a trailing bibliography, if one is found.

    A line consisting solely of "Reference"/"References" (any case) counts as a
    section header only when the following lines (up to three) contain something
    that looks like a citation. Everything from the first such header onward is
    dropped; otherwise the text is returned unchanged.
    """
    # Patterns that suggest the lines after the header are bibliography entries.
    citation_markers = (
        r'\d{4}[a-z]?\.?\s',  # Year followed by period/space
        r'\([12]\d{3}[a-z]?\)',  # Year in parentheses
        r'[A-Z][a-z]+,\s+[A-Z]\.',  # Last name, First initial
        r'Journal of',
        r'Review of',
        r'Federal Reserve',
        r'Working Paper',
        r'https?://',
        r'doi:',
    )
    header_pattern = re.compile(r'^references?$', re.IGNORECASE)

    rows = text.split('\n')
    for position, row in enumerate(rows):
        if not header_pattern.match(row.strip()):
            continue

        # Look at the (up to) three lines that follow the candidate header.
        lookahead = '\n'.join(rows[position + 1:position + 4])
        if any(re.search(marker, lookahead, re.IGNORECASE) for marker in citation_markers):
            # First confirmed header wins: keep only what precedes it.
            return '\n'.join(rows[:position])

    return text

def fix_text_encoding(text: str) -> str:
    """Fix common text encoding issues.

    Applies a fixed sequence of literal string replacements to ``text`` and
    then strips control characters. The replacements run in the order written;
    the order matters because the fourth search string is a prefix of the
    first and third ones, so the longer sequences must be handled first.

    Returns the transformed string.
    """
    # NOTE(review): the non-ASCII literals on the next six lines look like
    # mis-decoded (mojibake) text in this copy of the file; confirm that the
    # search strings and the dash replacements are the characters intended.
    text = text.replace('√¢‚Ç¨"', '‚Äî')
    # NOTE(review): identical to the previous line, so it never finds anything
    # left to replace; presumably meant to target a different sequence.
    text = text.replace('√¢‚Ç¨"', '‚Äî')
    text = text.replace('√¢‚Ç¨≈ì', '"')
    # Shortest variant last: it is a prefix of the sequences handled above.
    text = text.replace('√¢‚Ç¨', '"')
    text = text.replace('\u2013', '‚Äì')
    text = text.replace('\u2014', '‚Äî')
    # Typographic quotes -> plain ASCII quotes.
    text = text.replace('\u2018', "'")
    text = text.replace('\u2019', "'")
    text = text.replace('\u201c', '"')
    text = text.replace('\u201d', '"')
    # Single-character ellipsis -> three periods.
    text = text.replace('\u2026', '...')
    # Remove C0/C1 control characters and DEL, but keep tab (\x09),
    # newline (\x0a) and carriage return (\x0d).
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    return text

def clean_speech_text(text):
    """Return ``text`` with encoding repaired and boilerplate removed.

    Runs, in order: encoding fixes, video-player help text removal and
    references-section removal, then normalises whitespace and trims the ends.
    """
    cleaned = remove_references_section(
        remove_video_controls_text(fix_text_encoding(text))
    )

    # First squeeze three-or-more (possibly whitespace-padded) newlines down to
    # one blank line, then collapse runs of spaces to a single space.
    for pattern, replacement in ((r'\n\s*\n\s*\n+', '\n\n'), (r' +', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

print("\nCleaning speech files...")
print(f"Reading from: {input_dir}")

# Every CSV directly inside the input directory is treated as a speech file
csv_files = glob(os.path.join(input_dir, '*.csv'))
print(f"Found {len(csv_files)} speech files")

if not csv_files:
    print("ERROR: No CSV files found in input directory!")
    print(f"Please check that files exist in: {input_dir}")
else:
    # Clean each file independently; a failure in one does not stop the rest
    cleaned_count = 0
    for csv_file in csv_files:
        filename = os.path.basename(csv_file)
        print(f"Processing {filename}...")

        try:
            # Undecodable bytes are replaced instead of aborting the read
            df = pd.read_csv(csv_file, encoding='utf-8', encoding_errors='replace')

            # The text column is the first one whose name contains "text"
            for col in df.columns:
                if 'text' in col.lower():
                    text_col = col
                    break
            else:
                text_col = None

            if text_col is None:
                print(f"  Warning: No text column found in {filename}, skipping...")
                continue

            # Missing cells become empty strings; everything else is cleaned
            df[text_col] = df[text_col].apply(
                lambda cell: '' if pd.isna(cell) else clean_speech_text(str(cell))
            )

            # Write the cleaned copy under the same file name
            output_file = os.path.join(cleaned_output_dir, filename)
            df.to_csv(output_file, index=False)
            cleaned_count += 1

        except Exception as e:
            # Report and move on to the next file
            print(f"  Error processing {filename}: {e}")

    print(f"\nCleaned {cleaned_count} speech files")
    print(f"Cleaned files saved to: {cleaned_output_dir}")

# ============================================================================
# STEP 2: LOAD DICTIONARIES
# ============================================================================

print("\n" + "="*70)
print("LOADING DICTIONARIES")
print("="*70)

# JSON files are read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.

# Labor indicator name -> list of keywords (includes the "General Labor" entry)
with open(os.path.join(dict_dir, 'labor_indicators.json'), 'r', encoding='utf-8') as f:
    LABOR_INDICATORS = json.load(f)

# Inflation category -> {pattern name -> list of regex patterns}
with open(os.path.join(dict_dir, 'inflation_indicators.json'), 'r', encoding='utf-8') as f:
    INFLATION_INDICATORS = json.load(f)

# Pattern name -> indicator label used in the output columns
with open(os.path.join(dict_dir, 'inflation_pattern_mapping.json'), 'r', encoding='utf-8') as f:
    INFLATION_PATTERN_TO_INDICATOR = json.load(f)

print("Dictionaries loaded successfully!")
print(f"Labor indicators: {list(LABOR_INDICATORS.keys())}")
print(f"Inflation categories: {list(INFLATION_INDICATORS.keys())}")

# ============================================================================
# STEP 3: SENTENCE SPLITTING AND CLASSIFICATION FUNCTIONS
# ============================================================================

def split_into_sentences(text):
    """Split text into sentences, preserving initials and abbreviations.

    Periods that do not end a sentence (known abbreviations, name initials,
    decimal numbers) are temporarily masked with a ``<DOT>`` marker, the text
    is split after ``.``/``!``/``?`` followed by whitespace and an uppercase
    letter (or end of text), and the markers are turned back into periods.

    Only the periods are masked; the surrounding characters stay in place, so
    - the original capitalisation of an abbreviation is kept on restore, and
    - a sentence that *begins* with an abbreviation (e.g. "U.S. inflation ...")
      still exposes its leading capital to the split rule.

    No special handling is needed for "Voting for ..." / "Voting against ..."
    lists: once initials and abbreviations are masked they contain no
    sentence-ending punctuation, so they cannot be split internally and their
    terminating period stays attached.

    Args:
        text: Raw speech text.

    Returns:
        List of non-empty, stripped sentence strings.
    """
    text = fix_text_encoding(text)

    abbreviations = [
        r'\bU\.S\.A\.', r'\bU\.S\.', r'\bU\.K\.', r'\bE\.U\.',
        r'\bSt\.', r'\bMr\.', r'\bMrs\.', r'\bMs\.', r'\bDr\.',
        r'\bProf\.', r'\bSr\.', r'\bJr\.', r'\bvs\.', r'\betc\.',
        r'\bi\.e\.', r'\be\.g\.', r'\bVol\.', r'\bNo\.', r'\bpp\.',
        r'\bCo\.', r'\bInc\.', r'\bLtd\.', r'\bCorp\.',
        r'\bPh\.D\.', r'\bM\.A\.', r'\bM\.S\.', r'\bB\.A\.',
        r'\bD\.C\.', r'\bA\.M\.', r'\bP\.M\.'
    ]

    def mask_periods(match):
        # Hide the periods of a protected token from the splitter.
        return match.group(0).replace('.', '<DOT>')

    # Longer abbreviations come first in the list (U.S.A. before U.S.), so the
    # longest form is masked before a shorter prefix can match.
    # NOTE(review): matching is case-insensitive, so a sentence-final word such
    # as "no." is treated as the abbreviation "No." and will not end a sentence.
    for abbr in abbreviations:
        text = re.sub(abbr, mask_periods, text, flags=re.IGNORECASE)

    # Initials in front of a capitalised word, e.g. "J. H. Powell".
    text = re.sub(r'\b([A-Z])\.(\s+[A-Z]\.)*(?=\s+[A-Z][a-z]+)', mask_periods, text)
    # Decimal numbers, e.g. "3.5".
    text = re.sub(r'\b\d+\.\d+\b', mask_periods, text)

    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z]|$)', text)

    return [piece.strip().replace('<DOT>', '.') for piece in pieces if piece.strip()]

def check_keywords_in_sentence(sentence, keywords):
    """Return True when at least one keyword occurs in ``sentence``.

    Matching is case-insensitive and whole-word: each keyword is escaped and
    wrapped in ``\\b`` word boundaries before searching.
    """
    lowered = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in keywords
    )

def check_employment_indicator(sentence, keywords):
    """Keyword check for the Employment indicator with mandate phrases excluded.

    Returns False outright when the sentence mentions "maximum employment",
    "full employment" or "employment goal"; otherwise returns True when any
    keyword occurs as a whole word (case-insensitive).
    """
    lowered = sentence.lower()

    # Phrases that refer to the policy mandate rather than the indicator.
    mandate_phrases = (
        r'\b(?:maximum|full)\s+employment\b',
        r'\bemployment\s+goal\b',
    )
    if any(re.search(phrase, lowered) for phrase in mandate_phrases):
        return False

    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in keywords
    )

def check_general_labor_term(sentence):
    """Return True when ``sentence`` contains any "General Labor" keyword.

    Keywords come from ``LABOR_INDICATORS["General Labor"]`` (empty when the
    entry is missing) and are matched as whole words, case-insensitively.
    """
    lowered = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in LABOR_INDICATORS.get("General Labor", [])
    )

def check_general_inflation_terms(sentence):
    """Return True when ``sentence`` matches any general inflation pattern.

    Patterns are the regexes stored under
    ``INFLATION_INDICATORS["General Inflation"]["general_patterns"]``; a
    missing entry means no patterns and therefore False.
    """
    lowered = sentence.lower()
    general_section = INFLATION_INDICATORS.get("General Inflation", {})
    return any(
        re.search(regex, lowered, re.IGNORECASE)
        for regex in general_section.get("general_patterns", [])
    )

def check_inflation_sentence(sentence):
    """Return the set of inflation indicator labels mentioned in ``sentence``.

    Every pattern group in ``INFLATION_INDICATORS`` is tested; a group that
    matches contributes its label from ``INFLATION_PATTERN_TO_INDICATOR``
    ("Other" when the group has no mapping). The generic labels "Core" and
    "Headline" are dropped when a more specific CPI/PCE variant was also found.
    """
    lowered = sentence.lower()
    found = set()

    for subcategories in INFLATION_INDICATORS.values():
        for pattern_name, pattern_list in subcategories.items():
            # One matching regex is enough for the whole group.
            if any(re.search(regex, lowered, re.IGNORECASE) for regex in pattern_list):
                found.add(INFLATION_PATTERN_TO_INDICATOR.get(pattern_name, "Other"))

    # A specific measure supersedes its generic parent label.
    supersedes = (
        ("Core", ("Core_CPI", "Core_PCE")),
        ("Headline", ("Headline_CPI", "Headline_PCE")),
    )
    for generic_label, specific_labels in supersedes:
        if any(label in found for label in specific_labels):
            found.discard(generic_label)

    return found

def classify_sentence(sentence):
    """Label one sentence as "Labor", "Inflation", "Both" or "Neither".

    Returns a dict with:
        'classification'       - the label,
        'labor_indicators'     - list of specific labor indicators matched
                                 ("General Labor" is never included),
        'inflation_indicators' - list of inflation indicator labels matched.
    A sentence counts as labor/inflation when it matches either a specific
    indicator or the corresponding general term list.
    """
    labor_hits = set()
    for indicator, keywords in LABOR_INDICATORS.items():
        if indicator == "General Labor":
            continue

        # "Employment" has its own checker that ignores mandate phrases.
        matcher = (
            check_employment_indicator
            if indicator == "Employment"
            else check_keywords_in_sentence
        )
        if matcher(sentence, keywords):
            labor_hits.add(indicator)

    labor_general = check_general_labor_term(sentence)
    labor_found = bool(labor_hits) or labor_general

    inflation_hits = check_inflation_sentence(sentence)
    inflation_general = check_general_inflation_terms(sentence)
    inflation_found = bool(inflation_hits) or inflation_general

    # (labor?, inflation?) -> label
    labels = {
        (True, True): "Both",
        (True, False): "Labor",
        (False, True): "Inflation",
        (False, False): "Neither",
    }

    return {
        'classification': labels[(bool(labor_found), bool(inflation_found))],
        'labor_indicators': list(labor_hits),
        'inflation_indicators': list(inflation_hits)
    }

def analyze_speech(text):
    """Analyze a single speech for labor and inflation content.

    The speech is split into sentences, each sentence is classified with
    ``classify_sentence`` and per-speech statistics are accumulated.

    Args:
        text: Full text of one speech.

    Returns:
        A tuple ``(summary_results, sentence_data_list)``:

        * ``summary_results`` - dict with sentence counts
          (``sentences_on_labor``, ``sentences_on_inflation``,
          ``sentences_on_both``, ``total_sentences``), the labor share among
          labor-or-inflation sentences, per-indicator counts, per-indicator
          emphasis (share of all indicator mentions of that topic) and
          per-indicator share of all sentences. Ratios are 0 when the
          denominator is 0.
        * ``sentence_data_list`` - one dict per sentence with its 1-based
          number, text, classification and comma-separated indicator names.
    """
    def ratio(count, total):
        # Guarded division: empty speeches / topics yield 0 instead of raising.
        return count / total if total > 0 else 0

    sentences = split_into_sentences(text)
    total_sentences = len(sentences)

    labor_sentences = 0
    inflation_sentences = 0
    both_sentences = 0

    # Specific indicators only; the general buckets are not counted separately.
    labor_indicator_counts = {
        indicator: 0 for indicator in LABOR_INDICATORS.keys() if indicator != "General Labor"
    }
    inflation_indicator_list = sorted(list(set(
        indicator for indicator in INFLATION_PATTERN_TO_INDICATOR.values()
        if indicator not in ["General_Inflation", "Other"]
    )))
    inflation_indicator_counts = {indicator: 0 for indicator in inflation_indicator_list}

    sentence_data_list = []

    for sent_idx, sentence in enumerate(sentences, start=1):
        classification_result = classify_sentence(sentence)
        classification = classification_result['classification']
        labor_found_indicators = classification_result['labor_indicators']
        inflation_found_indicators = classification_result['inflation_indicators']

        labor_indicators_filtered = sorted(
            ind for ind in labor_found_indicators if ind != "General Labor"
        )
        inflation_indicators_filtered = sorted(
            ind for ind in inflation_found_indicators
            if ind not in ["General_Inflation", "Other"]
        )

        sentence_data_list.append({
            'sentence_number': sent_idx,
            'sentence_text': sentence,
            'classification': classification,
            # join() of an empty list is '', matching the "no indicators" case
            'labor_indicators': ', '.join(labor_indicators_filtered),
            'inflation_indicators': ', '.join(inflation_indicators_filtered)
        })

        # The label already encodes "specific or general term found" for each
        # topic, so the counts are taken from it instead of re-running the
        # general-term regex scans a second time.
        if classification in ("Labor", "Both"):
            labor_sentences += 1
        if classification in ("Inflation", "Both"):
            inflation_sentences += 1
        if classification == "Both":
            both_sentences += 1

        for indicator in labor_found_indicators:
            if indicator in labor_indicator_counts:
                labor_indicator_counts[indicator] += 1

        for indicator in inflation_found_indicators:
            if indicator in inflation_indicator_counts:
                inflation_indicator_counts[indicator] += 1

    total_labor_mentions = sum(labor_indicator_counts.values())
    total_inflation_mentions = sum(inflation_indicator_counts.values())

    # Emphasis: each indicator's share of all indicator mentions for its topic.
    labor_emphasis = {
        f"labor_emphasis_{indicator}": ratio(count, total_labor_mentions)
        for indicator, count in labor_indicator_counts.items()
    }
    inflation_emphasis = {
        f"inflation_emphasis_{indicator}": ratio(count, total_inflation_mentions)
        for indicator, count in inflation_indicator_counts.items()
    }

    # Share of all sentences in the speech that mention each indicator.
    labor_sentence_share = {
        f"labor_share_total_sentences_{indicator}": ratio(count, total_sentences)
        for indicator, count in labor_indicator_counts.items()
    }
    inflation_sentence_share = {
        f"inflation_share_total_sentences_{indicator}": ratio(count, total_sentences)
        for indicator, count in inflation_indicator_counts.items()
    }

    # Sentences on labor or inflation, counting "both" sentences once.
    labor_inflation_total = labor_sentences + inflation_sentences - both_sentences
    labor_share_of_labor_inflation = ratio(labor_sentences, labor_inflation_total)

    # Insertion order below determines the column order of the summary CSV.
    summary_results = {
        'sentences_on_labor': labor_sentences,
        'sentences_on_inflation': inflation_sentences,
        'sentences_on_both': both_sentences,
        'total_sentences': total_sentences,
        'labor_share_of_labor_inflation_sentences': labor_share_of_labor_inflation
    }

    for indicator, count in labor_indicator_counts.items():
        summary_results[f'labor_{indicator}_count'] = count

    for indicator, count in inflation_indicator_counts.items():
        summary_results[f'inflation_{indicator}_count'] = count

    summary_results.update(labor_emphasis)
    summary_results.update(inflation_emphasis)
    summary_results.update(labor_sentence_share)
    summary_results.update(inflation_sentence_share)

    return summary_results, sentence_data_list

# ============================================================================
# STEP 4: CLASSIFY SPEECH CONTENT
# ============================================================================

print("\n" + "="*70)
print("CLASSIFYING SPEECH CONTENT")
print("="*70)

# Sorted so the row order of results_list / all_sentences does not depend on
# filesystem listing order; the seeded validation sample drawn later from
# all_sentences is only reproducible if that order is stable.
cleaned_files = sorted(glob(os.path.join(cleaned_output_dir, '*.csv')))
print(f"Found {len(cleaned_files)} cleaned speech files to classify")

results_list = []   # one summary dict per speech
all_sentences = []  # one dict per sentence, across all speeches

for idx, csv_file in enumerate(cleaned_files):
    filename = os.path.basename(csv_file)

    # Progress line for every 5th file, or for every file when there are few
    if idx % 5 == 0 or len(cleaned_files) <= 10:
        print(f"Processing file {idx+1}/{len(cleaned_files)}: {filename}")

    try:
        df = pd.read_csv(csv_file, encoding='utf-8', encoding_errors='replace')

        # The text column is the first one whose name contains "text"
        text_col = None
        for col in df.columns:
            if 'text' in col.lower():
                text_col = col
                break

        if text_col is None:
            print(f"  Warning: No text column in {filename}, skipping...")
            continue

        for row_idx, row in df.iterrows():
            if pd.isna(row[text_col]):
                text = ''
            else:
                text = str(row[text_col])

            if len(text.strip()) == 0:
                continue

            summary_results, sentence_data_list = analyze_speech(text)

            # Carry the speech's metadata columns into the summary row. The
            # detected text column is skipped whatever its exact name is, so
            # the full speech text never ends up in the summary dataset.
            for col in df.columns:
                if col == text_col or col.lower() in ['text', 'speech_text']:
                    continue
                summary_results[col] = str(row[col]) if pd.notna(row[col]) else ''

            results_list.append(summary_results)

            # Speaker and date are properties of the speech, not of a single
            # sentence, so they are resolved once per row.
            official_name = ''
            for name_col in ['official_name', 'name', 'Name', 'speaker', 'Speaker']:
                if name_col in row and pd.notna(row[name_col]):
                    official_name = str(row[name_col])
                    break

            date_val = ''
            for date_col in ['date', 'Date']:
                if date_col in row and pd.notna(row[date_col]):
                    date_val = str(row[date_col])
                    break

            for sentence_data in sentence_data_list:
                sentence_data['official_name'] = official_name
                sentence_data['date'] = date_val
                all_sentences.append(sentence_data)

    except Exception as e:
        # Best effort: report the failing file with its traceback and continue
        print(f"  Error processing {filename}: {e}")
        import traceback
        print(f"  Full traceback: {traceback.format_exc()}")
        continue

def _print_emphasis_vectors(ranked_officials, speeches_df, name_col, labor_cols, inflation_cols):
    """Print the mean labor and inflation emphasis for each ranked official.

    ``ranked_officials`` is indexed by official name and has a 'mean' column
    (labor share); ``speeches_df`` holds one row per speech. Only indicators
    with a positive mean emphasis are printed.
    """
    sections = (
        ("Labor Emphasis:", 'labor_emphasis_', labor_cols),
        ("\nInflation Emphasis:", 'inflation_emphasis_', inflation_cols),
    )
    for official_name in ranked_officials.index:
        official_speeches = speeches_df[speeches_df[name_col] == official_name]

        print(f"\n{official_name} (Labor Share: {ranked_officials.loc[official_name, 'mean']:.2%})")
        print("-" * 70)

        for heading, prefix, cols in sections:
            print(heading)
            for col in sorted(cols):
                indicator_name = col.replace(prefix, '')
                avg_emphasis = official_speeches[col].mean()
                if avg_emphasis > 0:
                    print(f"  {indicator_name:25s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")


# Create summary dataframe (one row per analyzed speech)
results_df = pd.DataFrame(results_list)

if len(results_df) > 0:
    # Identifying columns first, everything else in its existing order
    priority_cols = ['date', 'Date', 'official_name', 'name', 'Name', 'title', 'Title']
    first_cols = [col for col in priority_cols if col in results_df.columns]
    other_cols = [col for col in results_df.columns if col not in first_cols]
    results_df = results_df[first_cols + other_cols]

    date_col = None
    for col in ['date', 'Date']:
        if col in results_df.columns:
            date_col = col
            break

    if date_col:
        # Sorting is best effort, but a failure is reported rather than hidden.
        # NOTE(review): metadata values were stored as strings, so this is a
        # lexicographic sort; it is chronological only for ISO-style dates.
        try:
            results_df = results_df.sort_values(date_col)
        except Exception as e:
            print(f"  Warning: could not sort by {date_col}: {e}")

    summary_output_file = os.path.join(summary_output_dir, 'speeches_content.csv')
    results_df.to_csv(summary_output_file, index=False)
    print(f"\nSummary dataset saved to: {summary_output_file}")
    print(f"Shape: {results_df.shape}")

    print("\n" + "="*70)
    print("SUMMARY STATISTICS")
    print("="*70)
    print(f"\nNumber of speeches analyzed: {len(results_df)}")
    print(f"\nAverage sentences per speech: {results_df['total_sentences'].mean():.1f}")
    print(f"Average labor sentences: {results_df['sentences_on_labor'].mean():.1f}")
    print(f"Average inflation sentences: {results_df['sentences_on_inflation'].mean():.1f}")
    print(f"Average sentences on both: {results_df['sentences_on_both'].mean():.1f}")
    print(f"Average labor share of labor/inflation: {results_df['labor_share_of_labor_inflation_sentences'].mean():.2%}")

    labor_emphasis_cols = [col for col in results_df.columns if col.startswith('labor_emphasis_')]
    print("\n" + "-"*70)
    print("AVERAGE LABOR EMPHASIS VECTORS")
    print("-"*70)
    for col in sorted(labor_emphasis_cols):
        indicator_name = col.replace('labor_emphasis_', '')
        avg_emphasis = results_df[col].mean()
        print(f"{indicator_name:20s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")

    total_labor_emphasis = results_df[labor_emphasis_cols].mean().sum()
    print(f"\n{'Total':20s}: {total_labor_emphasis:.4f}")

    inflation_emphasis_cols = [col for col in results_df.columns if col.startswith('inflation_emphasis_')]
    print("\n" + "-"*70)
    print("AVERAGE INFLATION EMPHASIS VECTORS")
    print("-"*70)
    for col in sorted(inflation_emphasis_cols):
        indicator_name = col.replace('inflation_emphasis_', '')
        avg_emphasis = results_df[col].mean()
        print(f"{indicator_name:20s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")

    total_inflation_emphasis = results_df[inflation_emphasis_cols].mean().sum()
    print(f"\n{'Total':20s}: {total_inflation_emphasis:.4f}")

    print("\n" + "-"*70)
    print("OFFICIALS BY LABOR SHARE OF LABOR/INFLATION")
    print("-"*70)

    official_col = None
    for col in ['official_name', 'name', 'Name', 'speaker', 'Speaker']:
        if col in results_df.columns:
            official_col = col
            break

    if official_col:
        # Mean labor share per official, restricted to officials with >= 2 speeches
        official_labor_share = results_df.groupby(official_col)['labor_share_of_labor_inflation_sentences'].agg(['mean', 'count'])
        official_labor_share = official_labor_share[official_labor_share['count'] >= 2]
        official_labor_share = official_labor_share.sort_values('mean', ascending=False)

        if len(official_labor_share) >= 4:
            # With fewer than 8 qualifying officials the two groups overlap
            top_4 = official_labor_share.head(4)
            bottom_4 = official_labor_share.tail(4)

            print("\nTop 4 Officials (Highest Labor Share):")
            for idx, (official, row) in enumerate(top_4.iterrows(), 1):
                print(f"{idx}. {official:30s}: {row['mean']:.2%} (n={int(row['count'])})")

            print("\nBottom 4 Officials (Lowest Labor Share):")
            for idx, (official, row) in enumerate(bottom_4.iterrows(), 1):
                print(f"{idx}. {official:30s}: {row['mean']:.2%} (n={int(row['count'])})")

            print("\n" + "="*70)
            print("EMPHASIS VECTORS FOR TOP 4 OFFICIALS")
            print("="*70)
            _print_emphasis_vectors(top_4, results_df, official_col,
                                    labor_emphasis_cols, inflation_emphasis_cols)

            print("\n" + "="*70)
            print("EMPHASIS VECTORS FOR BOTTOM 4 OFFICIALS")
            print("="*70)
            _print_emphasis_vectors(bottom_4, results_df, official_col,
                                    labor_emphasis_cols, inflation_emphasis_cols)
        else:
            print(f"\nNot enough officials with multiple speeches (found {len(official_labor_share)})")
    else:
        print("\nCould not find official name column")

else:
    print("\nWarning: No results to save!")

# Create sentence-level dataframe and validation set
sentences_df = pd.DataFrame(all_sentences)

if len(sentences_df) > 0:
    print(f"\nTotal sentences extracted: {len(sentences_df)}")
    print("\nClassification distribution:")
    print(sentences_df['classification'].value_counts())

    # Target number of validation sentences per classification label
    n_labor = 15
    n_inflation = 15
    n_both = 5
    n_neither = 10
    sample_targets = [
        ('Labor', n_labor),
        ('Inflation', n_inflation),
        ('Both', n_both),
        ('Neither', n_neither),
    ]

    print(f"\nSampling sentences for validation...")
    validation_samples = []

    for label, target in sample_targets:
        candidates = sentences_df[sentences_df['classification'] == label]
        if len(candidates) >= target:
            # Seeded draw so the validation set is reproducible
            validation_samples.append(candidates.sample(n=target, random_state=seed))
        else:
            # Not enough sentences of this class: warn and take whatever exists
            print(f"Warning: Only {len(candidates)} {label.lower()} sentences available")
            if len(candidates) > 0:
                validation_samples.append(candidates)

    if validation_samples:
        validation_df = pd.concat(validation_samples, ignore_index=True)
        # Shuffle so the classes are interleaved in the file given to reviewers
        validation_df = validation_df.sample(frac=1, random_state=seed).reset_index(drop=True)

        validation_output_file = os.path.join(validation_output_dir, 'speeches_validate.csv')
        validation_df.to_csv(validation_output_file, index=False)

        print(f"\nValidation set created: {validation_output_file}")
        print(f"Total sentences in validation set: {len(validation_df)}")
        print(f"\nValidation set distribution:")
        print(validation_df['classification'].value_counts())

        print("\n" + "="*70)
        print("SAMPLE VALIDATION SENTENCES (10 examples)")
        print("="*70)

        sample_display = validation_df.head(10)
        for idx, row in sample_display.iterrows():
            print(f"\n[{idx+1}] Classification: {row['classification']}")
            if row['official_name']:
                print(f"    Official: {row['official_name']}")
            if row['labor_indicators']:
                print(f"    Labor Indicators: {row['labor_indicators']}")
            if row['inflation_indicators']:
                print(f"    Inflation Indicators: {row['inflation_indicators']}")
            # Long sentences are truncated to 200 characters for display only
            print(f"    Sentence: {row['sentence_text'][:200]}{'...' if len(row['sentence_text']) > 200 else ''}")
    else:
        print("\nNo validation samples available")

print("\n" + "="*70)
print("PROCESSING COMPLETE!")
print("="*70)

MessageError: Error: credential propagation was unsuccessful

In [None]:
# @title FRED SCRAPER. This one has lots of issues in formatting, just for archival purposes.
#!/usr/bin/env python3
"""
Federal Reserve Officials Speech Scraper
Collects speeches from Fed Governors and Regional Presidents
"""

import requests
import csv
import json
import time
import re
import os
from datetime import datetime
import uuid
from collections import defaultdict

# Configuration
# The FRASER API key is a credential and must not be stored in the notebook.
# Provide it through the FRASER_API_KEY environment variable before running
# this cell; a key that was previously committed here should be rotated.
API_KEY = os.environ.get("FRASER_API_KEY", "")
if not API_KEY:
    print("WARNING: FRASER_API_KEY is not set; requests that need the API key will fail.")
BASE_URL = "https://fraser.stlouisfed.org/api"
OUTPUT_DIR = '/content/drive/MyDrive/Speeches/FREDspeeches'

# Option to only scrape missing officials (set to True to only scrape missing CSVs)
ONLY_SCRAPE_MISSING = True

# Federal Reserve Officials - with FRASER title IDs where available
# Maps each official's full name to a dict with three fields:
#   "role"     - position(s) held, as free text
#   "years"    - term as a "start-end" string ("Present" for an open end)
#   "title_id" - integer FRASER title identifier for that official's speeches
# NOTE(review): presumably the name key is what gets passed to
# get_official_filename() to name the per-official CSV; confirm against caller.
# NOTE(review): the "years" strings are hand-maintained and "Present" values go
# stale; verify them before relying on them for filtering.
OFFICIALS = {
    # Board of Governors
    "Alan Greenspan": {"role": "Chair", "years": "1987-2006", "title_id": 452},
    "Susan Bies": {"role": "Governor", "years": "2001-2007", "title_id": 955},
    "Mark Olson": {"role": "Governor", "years": "2001-2006", "title_id": 941},
    "Edward Gramlich": {"role": "Governor", "years": "1997-2005", "title_id": 914},
    "Roger Ferguson": {"role": "Governor/Vice Chair", "years": "1997-2006", "title_id": 950},
    "Laurence Meyer": {"role": "Governor", "years": "1996-2002", "title_id": 936},
    "Donald Kohn": {"role": "Governor/Vice Chair", "years": "2002-2010", "title_id": 464},
    "Ben Bernanke": {"role": "Chair", "years": "2006-2014", "title_id": 453},
    "Janet Yellen": {"role": "Governor/Vice Chair/Chair", "years": "2004-2018", "title_id": 930},
    "Jerome Powell": {"role": "Governor/Chair", "years": "2012-Present", "title_id": 1164},
    "Randall Kroszner": {"role": "Governor", "years": "2006-2009", "title_id": 948},
    "Kevin Warsh": {"role": "Governor", "years": "2006-2011", "title_id": 935},
    "Frederic Mishkin": {"role": "Governor", "years": "2006-2008", "title_id": 919},
    "Elizabeth Duke": {"role": "Governor", "years": "2008-2013", "title_id": 916},
    "Daniel Tarullo": {"role": "Governor", "years": "2009-2017", "title_id": 910},
    "Sarah Bloom Raskin": {"role": "Governor", "years": "2010-2014", "title_id": 951},
    "Lael Brainard": {"role": "Governor/Vice Chair", "years": "2014-2023", "title_id": 3777},
    "Stanley Fischer": {"role": "Vice Chair", "years": "2014-2017", "title_id": 3778},
    "Jeremy Stein": {"role": "Governor", "years": "2012-2014", "title_id": 1163},
    "Michelle Bowman": {"role": "Governor", "years": "2018-Present", "title_id": 6098},
    "Richard Clarida": {"role": "Vice Chair", "years": "2018-2022", "title_id": 5997},
    "Randal Quarles": {"role": "Governor/Vice Chair for Supervision", "years": "2017-2021", "title_id": 5732},
    "Christopher Waller": {"role": "Governor", "years": "2020-Present", "title_id": 6421},
    "Michael Barr": {"role": "Governor/Vice Chair for Supervision", "years": "2022-Present", "title_id": 6862},
    "Lisa Cook": {"role": "Governor", "years": "2022-Present", "title_id": 6861},
    "Adriana Kugler": {"role": "Governor", "years": "2023-Present", "title_id": 9290},
    "Philip Jefferson": {"role": "Governor/Vice Chair", "years": "2022-Present", "title_id": 6860},

    # Regional Presidents
    # (all entries below are commented out, so they are not part of OFFICIALS)
    # "William McDonough": {"role": "New York President", "years": "1993-2003", "title_id": 6748},
    # "Edward G. Boehne": {"role": "Philadelphia President", "years": "1981-2000", "title_id": 6108},
    # "Jerry Jordan": {"role": "Cleveland President", "years": "1992-2003", "title_id": 3769},
    # "Alfred Broaddus": {"role": "Richmond President", "years": "1993-2004", "title_id": 9267},
    # "Michael Moskow": {"role": "Chicago President", "years": "1994-2007", "title_id": 5967},
    # "William Poole": {"role": "St. Louis President", "years": "1998-2008", "title_id": 485},
    # "Neel Kashkari": {"role": "Minneapolis President", "years": "2016-Present", "title_id": 9361},
    # "Robert McTeer": {"role": "Dallas President", "years": "1991-2004", "title_id": 6144},
    # "Cathy Minehan": {"role": "Boston President", "years": "1994-2007", "title_id": 9017},
    # "Anthony Santomero": {"role": "Philadelphia President", "years": "2000-2006", "title_id": 6109},
    # "Jack Guynn": {"role": "Atlanta President", "years": "1996-2007", "title_id": 5170},
    # "Robert Parry": {"role": "San Francisco President", "years": "1986-2004", "title_id": 1270},
    # "Eric Rosengren": {"role": "Boston President", "years": "2007-2021", "title_id": 9015},
    # "Susan Collins": {"role": "Boston President", "years": "2022-Present", "title_id": 9016},
    # "Timothy Geithner": {"role": "New York President", "years": "2003-2009", "title_id": 6750},
    # "William Dudley": {"role": "New York President", "years": "2009-2018", "title_id": 6749},
    # "John Williams": {"role": "New York President", "years": "2011-Present", "title_id": 9040},
    # "Charles Plosser": {"role": "Philadelphia President", "years": "2006-2015", "title_id": 6101},
    # "Patrick Harker": {"role": "Philadelphia President", "years": "2015-Present", "title_id": 6102},
    # "Sandra Pianalto": {"role": "Cleveland President", "years": "2003-2014", "title_id": 3770},
    # "Loretta Mester": {"role": "Cleveland President", "years": "2014-2024", "title_id": 9033},
    # "Jeffrey Lacker": {"role": "Richmond President", "years": "2004-2017", "title_id": 6827},
    # "Thomas Barkin": {"role": "Richmond President", "years": "2018-Present", "title_id": 9266},
    # "Dennis Lockhart": {"role": "Atlanta President", "years": "2007-2017", "title_id": 5579},
    # "Raphael Bostic": {"role": "Atlanta President", "years": "2017-Present", "title_id": 8996},
    # "Charles Evans": {"role": "Chicago President", "years": "2007-2023", "title_id": 8969},
    # "Austan Goolsbee": {"role": "Chicago President", "years": "2023-Present", "title_id": 8963},
    # "James Bullard": {"role": "St. Louis President", "years": "2008-2023", "title_id": 7161},
    # "Alberto Musalem": {"role": "St. Louis President", "years": "2024-Present", "title_id": 7080},
    # "Gary Stern": {"role": "Minneapolis President", "years": "1985-2009", "title_id": 1002},
    # "Narayana Kocherlakota": {"role": "Minneapolis President", "years": "2009-2015", "title_id": 9360},
    # "Thomas Hoenig": {"role": "Kansas City President", "years": "1991-2011", "title_id": 6995},
    # "Esther George": {"role": "Kansas City President", "years": "2011-2023", "title_id": 9278},
    # "Richard Fisher": {"role": "Dallas President", "years": "2005-2015", "title_id": 6147},
    # "Robert Kaplan": {"role": "Dallas President", "years": "2015-2021", "title_id": 6146},
    # "Mary Daly": {"role": "San Francisco President", "years": "2018-Present", "title_id": 9034},
}

def create_output_directory():
    """Make sure the directory named by ``OUTPUT_DIR`` exists.

    Nothing happens (and nothing is printed) when the path is already
    present; otherwise the directory tree is created and a confirmation
    line is printed.
    """
    if os.path.exists(OUTPUT_DIR):
        return

    os.makedirs(OUTPUT_DIR)
    print(f"Created output directory: {OUTPUT_DIR}")

def get_official_filename(official_name):
    """Build the CSV file name used for one official.

    Characters other than word characters, whitespace and hyphens are
    dropped, surrounding whitespace is trimmed, and every run of hyphens
    and/or whitespace becomes a single underscore.

    Args:
        official_name: Display name of the official, e.g. "Alan Greenspan".

    Returns:
        The file name (no directory), e.g. "Alan_Greenspan_speeches.csv".
    """
    stripped = re.sub(r'[^\w\s-]', '', official_name).strip()
    # Splitting on separator runs and re-joining with "_" is the same as
    # substituting each run with a single underscore.
    stem = '_'.join(re.split(r'[-\s]+', stripped))
    return stem + "_speeches.csv"

def csv_exists_for_official(official_name):
    """Report whether this official's CSV is already present in ``OUTPUT_DIR``.

    Args:
        official_name: Display name of the official.

    Returns:
        True when the path built from ``OUTPUT_DIR`` and
        ``get_official_filename(official_name)`` exists, else False.
    """
    candidate_path = os.path.join(OUTPUT_DIR, get_official_filename(official_name))
    return os.path.exists(candidate_path)

def parse_date_year(date_str):
    """Pull a four-digit year (19xx or 20xx) out of a date value.

    Args:
        date_str: Any value; it is converted with ``str()`` before matching.
            Falsy values and the literal 'Unknown' are treated as "no date".

    Returns:
        The first standalone 19xx/20xx year as an int, or None when the
        value is missing or contains no such year. The year must be
        delimited by word boundaries, so "20080314" yields None.
    """
    if date_str and date_str != 'Unknown':
        found = re.search(r'\b(19|20)\d{2}\b', str(date_str))
        return int(found.group()) if found else None
    return None

def generate_speech_id(official_name, date_str, speech_counts):
    """Build a speech identifier of the form ``<Initial><LastName>_<year>_<n>``.

    Args:
        official_name: Display name of the speaker. With two or more
            whitespace-separated parts the prefix is the upper-cased first
            initial plus the last part; otherwise it is the name with
            spaces removed.
        date_str: Date value handed to ``parse_date_year``. When no year
            can be parsed the placeholder string "0000" is used.
        speech_counts: Mutable per-year counter; it is incremented in place
            with ``+=`` (the caller in this file passes a
            ``defaultdict(int)``). Keys are int years, or the string
            "0000" for undated speeches.

    Returns:
        The identifier string, e.g. "BBernanke_2008_3".
    """
    pieces = official_name.split()
    if len(pieces) < 2:
        # Fallback for single-word (or empty) names
        prefix = official_name.replace(' ', '')
    else:
        prefix = f"{pieces[0][0].upper()}{pieces[-1]}"

    # A missing/unparseable year is bucketed under the "0000" key
    year_key = parse_date_year(date_str) or "0000"

    speech_counts[year_key] += 1
    return f"{prefix}_{year_key}_{speech_counts[year_key]}"

def is_speech_after_2000(date_str):
    """Tell whether a speech date falls in the year 2000 or later.

    Args:
        date_str: Date value handed to ``parse_date_year``.

    Returns:
        True when the parsed year is >= 2000. Also True when no year could
        be parsed, so speeches with unknown dates are kept.
    """
    parsed_year = parse_date_year(date_str)
    return True if parsed_year is None else parsed_year >= 2000

def make_request(endpoint):
    """Issue a GET against ``BASE_URL`` and return the decoded JSON body.

    The request carries the ``X-API-Key`` header built from ``API_KEY`` and
    uses a 30-second timeout.

    Args:
        endpoint: Path (and query string) appended to ``BASE_URL`` after a
            slash.

    Returns:
        The parsed JSON on HTTP 200. None on any other status code or when
        ``requests`` raises a ``RequestException``; a diagnostic line is
        printed in both failure cases.
    """
    target = f"{BASE_URL}/{endpoint}"

    try:
        reply = requests.get(target, headers={"X-API-Key": API_KEY}, timeout=30)
        if reply.status_code != 200:
            print(f"    Error {reply.status_code} for {endpoint}")
            return None
        return reply.json()
    except requests.exceptions.RequestException as exc:
        print(f"    Request failed: {exc}")
        return None

def fix_mojibake(text):
    """Fix common UTF-8 mojibake patterns comprehensively.

    Works in three stages:

    1. If any of a few marker substrings is present, try
       ``text.encode('latin-1').decode('utf-8')`` and keep the result when
       both steps succeed; on an encode/decode error the text is left as is.
    2. Apply a table of literal substring replacements, in the table's
       insertion order, via ``str.replace``.
    3. Strip leftover isolated marker characters with a regex.

    Args:
        text: The string to repair. Falsy values (None, "") are returned
            unchanged.

    Returns:
        The repaired string.
    """
    if not text:
        return text

    # Try the re-encoding approach first for systematic fixes
    if any(indicator in text for indicator in ['√¢‚Ç¨', '√É', '\xc3\xa2', '\xc3\xa2\xc2\x80']):
        try:
            # This often fixes multiple issues at once
            fixed = text.encode('latin-1').decode('utf-8')
            text = fixed
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass  # If re-encoding fails, continue with character replacement

    # Comprehensive mojibake fixes - focusing on the most common patterns
    # NOTE(review): entries are applied in insertion order (see the loop
    # below), so a short key listed before a longer key that starts with it
    # is replaced first and the longer key can then no longer match.
    mojibake_fixes = {
        # Your specific examples
        'you√¢‚Ç¨‚Ñ¢ll': "you'll",
        'Fed√¢‚Ç¨‚Ñ¢s': "Fed's",
        'it√¢‚Ç¨‚Ñ¢s': "it's",
        'insidious√¢‚Ç¨"a': "insidious‚Äîa",

        # Quote marks and apostrophes (most common issues)
        '√¢‚Ç¨≈ì': '"',     # left double quote
        # NOTE(review): as this file is currently encoded, the key on the
        # next line appears to be spelled identically to the keys commented
        # "hyphen", "dagger" and "em quad" further down. A dict literal keeps
        # one entry per distinct key (first position, last value), so if the
        # keys really are identical only the last mapping takes effect.
        # Confirm against the original bytes.
        '√¢‚Ç¨': '"',      # right double quote
        '√¢‚Ç¨‚Ñ¢': "'",     # right single quote/apostrophe
        '√¢‚Ç¨Àú': "'",     # left single quote
        '√¢‚Ç¨≈°': "'",     # single low quote
        '√¢‚Ç¨≈æ': '"',     # double low quote
        '√¢‚Ç¨¬∫': "'",     # single right angle quote
        '√¢‚Ç¨¬π': "'",     # single left angle quote

        # Dashes and hyphens
        # NOTE(review): the "em dash" and "en dash" keys below also look
        # identical as rendered here; if so, the second value overrides the
        # first. Confirm against the original bytes.
        '√¢‚Ç¨"': '‚Äî',     # em dash
        '√¢‚Ç¨"': '‚Äì',     # en dash
        '√¢‚Ç¨': '-',      # hyphen

        # Ellipsis and bullets
        '√¢‚Ç¨¬¶': '...',   # ellipsis
        '√¢‚Ç¨¬¢': '‚Ä¢',     # bullet

        # Special symbols
        '√¢‚Ç¨¬∞': '‚Ä∞',     # per mille
        '√¢‚Ç¨': '‚Ä†',      # dagger
        '√¢‚Ç¨¬°': '‚Ä°',     # double dagger

        # Spaces and formatting (causes "weird A" issues)
        '√Ç ': ' ',      # non-breaking space
        '√Ç': '',        # standalone non-breaking space marker
        '√¢‚Ç¨‚Äπ': '',       # zero-width space
        '√¢‚Ç¨≈†': ' ',      # thin space
        '√¢‚Ç¨‚Ä∞': ' ',      # thin space
        '√¢‚Ç¨‚Ç¨': ' ',      # en quad
        '√¢‚Ç¨': ' ',       # em quad

        # Common accented characters
        '√É¬°': '√°', '√É¬©': '√©', '√É¬≠': '√≠', '√É¬≥': '√≥', '√É¬∫': '√∫',
        '√É ': '√†', '√É¬®': '√®', '√É¬¨': '√¨', '√É¬≤': '√≤', '√É¬π': '√π',
        '√É¬¢': '√¢', '√É¬™': '√™', '√É¬Æ': '√Æ', '√É¬¥': '√¥', '√É¬ª': '√ª',
        '√É¬§': '√§', '√É¬´': '√´', '√É¬Ø': '√Ø', '√É¬∂': '√∂', '√É¬º': '√º',
        '√É¬•': '√•', '√É¬ß': '√ß', '√É¬±': '√±', '√É¬∏': '√∏', '√É¬ø': '√ø',
        '√É≈∏': '√ü', '√É‚Ä†': '√Ü', '√ÉÀú': '√ò', '√É‚Ä¶': '√Ö',

        # Additional byte sequence patterns
        '\xc3\xa2\xc2\x80\xc2\x99': "'",    # apostrophe
        '\xc3\xa2\xc2\x80\xc2\x9c': '"',    # left double quote
        '\xc3\xa2\xc2\x80\xc2\x9d': '"',    # right double quote
        '\xc3\xa2\xc2\x80\xc2\x94': '‚Äî',    # em dash
        '\xc3\xa2\xc2\x80\xc2\x93': '‚Äì',    # en dash
        '\xc3\xa2\xc2\x80\xc2\xa6': '...',  # ellipsis
    }

    # Apply all fixes (plain substring replacement, one table entry at a time)
    for bad, good in mojibake_fixes.items():
        text = text.replace(bad, good)

    # Remove any remaining isolated '√Ç' characters
    text = re.sub(r'(?<![√Ä-√ø])√Ç(?![√Ä-√ø])', '', text)

    return text

def remove_footnotes_and_urls(text):
    """Aggressively remove footnotes, citations, URLs, and related content.

    The function is a sequence of order-dependent regex passes followed by a
    line-level filter and a final whitespace collapse, so the result is a
    single line of space-separated text.

    Args:
        text: Text to clean. Falsy values are returned unchanged.

    Returns:
        The cleaned text with all whitespace runs collapsed to one space.
    """
    if not text:
        return text

    # Remove URLs first (they can interfere with other patterns)
    text = re.sub(r'https?://[^\s]+', ' ', text)
    text = re.sub(r'www\.[^\s]+', ' ', text)
    text = re.sub(r'ftp://[^\s]+', ' ', text)

    # Remove email addresses
    # (the TLD class is letters only; a "|" inside [...] is a literal pipe)
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ' ', text)

    # Remove footnote numbers and references
    # Pattern 1: Numbers in brackets [1], [2], etc.
    text = re.sub(r'\[\d+\]', ' ', text)

    # Pattern 2: Numbers in parentheses (1), (2), etc.
    text = re.sub(r'\(\d+\)', ' ', text)

    # Pattern 3: Superscript-style numbers glued to the end of a word.
    # The lookbehind must be a *letter*: with a plain \w it also matched a
    # digit, so ordinary numbers such as "2008" were truncated to "2".
    text = re.sub(r'(?<=[^\W\d_])\d+(?=\s|$|[.!?])', ' ', text)

    # Pattern 4: Standalone footnote numbers with periods
    text = re.sub(r'(?:^|\s)\d+\.(?=\s|$|[^\w])', ' ', text)

    # Pattern 5: Multiple footnote markers (e.g., "1,2,3")
    text = re.sub(r'(?:^|\s)\d+,\s*\d+(?:,\s*\d+)*(?=\s|$|[.!?])', ' ', text)

    # Pattern 6: Footnote markers (asterisks, daggers, etc.)
    text = re.sub(r'[\*\‚Ä†\‚Ä°\¬ß\¬∂\#]{1,3}', ' ', text)

    # Pattern 7: Numbered or lettered notes in text (e.g., "1.", "a.")
    text = re.sub(r'(?:^|\s)[0-9a-zA-Z]\.\s', ' ', text)

    # Pattern 8: Inline references like "(See Smith, 2005)"
    text = re.sub(r'\(\s*See\s+[^)]+\)', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'\(\s*[^)]+,\s*\d{4}\s*\)', ' ', text)

    # Pattern 9: Any remaining numbered list items at start of lines
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Remove footnote sections (common headers and their content)
    footnote_patterns = [
        r'\n\s*Footnotes?\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*Notes?\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*References?\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*Sources?\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*Endnotes?\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*Bibliography\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*Works\s+Cited\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*Appendix\s*.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*Acknowledgements?\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
        r'\n\s*See\s+Also\s*:.*?(?=\n\s*(?:[A-Z]|\Z))',
    ]

    for pattern in footnote_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

    # Remove lines that likely contain footnote content
    lines = text.split('\n')
    cleaned_lines = []

    for line in lines:
        line = line.strip()
        # Skip lines that look like footnote content
        if re.match(r'^\d+[\.\s]', line):  # Starts with number and period/space
            continue
        if re.match(r'^[\*\‚Ä†\‚Ä°\¬ß]+', line):  # Starts with footnote symbols
            continue
        if re.match(r'^[a-zA-Z]\.\s', line):  # Starts with letter and period
            continue
        if re.match(r'^\d+\s+[A-Z]', line):  # Starts with number and capital letter
            continue
        if len(line) > 0:
            cleaned_lines.append(line)

    text = '\n'.join(cleaned_lines)

    # Final cleanup of any residual footnote-like content
    text = re.sub(r'(?:^|\s)\[\s*\]', ' ', text)  # Empty brackets
    text = re.sub(r'(?:^|\s)\(\s*\)', ' ', text)  # Empty parentheses
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

    return text

def clean_speech_text(raw_text):
    """Enhanced text cleaning with mojibake fixes and footnote removal.

    Pipeline: ``fix_mojibake`` -> strip HTML tags ->
    ``remove_footnotes_and_urls`` -> punctuation/quote spacing -> drop
    sentence fragments of 15 characters or fewer -> re-join.

    Args:
        raw_text: Any value; converted with ``str()``. Falsy input yields the
            sentinel "No text available".

    Returns:
        The cleaned text as a single line. A terminating period is appended
        when the result is non-empty and does not already end in '.', '!'
        or '?'.
    """
    if not raw_text:
        return "No text available"

    text = str(raw_text)

    # Step 1: Fix mojibake issues first
    text = fix_mojibake(text)

    # Step 2: Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Step 3: Remove footnotes and URLs
    text = remove_footnotes_and_urls(text)

    # Step 4: Clean up formatting
    # Fix spacing around punctuation
    text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)  # Add space after sentence endings
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)      # Remove space before punctuation
    text = re.sub(r'([,.!?;:])([A-Za-z])', r'\1 \2', text)  # Add space after punctuation

    # Handle quotes properly
    text = re.sub(r'"\s*([^"]*?)\s*"', r' "\1" ', text)
    # Only treat ' as a quote mark when it is not attached to a word on the
    # outside; otherwise apostrophes in contractions/possessives
    # ("Fed's ... isn't") were paired up and split from their words.
    text = re.sub(r"(?<!\w)'\s*([^']*?)\s*'(?!\w)", r" '\1' ", text)

    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)

    # Clean up multiple periods
    text = re.sub(r'\.{2,}', '.', text)

    # Remove leading/trailing punctuation fragments
    text = re.sub(r'^[.,;:\-‚Äì‚Äî"\']+\s*', '', text)
    text = re.sub(r'\s*[.,;:\-‚Äì‚Äî"\']+$', '', text)

    # Step 5: Filter out very short sentences and obvious fragments
    sentences = re.split(r'(?<=[.!?])\s+', text)
    cleaned_sentences = []

    for sentence in sentences:
        sentence = sentence.strip()
        # Keep sentences that are substantial and look like real content
        if (len(sentence) > 15 and
            not re.match(r'^\d+$', sentence) and  # Not just a number
            not re.match(r'^[^\w]*$', sentence)):  # Not just punctuation
            cleaned_sentences.append(sentence)

    # The split above uses a lookbehind, so every sentence still carries its
    # own terminator; joining with '. ' would yield "..", "?." and "!.".
    text = ' '.join(cleaned_sentences)

    # Final cleanup
    text = re.sub(r'\s+', ' ', text).strip()

    # Ensure proper sentence endings
    if text and not text.endswith(('.', '!', '?')):
        text += '.'

    return text

def get_text_from_url(text_url):
    """Download ``text_url`` and return its content after ``clean_speech_text``.

    The raw bytes are decoded with each candidate encoding in turn; the
    first decoding whose cleaned text is longer than 100 characters (and is
    not the "No text available" sentinel) is returned. If none qualifies,
    the bytes are decoded as UTF-8 with replacement characters and cleaned.

    Args:
        text_url: URL to fetch (30-second timeout).

    Returns:
        The cleaned text, or None on a non-200 response or any exception
        (a diagnostic line is printed for exceptions).
    """
    candidate_encodings = ('utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'windows-1252')

    try:
        reply = requests.get(text_url, timeout=30)
        if reply.status_code != 200:
            return None

        payload = reply.content

        for codec in candidate_encodings:
            try:
                candidate = clean_speech_text(payload.decode(codec))
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue

            # Accept the first decoding that yields substantial text
            if candidate and len(candidate) > 100 and candidate != "No text available":
                return candidate

        # Final fallback
        try:
            return clean_speech_text(payload.decode('utf-8', errors='replace'))
        except Exception as e:
            print(f"Warning: Final text decoding failed for {text_url}: {e}")
            return None
    except Exception as e:
        print(f"Request failed for {text_url}: {e}")
        return None

def is_fed_speech(item, official_name):
    """Check if item is actually a Fed official's speech.

    Decision order:
      1. True if the item's first text URL contains one of the Fed-specific
         path fragments.
      2. False if the lower-cased title contains an excluded term.
      3. True if the title contains any part of the official's name that is
         longer than three characters.
      4. Otherwise False.

    Args:
        item: Record dict; the keys read are ``location.textUrl`` (string or
            list of strings) and ``titleInfo`` (dict or list of dicts with a
            ``title`` entry).
        official_name: Display name of the official.

    Returns:
        bool. Missing, empty or None URL/title values are treated as absent
        rather than raising.
    """
    # Check textUrl patterns for Fed speeches
    if 'location' in item and 'textUrl' in item['location']:
        text_urls = item['location']['textUrl']
        if isinstance(text_urls, list):
            # An empty list means "no URL"
            text_url = text_urls[0] if text_urls else None
        else:
            text_url = text_urls

        # Look for Fed-specific directories
        fed_patterns = [
            '/presidents/', '/governors/', '/frbrich/', '/frbatl/', '/frbchi/',
            '/frbcle/', '/frbdal/', '/frbkc/', '/frbmin/', '/frbny/',
            '/frbphi/', '/frbsf/', '/frb_stl/', '/bog/'
        ]

        if isinstance(text_url, str) and any(pattern in text_url for pattern in fed_patterns):
            return True

    # Check title for Fed-related content
    title = ""
    if 'titleInfo' in item:
        title_info = item['titleInfo']
        if isinstance(title_info, list) and title_info:
            # "or ''" guards against an explicit None title
            title = (title_info[0].get('title', '') or '').lower()
        elif isinstance(title_info, dict):
            title = (title_info.get('title', '') or '').lower()

    # Exclude clearly non-speech items
    exclude_terms = [
        'beige book', 'monthly labor review', 'consumer price', 'cfc_', 'mlr_',
        'index of volumes', 'technological trends', 'reports from the consuls',
        'staff report', 'expenditure policy', 'women in the economy'
    ]

    if any(term in title for term in exclude_terms):
        return False

    # Include if it mentions the official's name
    # (parts of three characters or fewer are ignored as too ambiguous)
    name_parts = official_name.lower().split()
    return any(part in title for part in name_parts if len(part) > 3)

def get_speeches_from_title(title_id):
    """Fetch the item records of one FRASER title collection.

    Issues a single ``make_request`` call for ``title/<id>/items`` with
    ``limit=1000``.

    Args:
        title_id: Identifier of the title collection.

    Returns:
        The list stored under the response's 'records' key, or an empty
        list when the request failed or the key is absent.
    """
    print(f"  Getting speeches from title ID {title_id}...")

    # High limit to get all items
    payload = make_request(f"title/{title_id}/items?limit=1000")

    if not payload or 'records' not in payload:
        print(f"    No items found in title {title_id}")
        return []

    records = payload['records']
    print(f"    Found {len(records)} items in title collection")
    return records

def search_for_official_speeches(official_name, official_info):
    """Search for speeches by a specific official.

    Strategy:
      1. If ``official_info`` carries a truthy 'title_id', return the
         records of that title collection when any are found.
      2. Otherwise run keyword searches (full name, last name, first+last),
         keep the records accepted by ``is_fed_speech`` and de-duplicate
         them by record identifier.

    Args:
        official_name: Display name of the official.
        official_info: Dict describing the official; only 'title_id' is read
            here.

    Returns:
        A list of record dicts (possibly empty).
    """
    print(f" Searching for {official_name} speeches...")

    # If we have a title ID, use that first (more reliable)
    title_id = official_info.get('title_id')
    if title_id:
        title_speeches = get_speeches_from_title(title_id)
        if title_speeches:
            print(f"  Found {len(title_speeches)} speeches from title collection")
            return title_speeches

    # Fall back to search if no title ID or no speeches found
    print(f"  Falling back to search approach...")

    # Try different search variations
    name_parts = official_name.split()
    candidate_queries = [official_name.replace(' ', '+')]
    if name_parts:
        candidate_queries.append(name_parts[-1])  # Last name only
    if len(name_parts) >= 2:
        candidate_queries.append(f"{name_parts[0]}+{name_parts[-1]}")

    # For two-word names the full-name and first+last queries are the same
    # string; dict.fromkeys drops repeats while preserving order so the same
    # request is not issued twice. Empty queries are dropped as well.
    search_queries = [query for query in dict.fromkeys(candidate_queries) if query]

    all_speeches = []

    for query in search_queries:
        print(f"  Trying query: {query}")
        data = make_request(f"search/?q={query}&limit=100")

        if data and 'records' in data:
            records = data['records']
            print(f"    Found {len(records)} records")

            # Filter for actual Fed speeches
            speeches = [record for record in records if is_fed_speech(record, official_name)]

            print(f"    Filtered to {len(speeches)} Fed speeches")
            all_speeches.extend(speeches)

        # Rate limiting applies after every request, successful or not
        time.sleep(0.5)

    # Remove duplicates based on record ID
    unique_speeches = []
    seen_ids = set()

    for speech in all_speeches:
        record_id = None
        if 'recordInfo' in speech and 'recordIdentifier' in speech['recordInfo']:
            record_id = speech['recordInfo']['recordIdentifier']
            if isinstance(record_id, list):
                record_id = record_id[0]

        if record_id and record_id not in seen_ids:
            seen_ids.add(record_id)
            unique_speeches.append(speech)
        elif not record_id:
            # Records without an identifier cannot be compared; keep them all
            unique_speeches.append(speech)

    print(f"  Final count: {len(unique_speeches)} unique speeches")
    return unique_speeches

def _first_url(location, key):
    """Return the first URL stored under ``key`` in ``location``, or None."""
    if key not in location:
        return None
    value = location[key]
    if isinstance(value, list):
        return value[0] if value else None
    return value


def _speech_record(speech_id, date, official_name, official_info, title, text, url):
    """Assemble the flat dict that represents one speech row."""
    return {
        'id': speech_id,
        'date': date,
        'speaker': official_name,
        'role': official_info['role'],
        'years_served': official_info['years'],
        'title': title,
        'text': text,
        'url': url
    }


def extract_speech_data(item, official_name, official_info, speech_counts):
    """Extract speech data from an item - using FRASER's pre-extracted text.

    Text is obtained by trying, in order: the item's text URL fetched
    directly, the item's PDF URL parsed with PyPDF2, and finally the text
    URL again via ``get_text_from_url``. The first strategy that yields more
    than 100 characters wins.

    Args:
        item: Record dict; the keys read are 'titleInfo', 'originInfo'
            ('dateIssued' / 'sortDate') and 'location' ('textUrl',
            'pdfUrl').
        official_name: Display name of the speaker.
        official_info: Dict with at least 'role' and 'years'.
        speech_counts: Per-year counter passed through to
            ``generate_speech_id`` (mutated in place).

    Returns:
        Dict with keys id, date, speaker, role, years_served, title, text,
        url. When no strategy produced enough text, 'text' is the cleaned
        "No text available" sentinel.
    """
    title = "Untitled"
    date = "Unknown"
    text_content = ""
    source_url = ""

    # Extract title
    if 'titleInfo' in item:
        title_info = item['titleInfo']
        if isinstance(title_info, list) and title_info:
            title = title_info[0].get('title', 'Untitled')
        elif isinstance(title_info, dict):
            title = title_info.get('title', 'Untitled')

    # Extract date
    if 'originInfo' in item:
        origin = item['originInfo']
        if 'dateIssued' in origin:
            date_issued = origin['dateIssued']
            if isinstance(date_issued, list):
                if date_issued and isinstance(date_issued[0], dict):
                    date = date_issued[0].get('date', str(date_issued[0]))
                else:
                    date = str(date_issued[0]) if date_issued else 'Unknown'
            elif isinstance(date_issued, dict):
                date = date_issued.get('date', str(date_issued))
            else:
                date = str(date_issued)
        elif 'sortDate' in origin:
            date = origin['sortDate']

    # Generate speech ID
    speech_id = generate_speech_id(official_name, date, speech_counts)

    def _record(text, url):
        return _speech_record(speech_id, date, official_name, official_info, title, text, url)

    # Get text content - TRY FRASER'S PRE-EXTRACTED TEXT FIRST
    if 'location' in item:
        location = item['location']

        # Strategy 1: Try FRASER's pre-extracted text (.txt files)
        text_url = _first_url(location, 'textUrl')
        if text_url:
            source_url = text_url
            try:
                response = requests.get(text_url, timeout=30)
                if response.status_code == 200:
                    # FRASER's .txt files are relatively clean, but apply fix_mojibake just in case
                    text_content = fix_mojibake(response.text).strip()

                    if text_content and len(text_content) > 100:
                        # Collapse all whitespace (including line and
                        # paragraph breaks) so the text is a single CSV-safe line.
                        text_content = re.sub(r'\s+', ' ', text_content).strip()
                        return _record(text_content, source_url)
            except Exception as e:
                print(f"Warning: Failed to get/process .txt from {text_url}: {e}")

        # Strategy 2: Try PDF if text extraction failed
        # (a plain-string pdfUrl used to hit an unbound-name error here)
        pdf_url = _first_url(location, 'pdfUrl')
        if pdf_url:
            source_url = pdf_url
            try:
                import PyPDF2
                import io

                response = requests.get(pdf_url, timeout=30)
                if response.status_code == 200:
                    pdf_file = io.BytesIO(response.content)
                    pdf_reader = PyPDF2.PdfReader(pdf_file)

                    pdf_text = ""
                    for page_num in range(len(pdf_reader.pages)):
                        try:
                            page = pdf_reader.pages[page_num]
                            pdf_text += (page.extract_text() or "") + " "
                        except Exception as page_e:
                            print(f"Warning: Error extracting text from page {page_num} of {pdf_url}: {page_e}")
                            continue  # Try next page

                    if len(pdf_text.strip()) > 100:
                        # Clean the extracted PDF text using the enhanced clean_speech_text
                        text_content = clean_speech_text(pdf_text)
                        return _record(text_content, source_url)

            except ImportError:
                print("Warning: PyPDF2 not installed. Cannot extract from PDF.")
            except Exception as e:
                print(f"Warning: Failed to get/process PDF from {pdf_url}: {e}")

        # Strategy 3: Fallback to HTML method (using the enhanced cleaner)
        if (not text_content or len(text_content) < 100) and text_url:
            source_url = text_url
            print(f"  Attempting HTML fallback for {text_url}...")
            # get_text_from_url already returns text passed through
            # clean_speech_text, so it is not cleaned a second time here.
            fallback_text = get_text_from_url(text_url)
            if fallback_text:
                text_content = fallback_text
                if len(text_content) > 100:
                    return _record(text_content, source_url)
                print(f"Warning: HTML fallback text too short or empty after cleaning for {text_url}")
            else:
                print(f"Warning: HTML fallback failed to get content from {text_url}")

    # Final fallback if no text was successfully extracted and cleaned
    if not text_content or len(text_content) < 100:
        text_content = "No text available"
        # If we reach here and have an item with a title, log it
        if 'titleInfo' in item:
            print(f"Warning: Could not extract sufficient text for speech: {title} ({date})")

    # Ensure text_content is cleaned before returning
    final_cleaned_text = clean_speech_text(text_content)

    return _record(final_cleaned_text, source_url)

def save_official_speeches(official_name, speeches):
    """Write one official's speeches to their own CSV inside ``OUTPUT_DIR``.

    The list is sorted in place, newest first. Dates are parsed as
    '%Y-%m-%d %H:%M:%S' or '%Y-%m-%d'; anything else sorts as
    ``datetime.min`` and therefore ends up last in the descending order.
    Every field is quoted and line breaks inside 'text' are replaced by
    spaces.

    Args:
        official_name: Display name of the official (determines the file
            name via ``get_official_filename``).
        speeches: List of speech dicts with the keys id, date, speaker,
            role, years_served, title, text, url. An empty list writes
            nothing.
    """
    filename = get_official_filename(official_name)
    filepath = os.path.join(OUTPUT_DIR, filename)

    if not speeches:
        print(f"  No speeches to save for {official_name}")
        return

    def sort_key(entry):
        stamp = str(entry.get('date', 'Unknown'))
        for layout in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(stamp, layout)
            except ValueError:
                continue
        # Unparseable dates get the smallest possible key
        return datetime.min

    speeches.sort(key=sort_key, reverse=True)
    print(f"   Saving {len(speeches)} speeches to {filename}...")

    columns = ['id', 'date', 'speaker', 'role', 'years_served', 'title', 'text', 'url']

    # UTF-8 output with every field quoted
    with open(filepath, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, quoting=csv.QUOTE_ALL)
        writer.writeheader()

        for speech in speeches:
            row = speech.copy()
            # Flatten line breaks so each speech stays on one physical CSV line
            if row.get('text'):
                row['text'] = row['text'].replace('\n', ' ').replace('\r', ' ')
            writer.writerow(row)

    print(f"   Successfully saved {len(speeches)} speeches to {filename}")

def main():
    """Scrape speeches for every entry in ``OFFICIALS`` and save one CSV each.

    For each official: optionally skip when a CSV already exists
    (``ONLY_SCRAPE_MISSING``), look up candidate records, extract the text
    of each, keep speeches with at least 300 characters of text dated 2000
    or later (unknown dates are kept), and write them with
    ``save_official_speeches``. A summary is printed at the end.
    """
    print("Federal Reserve Officials Speech Scraper")
    print("=" * 60)

    # Create output directory
    create_output_directory()

    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Only scraping missing officials: {ONLY_SCRAPE_MISSING}")
    print(f"Searching for speeches from {len(OFFICIALS)} Fed officials...")

    total_speeches = 0
    official_counts = {}

    for official_name, official_info in OFFICIALS.items():
        print(f"\n{'='*50}")
        print(f"Processing: {official_name} ({official_info['role']}, {official_info['years']})")

        # Check if we should skip this official
        if ONLY_SCRAPE_MISSING and csv_exists_for_official(official_name):
            print(f"  ‚è≠  Skipping {official_name} - CSV already exists")
            continue

        # Search for this official's speeches
        speech_items = search_for_official_speeches(official_name, official_info)

        if not speech_items:
            print(f"  No speeches found for {official_name}")
            official_counts[official_name] = 0
            continue

        # Initialize speech counts per year for this official
        speech_counts = defaultdict(int)

        # Process each speech
        official_speeches = []
        for i, item in enumerate(speech_items, 1):
            print(f"    Processing speech {i}/{len(speech_items)}...")

            speech = extract_speech_data(item, official_name, official_info, speech_counts)

            # Only add speeches with substantial content AND from 2000 or later
            text_length = len(speech['text'])

            if speech['text'] == "No text available" or text_length < 300:
                # Text is not substantial after processing
                print(f"      Skipped: Text too short or unavailable after cleaning ({text_length} chars) - '{speech['title'][:50]}...'")
            elif is_speech_after_2000(speech['date']):
                official_speeches.append(speech)
            else:
                # Show why speech was filtered out based on date
                print(f"      Skipped: Date {speech['date']} before 2000 - '{speech['title'][:50]}...'")

            # Be nice to API: pause after every item, including skipped ones,
            # because the download already happened in extract_speech_data.
            time.sleep(0.1)

        print(f"   Collected {len(official_speeches)} speeches for {official_name}")

        # Save this official's speeches to separate CSV
        save_official_speeches(official_name, official_speeches)

        total_speeches += len(official_speeches)
        official_counts[official_name] = len(official_speeches)

        # Longer pause between officials
        time.sleep(1)

    # Summary by official
    print(f"\nüìà COLLECTION SUMMARY:")
    print(f"Total speeches collected: {total_speeches:,}")
    print(f"Officials with speeches found: {sum(1 for count in official_counts.values() if count > 0)}/{len(OFFICIALS)}")

    print(f"\nTop 10 Officials by Speech Count:")
    sorted_officials = sorted(official_counts.items(), key=lambda x: x[1], reverse=True)
    for i, (name, count) in enumerate(sorted_officials[:10], 1):
        if count > 0:
            print(f"{i:2d}. {name}: {count} speeches")

    print(f"\n All results saved to individual CSV files in: {OUTPUT_DIR}")


# The call must live at module level: placed inside the function body it is
# never reached when the cell runs and would recurse forever if main() were
# invoked by hand.
if __name__ == "__main__":
    main()

Federal Reserve Officials Speech Scraper
Output directory: /content/drive/MyDrive/Speeches
Only scraping missing officials: True
Searching for speeches from 27 Fed officials...

Processing: Alan Greenspan (Chair, 1987-2006)
  ‚è≠  Skipping Alan Greenspan - CSV already exists

Processing: Susan Bies (Governor, 2001-2007)
  ‚è≠  Skipping Susan Bies - CSV already exists

Processing: Mark Olson (Governor, 2001-2006)
  ‚è≠  Skipping Mark Olson - CSV already exists

Processing: Edward Gramlich (Governor, 1997-2005)
  ‚è≠  Skipping Edward Gramlich - CSV already exists

Processing: Roger Ferguson (Governor/Vice Chair, 1997-2006)
  ‚è≠  Skipping Roger Ferguson - CSV already exists

Processing: Laurence Meyer (Governor, 1996-2002)
  ‚è≠  Skipping Laurence Meyer - CSV already exists

Processing: Donald Kohn (Governor/Vice Chair, 2002-2010)
  ‚è≠  Skipping Donald Kohn - CSV already exists

Processing: Ben Bernanke (Chair, 2006-2014)
  ‚è≠  Skipping Ben Bernanke - CSV already exists

Processing: Ja

In [None]:
# @title  CREATE LABOR MARKET INDICATOR EMPHASIS VECTORS
import os
import pandas as pd
import re
import numpy as np
import glob
from collections import Counter

# Define the labor market indicators and their related keywords.
# Keys are indicator names; values are the phrases whose occurrences are
# counted (case-insensitively, on word boundaries) toward that indicator.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python into a single (useless) keyword.
INDICATORS = {
    "Employment": ["employed", "employment", "employment-to-population", "employment to population", "employment/population", "e/p ratio", "employment-population ratio"],
    "Unemployment": ["unemployment", "unemployed", "jobless", "joblessness", "u-3", "u3 rate"],
    "Participation": ["participation", "marginally attached", "discouraged workers", "retirement", "labor force", "want a job", "labor force participation", "participation rate", "lfpr", "labor market participation"],
    "Wages": ["pay", "eci", "labor cost", "labor costs", "wage", "wages", "wage growth", "wage inflation", "compensation", "earnings", "salary", "salaries"],
    "Vacancies": ["vacancy", "vacancies", "job opening", "job openings", "jolts", "unfilled position", "open position", "vacant job", "vacancy rate", "job vacancy", "posted opening", "help wanted", "available job", "job posting", "position opening", "vacancy yield", "beveridge curve", "job advertisement", "job availability", "hiring difficulty"],
    "Quits": ["quit", "quits", "resignation", "resignations", "voluntary separation", "turnover", "voluntary turnover", "job changing", "job switching", "job hopping", "job-to-job"],
    # Fixed: "job losses" and "discharge" were previously fused into "job lossesdischarge" by a missing comma.
    "Layoffs": ["layoff", "layoffs", "involuntary separation", "dismissal", "dismissals", "discharged", "job losses", "discharge", "downsizing", "redundancy", "workforce reduction", "job cuts", "reduction in force", "rif", "termination", "fired", "job loss", "permanent layoff", "temporary layoff", "furlough", "mass layoff", "involuntary job loss", "job destruction", "establishment closure", "job shedding"],
    "Hiring": ["hiring", "job finding", "payroll", "nonfarm payroll", "job gain", "job gains", "job growth", "employment gain", "employment growth", "job creation"]
}

# Configuration - point SPEECHES_DIR at the folder holding the *_speeches.csv files.
# Analysis outputs are written to an "Analysis" subfolder of that location.
SPEECHES_DIR = '/content/drive/MyDrive/Speeches'
OUTPUT_DIR = os.path.join(SPEECHES_DIR, 'Analysis')

def create_output_directory():
    """Ensure OUTPUT_DIR exists on disk, announcing it only when newly created."""
    if os.path.exists(OUTPUT_DIR):
        return

    os.makedirs(OUTPUT_DIR)
    print(f"Created output directory: {OUTPUT_DIR}")

def get_labor_market_keywords():
    """Build the keyword list used to decide whether a sentence is about the labor market.

    Returns:
        list[str]: the general labor-market terms followed by every keyword
        listed in INDICATORS (in dictionary order). Duplicates are kept.
    """
    general_terms = [
        "labor market", "job market", "employment", "unemployment", "labor",
        "labor markets", "job markets", "hiring", "workers", "unemployment",
        "employment situation", "labor force", "job", "jobs",
    ]

    # Flatten the per-indicator keyword lists onto the end of the general terms
    indicator_terms = [term for terms in INDICATORS.values() for term in terms]

    return general_terms + indicator_terms

def split_into_sentences(text):
    """Split text into sentences using regex heuristics.

    A sentence boundary is whitespace that follows '.', '!' or '?'. Periods
    that follow a single capital letter (e.g. the final "S." of "U.S.") or a
    capitalized two-letter token (e.g. "Dr.", "Mr.") are shielded from the
    splitter by temporarily replacing them with the placeholder "@@".

    Args:
        text (str): the text to split.

    Returns:
        list[str]: the sentences in order. Empty or whitespace-only fragments
        (produced by trailing whitespace or empty input) are dropped so they do
        not inflate sentence counts computed by callers.
    """
    # Shield abbreviation periods so they are not treated as sentence ends
    text = re.sub(r'(\b[A-Z])\. ', r'\1@@ ', text)  # Handle abbreviations like U.S.
    text = re.sub(r'(\b[A-Z][a-z])\. ', r'\1@@ ', text)  # Handle Dr., Mr., etc.

    # Split by sentence boundaries
    fragments = re.split(r'(?<=[.!?])\s+', text)

    # Restore the shielded periods and discard empty fragments
    return [fragment.replace('@@', '.') for fragment in fragments if fragment.strip()]

def get_indicator_counts(text):
    """Tally keyword occurrences per indicator in the given text.

    Matching is case-insensitive and anchored on word boundaries. Occurrences
    of "maximum employment" are subtracted from the plain "employment" keyword
    count (never going below zero). Overlapping keywords are each counted.

    Returns:
        dict[str, int]: one entry per key of INDICATORS.
    """
    lowered = text.lower()

    def occurrences(phrase):
        # Whole-word occurrences of an already-lowercased phrase
        return len(re.findall(r'\b' + re.escape(phrase) + r'\b', lowered))

    totals = {}
    for name, phrases in INDICATORS.items():
        tally = 0
        for phrase in phrases:
            phrase_lc = phrase.lower()
            hits = occurrences(phrase_lc)

            # "maximum employment" refers to the mandate, not the indicator
            if phrase_lc == "employment":
                mandate_hits = len(re.findall(r'\bmaximum employment\b', lowered))
                hits = max(0, hits - mandate_hits)

            tally += hits
        totals[name] = tally

    return totals

def contains_labor_market_keyword(sentence, labor_keywords):
    """Return True when any keyword occurs in the sentence as a whole word or phrase.

    The comparison is case-insensitive; an empty keyword list yields False.
    """
    haystack = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack) is not None
        for term in labor_keywords
    )

def analyze_fed_speeches():
    """Analyze the Fed speeches for labor market content.

    Reads every ``*_speeches.csv`` file in SPEECHES_DIR. For each usable speech
    row it computes the percentage of sentences containing a labor market
    keyword and, within those sentences, each indicator's share of all
    indicator keyword mentions. The combined results are written to
    ``fed_labor_market_analysis.csv`` in OUTPUT_DIR.

    Rows are skipped when the text is missing, not a string, the placeholder
    "No text available", or shorter than 300 characters. An error while
    processing one file is reported and the remaining files are still processed.

    Returns:
        pandas.DataFrame | None: one row per analyzed speech, sorted by official
        and date, or None when no speech files exist or nothing could be analyzed.
    """
    # Get the list of all CSV files in the speeches folder
    speech_files = glob.glob(os.path.join(SPEECHES_DIR, "*_speeches.csv"))

    if not speech_files:
        print(f"No speech files found in '{SPEECHES_DIR}' folder.")
        print("Make sure the Fed speech scraper has been run and CSV files exist.")
        return None

    # Get the list of labor market keywords
    labor_keywords = get_labor_market_keywords()

    # List to store all results
    all_results = []

    # Process each official's speech file
    for speech_file in speech_files:
        # Extract official name from filename
        filename = os.path.basename(speech_file)
        official_name = filename.replace('_speeches.csv', '').replace('_', ' ')
        print(f"Processing speeches for {official_name}")

        try:
            # Load the speech data
            official_df = pd.read_csv(speech_file)

            # Skip if no speeches in file
            if official_df.empty:
                print(f"  No speeches found for {official_name}")
                continue

            # Process each speech entry
            for _, row in official_df.iterrows():
                date = row['date']
                title = row['title']
                text = row['text']
                role = row['role']
                years_served = row['years_served']

                # Skip if text is missing or unusable. Non-string cells (NaN,
                # numbers) are skipped here so that len() below cannot raise
                # and abort the rest of this official's file.
                if not isinstance(text, str) or text == "" or text == "No text available":
                    continue

                # Skip very short texts (likely not real speeches)
                if len(text) < 300:
                    continue

                # Split the text into sentences
                sentences = split_into_sentences(text)

                # Identify labor market related sentences
                labor_sentences = [
                    sentence for sentence in sentences
                    if contains_labor_market_keyword(sentence, labor_keywords)
                ]

                # Calculate percentage of speech focused on labor markets
                total_sentences = len(sentences)
                labor_sentences_count = len(labor_sentences)

                if total_sentences > 0:
                    labor_market_percentage = (labor_sentences_count / total_sentences) * 100
                else:
                    labor_market_percentage = 0

                # Combine all labor market sentences
                labor_market_text = " ".join(labor_sentences)

                # Get indicator counts for the labor market portion
                indicator_counts = get_indicator_counts(labor_market_text)

                # Calculate percentages for each indicator (reweighted to sum to 100%)
                total_indicator_mentions = sum(indicator_counts.values())

                if total_indicator_mentions > 0:
                    indicator_percentages = {
                        indicator: (count / total_indicator_mentions) * 100
                        for indicator, count in indicator_counts.items()
                    }
                else:
                    indicator_percentages = {indicator: 0 for indicator in INDICATORS}

                # Create result record
                result = {
                    'Official_Name': official_name,
                    'Date': date,
                    'Title': title,
                    'Role': role,
                    'Years_Served': years_served,
                    'Labor_Market_Percentage': labor_market_percentage,
                    'Labor_Sentences': labor_sentences_count,
                    'Total_Sentences': total_sentences,
                    'Speech_Length_Chars': len(text)
                }

                # Add indicator percentages
                for indicator, percentage in indicator_percentages.items():
                    result[f'{indicator}_Percentage'] = percentage

                all_results.append(result)

        except Exception as e:
            # Best effort: report the failing file and move on to the next one
            print(f"Error processing {speech_file}: {e}")

    if not all_results:
        print("No results generated.")
        return None

    # Convert to DataFrame
    results_df = pd.DataFrame(all_results)

    # Convert date to datetime. errors='coerce' turns unparsable values into
    # NaT instead of raising, so the number of NaT values is reported
    # explicitly rather than relying on an exception handler.
    try:
        results_df['Date'] = pd.to_datetime(results_df['Date'], errors='coerce')
    except Exception as e:
        print(f"Warning: dates could not be converted to datetime: {e}")
    else:
        unparsed_dates = int(results_df['Date'].isna().sum())
        if unparsed_dates:
            print(f"Warning: {unparsed_dates} date(s) are missing or could not be parsed")

    # Sort by official name and date
    results_df = results_df.sort_values(['Official_Name', 'Date'])

    # Add verification column that percentages sum to 100%
    indicator_columns = [f'{indicator}_Percentage' for indicator in INDICATORS]
    results_df['Sum_Indicator_Percentages'] = results_df[indicator_columns].sum(axis=1)

    # Create output directory
    create_output_directory()

    # Save to CSV
    output_file = os.path.join(OUTPUT_DIR, 'fed_labor_market_analysis.csv')
    results_df.to_csv(output_file, index=False)
    print(f"Analysis completed. {len(results_df)} speech entries analyzed and saved to {output_file}")

    return results_df

def generate_fed_summaries(results_df):
    """Aggregate per-speech results into one summary row per Fed official.

    Each summary holds the official's role, years served, date range of the
    speeches, speech counts, the share of speeches with any labor market
    content, and average indicator percentages (averaged only over speeches
    with labor market content). The table is sorted by average labor market
    percentage (descending) and saved as ``fed_labor_market_summaries.csv``
    in OUTPUT_DIR.

    Returns:
        pandas.DataFrame of summaries, or None when there is nothing to summarize.
    """
    if results_df is None or results_df.empty:
        print("No results to summarize.")
        return

    def first_or_unknown(series):
        # Role / years served should be constant per official; take the first value
        return "Unknown" if series.empty else series.iloc[0]

    summaries = []

    for official_name, speeches in results_df.groupby('Official_Name'):
        total_speeches = len(speeches)
        has_labor_content = speeches['Labor_Market_Percentage'] > 0
        labor_focused_speeches = sum(has_labor_content)

        # Date range spanned by the speeches with a known date
        known_dates = speeches['Date'].dropna()
        if known_dates.empty:
            date_range = "Unknown"
        else:
            date_range = f"{known_dates.min().strftime('%Y-%m-%d')} to {known_dates.max().strftime('%Y-%m-%d')}"

        summary = {
            'Official_Name': official_name,
            'Role': first_or_unknown(speeches['Role']),
            'Years_Served': first_or_unknown(speeches['Years_Served']),
            'Date_Range': date_range,
            'Total_Speeches': total_speeches,
            'Labor_Focused_Speeches': labor_focused_speeches,
            'Labor_Focus_Rate': labor_focused_speeches / total_speeches * 100 if total_speeches > 0 else 0,
            'Avg_Labor_Market_Percentage': speeches['Labor_Market_Percentage'].mean(),
            'Avg_Speech_Length_Chars': speeches['Speech_Length_Chars'].mean()
        }

        # Indicator averages consider only speeches that discussed the labor market
        labor_speeches = speeches[has_labor_content]
        for indicator in INDICATORS:
            if labor_speeches.empty:
                average = 0
            else:
                average = labor_speeches[f'{indicator}_Percentage'].mean()
            summary[f'Avg_{indicator}_Percentage'] = average

        summaries.append(summary)

    # Officials with the strongest labor market focus come first
    summaries_df = pd.DataFrame(summaries).sort_values('Avg_Labor_Market_Percentage', ascending=False)

    output_file = os.path.join(OUTPUT_DIR, 'fed_labor_market_summaries.csv')
    summaries_df.to_csv(output_file, index=False)
    print(f"Generated summaries for {len(summaries_df)} officials saved to {output_file}")

    return summaries_df

def filter_labor_market_speeches(results_df, min_percentage=0):
    """Keep only speeches whose labor market percentage exceeds ``min_percentage``.

    Prints before/after counts and saves the filtered rows (with a fresh
    0-based index) as ``fed_labor_market_speeches_only.csv`` in OUTPUT_DIR.

    Returns:
        pandas.DataFrame with the filtered rows, or None when there is no input.
    """
    if results_df is None or results_df.empty:
        print("No results to filter.")
        return None

    print(f"\nFiltering speeches with labor market percentage > {min_percentage}%...")

    keep_mask = results_df['Labor_Market_Percentage'] > min_percentage
    filtered_df = results_df[keep_mask].copy().reset_index(drop=True)

    original_count = len(results_df)
    filtered_count = len(filtered_df)
    print(f"Before filtering: {original_count} speeches")
    print(f"After filtering: {filtered_count} speeches ({filtered_count/original_count*100:.1f}%)")

    # Persist the filtered subset next to the other analysis outputs
    output_file = os.path.join(OUTPUT_DIR, 'fed_labor_market_speeches_only.csv')
    filtered_df.to_csv(output_file, index=False)
    print(f"Filtered results saved to {output_file}")

    return filtered_df


def main():
    """Run the labor market analysis end to end and print a console report.

    Analyzes the speech CSVs, writes the per-official summaries and the
    labor-only subset, then prints overall statistics.
    """
    print("Analyzing Fed officials' speeches for labor market content...")
    print(f"Looking for speech files in: {SPEECHES_DIR}")

    results_df = analyze_fed_speeches()

    # Nothing to report when the analysis produced no data
    if results_df is None:
        print("Analysis failed. Please check that speech CSV files exist in the specified directory.")
        return

    summaries_df = generate_fed_summaries(results_df)

    # Called for its side effect: writes the labor-only CSV
    filter_labor_market_speeches(results_df, min_percentage=0)

    banner = "="*60
    print("\n" + banner)
    print("OVERALL STATISTICS")
    print(banner)

    entry_count = len(results_df)
    print(f"Total speech entries analyzed: {entry_count:,}")

    labor_speeches = results_df[results_df['Labor_Market_Percentage'] > 0]
    labor_count = len(labor_speeches)
    print(f"Speeches with labor market content: {labor_count:,} ({labor_count/entry_count*100:.1f}%)")

    print(f"Average percentage of speech about labor markets: {results_df['Labor_Market_Percentage'].mean():.2f}%")

    # Summaries are already sorted by labor market focus, so the head is the top 5
    if summaries_df is not None:
        print(f"\nTop 5 Officials by Labor Market Focus:")
        for _, row in summaries_df.head(5).iterrows():
            print(f"  {row['Official_Name']}: {row['Avg_Labor_Market_Percentage']:.1f}% (Role: {row['Role']})")

    # Indicator averages only make sense over speeches with labor content
    if not labor_speeches.empty:
        print(f"\nAverage Indicator Percentages (for speeches with labor market content):")
        for indicator in INDICATORS:
            avg_percentage = labor_speeches[f'{indicator}_Percentage'].mean()
            print(f"  {indicator}: {avg_percentage:.1f}%")

    print(f"\nAll analysis files saved to: {OUTPUT_DIR}")


# Run the pipeline when the cell/script is executed directly
if __name__ == "__main__":
    main()

Analyzing Fed officials' speeches for labor market content...
Looking for speech files in: /content/drive/MyDrive/Speeches
No speech files found in '/content/drive/MyDrive/Speeches' folder.
Make sure the Fed speech scraper has been run and CSV files exist.
Analysis failed. Please check that speech CSV files exist in the specified directory.


In [None]:
# @title Download speeches .pdf and create spreadsheet of links
#!/usr/bin/env python3
"""
Federal Reserve Officials Speech Links Scraper
Collects speech metadata and links from Fed Governors and Regional Presidents
Creates a CSV with links instead of full text content
"""

import requests
import csv
import json
import time
import re
import os
from datetime import datetime
from urllib.parse import urlparse

# Configuration
# SECURITY: credentials should not live in the notebook. Set the
# FRASER_API_KEY environment variable to supply your own key; the literal
# below is kept only as a fallback so existing runs keep working.
# TODO: rotate this key and remove the hardcoded fallback.
API_KEY = os.environ.get("FRASER_API_KEY") or "6ebaa277c3f1d751e899c615816470a9"
BASE_URL = "https://fraser.stlouisfed.org/api"
OUTPUT_DIR = '/content/drive/MyDrive/Speeches'
OUTPUT_FILE = 'speechlinks.csv'
PDF_DIR = 'pdfs'  # Subdirectory for PDF files

# Download settings
DOWNLOAD_PDFS = True  # Set to False to skip PDF downloads
MAX_RETRIES = 3
DOWNLOAD_DELAY = 0.5  # Seconds between downloads

# Option to only scrape missing officials (set to True to only scrape missing CSVs)
ONLY_SCRAPE_MISSING = False

# Federal Reserve Officials - with FRASER title IDs where available
# Maps each official's display name to a metadata dict with three keys:
#   "role"     - position label (free text)
#   "years"    - years served as "START-END" or "START-Present" (free text)
#   "title_id" - FRASER title collection ID; used to build the
#                "title/{title_id}/items" API request for that official
# NOTE(review): the title IDs cannot be verified from this file - confirm
# against FRASER before relying on them.
OFFICIALS = {
    # Board of Governors
    "Alan Greenspan": {"role": "Chair", "years": "1987-2006", "title_id": 452},
    "Susan Bies": {"role": "Governor", "years": "2001-2007", "title_id": 955},
    "Mark Olson": {"role": "Governor", "years": "2001-2006", "title_id": 941},
    "Edward Gramlich": {"role": "Governor", "years": "1997-2005", "title_id": 914},
    "Roger Ferguson": {"role": "Governor/Vice Chair", "years": "1997-2006", "title_id": 950},
    "Laurence Meyer": {"role": "Governor", "years": "1996-2002", "title_id": 936},
    "Donald Kohn": {"role": "Governor/Vice Chair", "years": "2002-2010", "title_id": 464},
    "Ben Bernanke": {"role": "Chair", "years": "2006-2014", "title_id": 453},
    "Janet Yellen": {"role": "Governor/Vice Chair/Chair", "years": "2004-2018", "title_id": 930},
    "Jerome Powell": {"role": "Governor/Chair", "years": "2012-Present", "title_id": 1164},
    "Randall Kroszner": {"role": "Governor", "years": "2006-2009", "title_id": 948},
    "Kevin Warsh": {"role": "Governor", "years": "2006-2011", "title_id": 935},
    "Frederic Mishkin": {"role": "Governor", "years": "2006-2008", "title_id": 919},
    "Elizabeth Duke": {"role": "Governor", "years": "2008-2013", "title_id": 916},
    "Daniel Tarullo": {"role": "Governor", "years": "2009-2017", "title_id": 910},
    "Sarah Bloom Raskin": {"role": "Governor", "years": "2010-2014", "title_id": 951},
    "Lael Brainard": {"role": "Governor/Vice Chair", "years": "2014-2023", "title_id": 3777},
    "Stanley Fischer": {"role": "Vice Chair", "years": "2014-2017", "title_id": 3778},
    "Jeremy Stein": {"role": "Governor", "years": "2012-2014", "title_id": 1163},
    "Michelle Bowman": {"role": "Governor", "years": "2018-Present", "title_id": 6098},
    "Richard Clarida": {"role": "Vice Chair", "years": "2018-2022", "title_id": 5997},
    "Randal Quarles": {"role": "Governor/Vice Chair for Supervision", "years": "2017-2021", "title_id": 5732},
    "Christopher Waller": {"role": "Governor", "years": "2020-Present", "title_id": 6421},
    "Michael Barr": {"role": "Governor/Vice Chair for Supervision", "years": "2022-Present", "title_id": 6862},
    "Lisa Cook": {"role": "Governor", "years": "2022-Present", "title_id": 6861},
    "Adriana Kugler": {"role": "Governor", "years": "2023-Present", "title_id": 9290},
    "Philip Jefferson": {"role": "Governor/Vice Chair", "years": "2022-Present", "title_id": 6860},

    # Regional Presidents
    "William McDonough": {"role": "New York President", "years": "1993-2003", "title_id": 6748},
    "Edward G. Boehne": {"role": "Philadelphia President", "years": "1981-2000", "title_id": 6108},
    "Jerry Jordan": {"role": "Cleveland President", "years": "1992-2003", "title_id": 3769},
    "Alfred Broaddus": {"role": "Richmond President", "years": "1993-2004", "title_id": 9267},
    "Michael Moskow": {"role": "Chicago President", "years": "1994-2007", "title_id": 5967},
    "William Poole": {"role": "St. Louis President", "years": "1998-2008", "title_id": 485},
    "Neel Kashkari": {"role": "Minneapolis President", "years": "2016-Present", "title_id": 9361},
    "Robert McTeer": {"role": "Dallas President", "years": "1991-2004", "title_id": 6144},
    "Cathy Minehan": {"role": "Boston President", "years": "1994-2007", "title_id": 9017},
    "Anthony Santomero": {"role": "Philadelphia President", "years": "2000-2006", "title_id": 6109},
    "Jack Guynn": {"role": "Atlanta President", "years": "1996-2007", "title_id": 5170},
    "Robert Parry": {"role": "San Francisco President", "years": "1986-2004", "title_id": 1270},
    "Eric Rosengren": {"role": "Boston President", "years": "2007-2021", "title_id": 9015},
    "Susan Collins": {"role": "Boston President", "years": "2022-Present", "title_id": 9016},
    "Timothy Geithner": {"role": "New York President", "years": "2003-2009", "title_id": 6750},
    "William Dudley": {"role": "New York President", "years": "2009-2018", "title_id": 6749},
    # NOTE(review): the years overlap William Dudley's New York tenure above -
    # the 2011 start presumably reflects an earlier role; confirm role/years.
    "John Williams": {"role": "New York President", "years": "2011-Present", "title_id": 9040},
    "Charles Plosser": {"role": "Philadelphia President", "years": "2006-2015", "title_id": 6101},
    "Patrick Harker": {"role": "Philadelphia President", "years": "2015-Present", "title_id": 6102},
    "Sandra Pianalto": {"role": "Cleveland President", "years": "2003-2014", "title_id": 3770},
    "Loretta Mester": {"role": "Cleveland President", "years": "2014-2024", "title_id": 9033},
    "Jeffrey Lacker": {"role": "Richmond President", "years": "2004-2017", "title_id": 6827},
    "Thomas Barkin": {"role": "Richmond President", "years": "2018-Present", "title_id": 9266},
    "Dennis Lockhart": {"role": "Atlanta President", "years": "2007-2017", "title_id": 5579},
    "Raphael Bostic": {"role": "Atlanta President", "years": "2017-Present", "title_id": 8996},
    "Charles Evans": {"role": "Chicago President", "years": "2007-2023", "title_id": 8969},
    "Austan Goolsbee": {"role": "Chicago President", "years": "2023-Present", "title_id": 8963},
    "James Bullard": {"role": "St. Louis President", "years": "2008-2023", "title_id": 7161},
    "Alberto Musalem": {"role": "St. Louis President", "years": "2024-Present", "title_id": 7080},
    "Gary Stern": {"role": "Minneapolis President", "years": "1985-2009", "title_id": 1002},
    "Narayana Kocherlakota": {"role": "Minneapolis President", "years": "2009-2015", "title_id": 9360},
    "Thomas Hoenig": {"role": "Kansas City President", "years": "1991-2011", "title_id": 6995},
    "Esther George": {"role": "Kansas City President", "years": "2011-2023", "title_id": 9278},
    "Richard Fisher": {"role": "Dallas President", "years": "2005-2015", "title_id": 6147},
    "Robert Kaplan": {"role": "Dallas President", "years": "2015-2021", "title_id": 6146},
    "Mary Daly": {"role": "San Francisco President", "years": "2018-Present", "title_id": 9034},
}

def create_output_directory():
    """Make sure OUTPUT_DIR and its PDF subfolder exist.

    Prints a message for each folder that had to be created.

    Returns:
        str: path of the PDF subfolder (OUTPUT_DIR joined with PDF_DIR).
    """
    def ensure_folder(folder, announcement):
        # Create the folder only when absent, then announce the creation
        if os.path.exists(folder):
            return
        os.makedirs(folder)
        print(announcement)

    ensure_folder(OUTPUT_DIR, f"Created output directory: {OUTPUT_DIR}")

    pdf_path = os.path.join(OUTPUT_DIR, PDF_DIR)
    ensure_folder(pdf_path, f"Created PDF directory: {pdf_path}")

    return pdf_path

def parse_date_year(date_str):
    """Extract a four-digit year (19xx or 20xx) from a date value.

    Returns:
        int | None: the first standalone year found, or None when the value is
        falsy, equals 'Unknown', or contains no such year.
    """
    if date_str and date_str != 'Unknown':
        # The year must stand alone (word boundaries on both sides)
        found = re.search(r'\b(19|20)\d{2}\b', str(date_str))
        if found is not None:
            return int(found.group())
    return None

def generate_speech_id(official_name, date_str, speech_counter=None):
    """Generate a speech ID of the form ``<FirstInitial><LastName>_<date>_speech``.

    The date part is normally YYYYMMDD and is derived, in order of preference, from:
      1. an ISO ``YYYY-MM-DD`` date in ``date_str``;
      2. a numeric ``M/D/YYYY`` (or ``M-D-YYYY``) date;
      3. a month name (full or three-letter), with the day that directly
         follows it, or day ``01`` when no day follows;
      4. the year alone, with month ``01`` and the last two digits of the
         zero-padded ``speech_counter`` standing in for the day;
      5. no recognizable year: ``0000`` followed by the zero-padded counter.

    Args:
        official_name (str): full name; a single-word name is used as-is.
        date_str: date value; converted with ``str()`` before matching.
        speech_counter (int | None): disambiguator for the fallbacks; a
            missing or zero counter is treated as 1.

    Returns:
        str: the generated speech ID.
    """
    # Extract first initial and last name
    name_parts = official_name.split()
    if len(name_parts) >= 2:
        first_initial = name_parts[0][0].upper()
        last_name = name_parts[-1]
        name_part = f"{first_initial}{last_name}"
    else:
        # Fallback if unusual name format
        name_part = official_name.replace(' ', '')

    date_text = str(date_str)

    # Extract date in YYYYMMDD format
    year = parse_date_year(date_str)
    if year:
        # Try to extract full date if possible
        date_match = re.search(r'(\d{4})-(\d{2})-(\d{2})', date_text)
        if date_match:
            date_part = f"{date_match.group(1)}{date_match.group(2)}{date_match.group(3)}"
        else:
            # If we only have year, try to extract month/day from the string
            month_day_match = re.search(r'(\d{1,2})[/-](\d{1,2})[/-](\d{4})', date_text)
            if month_day_match:
                month = month_day_match.group(1).zfill(2)
                day = month_day_match.group(2).zfill(2)
                date_part = f"{year}{month}{day}"
            else:
                # Try to find month names (full names listed first so they win
                # over their own three-letter prefixes)
                month_names = {
                    'january': '01', 'february': '02', 'march': '03', 'april': '04',
                    'may': '05', 'june': '06', 'july': '07', 'august': '08',
                    'september': '09', 'october': '10', 'november': '11', 'december': '12',
                    'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
                    'jun': '06', 'jul': '07', 'aug': '08', 'sep': '09',
                    'oct': '10', 'nov': '11', 'dec': '12'
                }

                month_found = None
                day_found = None
                date_lower = date_text.lower()

                for month_name, month_num in month_names.items():
                    if month_name in date_lower:
                        month_found = month_num
                        # Try to find day number right after the month. The
                        # (?!\d) lookahead rejects the leading digits of a
                        # longer number, so "March 2001" no longer yields day 20.
                        day_match = re.search(rf'{re.escape(month_name)}\s*(\d{{1,2}})(?!\d)', date_lower)
                        if day_match:
                            day_found = day_match.group(1).zfill(2)
                        break

                if month_found:
                    day_part = day_found if day_found else '01'
                    date_part = f"{year}{month_found}{day_part}"
                else:
                    # Use counter to make unique if only year available
                    counter_part = f"{speech_counter:03d}" if speech_counter else "001"
                    date_part = f"{year}01{counter_part[-2:]}"  # Use last 2 digits as day
    else:
        counter_part = f"{speech_counter:03d}" if speech_counter else "001"
        date_part = f"0000{counter_part[-4:]}"  # Unknown date fallback with counter

    return f"{name_part}_{date_part}_speech"

def is_speech_after_2000(date_str):
    """Tell whether a speech dates from the year 2000 or later.

    Speeches whose year cannot be determined are kept (treated as True).
    """
    year = parse_date_year(date_str)
    return True if year is None else year >= 2000

def make_request(endpoint):
    """GET ``BASE_URL/<endpoint>`` with the API key header and decode the JSON reply.

    Returns:
        The decoded JSON body on HTTP 200; None on any other status code or
        when the request raises a ``requests`` exception (both are printed).
    """
    url = f"{BASE_URL}/{endpoint}"

    try:
        response = requests.get(url, headers={"X-API-Key": API_KEY}, timeout=30)
        if response.status_code != 200:
            print(f"    Error {response.status_code} for {endpoint}")
            return None
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"    Request failed: {e}")
        return None

def is_fed_speech(item, official_name):
    """Check if item is actually a Fed official's speech.

    The decision is made in three steps:
      1. accept when the item's text URL contains a Fed-specific directory;
      2. reject when the title contains a known non-speech term;
      3. accept when the title mentions a part of the official's name longer
         than three characters.

    Args:
        item (dict): a record with optional ``location.textUrl`` (string or
            list of strings) and ``titleInfo`` (dict or list of dicts).
        official_name (str): the official's full name.

    Returns:
        bool: True when the record looks like a speech by the official.
    """
    # Check textUrl patterns for Fed speeches
    if 'location' in item and 'textUrl' in item['location']:
        text_urls = item['location']['textUrl']
        if isinstance(text_urls, list):
            # An empty list simply means there is no URL to inspect
            text_url = text_urls[0] if text_urls else None
        else:
            text_url = text_urls

        # Look for Fed-specific directories
        fed_patterns = [
            '/presidents/', '/governors/', '/frbrich/', '/frbatl/', '/frbchi/',
            '/frbcle/', '/frbdal/', '/frbkc/', '/frbmin/', '/frbny/',
            '/frbphi/', '/frbsf/', '/frb_stl/', '/bog/'
        ]

        # Only string URLs can be pattern-matched; anything else is ignored
        if isinstance(text_url, str) and any(pattern in text_url for pattern in fed_patterns):
            return True

    # Check title for Fed-related content
    title = ""
    if 'titleInfo' in item:
        title_info = item['titleInfo']
        if isinstance(title_info, list) and title_info:
            title_info = title_info[0]
        if isinstance(title_info, dict):
            # "or ''" covers a title key that is present but null
            title = str(title_info.get('title') or '').lower()

    # Exclude clearly non-speech items
    exclude_terms = [
        'beige book', 'monthly labor review', 'consumer price', 'cfc_', 'mlr_',
        'index of volumes', 'technological trends', 'reports from the consuls',
        'staff report', 'expenditure policy', 'women in the economy'
    ]

    if any(term in title for term in exclude_terms):
        return False

    # Include if it mentions the official's name (short parts such as
    # initials are skipped because they match too easily)
    name_parts = official_name.lower().split()
    return any(part in title for part in name_parts if len(part) > 3)

def get_speeches_from_title(title_id):
    """Fetch the item records of a FRASER title collection.

    Requests up to 1000 items in a single call.

    Returns:
        list: the ``records`` of the response, or an empty list when the
        request failed or the response carries no ``records`` key.
    """
    print(f"  Getting speeches from title ID {title_id}...")

    # High limit to get all items in one request
    data = make_request(f"title/{title_id}/items?limit=1000")

    if not (data and 'records' in data):
        print(f"    No items found in title {title_id}")
        return []

    items = data['records']
    print(f"    Found {len(items)} items in title collection")
    return items

def search_for_official_speeches(official_name, official_info):
    """Search for speeches by a specific official.

    The official's FRASER title collection (``official_info['title_id']``) is
    tried first. When it is missing or yields nothing, the search endpoint is
    queried with several name variants, the hits are filtered with
    ``is_fed_speech`` and de-duplicated by record identifier.

    Args:
        official_name (str): the official's full name.
        official_info (dict): metadata; only ``title_id`` is read here.

    Returns:
        list: speech records (title collection items, or unique search hits).
    """
    print(f" Searching for {official_name} speeches...")

    # If we have a title ID, use that first (more reliable)
    title_id = official_info.get('title_id')
    if title_id:
        title_speeches = get_speeches_from_title(title_id)
        if title_speeches:
            print(f"  Found {len(title_speeches)} speeches from title collection")
            return title_speeches

    # Fall back to search if no title ID or no speeches found
    print(f"  Falling back to search approach...")

    # Try different search variations: full name, last name, first+last name
    name_tokens = official_name.split()
    candidate_queries = [official_name.replace(' ', '+')]
    if name_tokens:
        candidate_queries.append(name_tokens[-1])  # Last name only
    if len(name_tokens) >= 2:
        candidate_queries.append(f"{name_tokens[0]}+{name_tokens[-1]}")

    # For two-word names the full-name and first+last variants are identical;
    # drop repeats (and empty queries) so the same request is not sent twice.
    search_queries = [query for query in dict.fromkeys(candidate_queries) if query]

    all_speeches = []

    for query in search_queries:
        print(f"  Trying query: {query}")
        data = make_request(f"search/?q={query}&limit=100")

        if data and 'records' in data:
            records = data['records']
            print(f"    Found {len(records)} records")

            # Filter for actual Fed speeches
            speeches = [record for record in records if is_fed_speech(record, official_name)]

            print(f"    Filtered to {len(speeches)} Fed speeches")
            all_speeches.extend(speeches)

        # Rate limiting applies to every request, including failed ones
        time.sleep(0.5)

    # Remove duplicates based on record ID
    unique_speeches = []
    seen_ids = set()

    for speech in all_speeches:
        record_id = None
        if 'recordInfo' in speech and 'recordIdentifier' in speech['recordInfo']:
            record_id = speech['recordInfo']['recordIdentifier']
            if isinstance(record_id, list):
                record_id = record_id[0]

        if record_id and record_id not in seen_ids:
            seen_ids.add(record_id)
            unique_speeches.append(speech)
        elif not record_id:
            # Records without an identifier cannot be compared; keep them all
            unique_speeches.append(speech)

    print(f"  Final count: {len(unique_speeches)} unique speeches")
    return unique_speeches

def download_pdf(pdf_url, speech_id, pdf_directory):
    """Download a speech PDF into ``pdf_directory`` with retry logic.

    The file is named ``<speech_id>.pdf``. The body is streamed into a
    temporary ``<speech_id>.pdf.part`` file and only moved to its final name
    once the whole response has been written. A transfer that fails half-way
    therefore never leaves a truncated PDF behind that the "already exists"
    check below would later report as a finished download.

    Args:
        pdf_url: URL of the PDF. A falsy value returns a failure immediately.
        speech_id: Identifier used as the file name stem.
        pdf_directory: Directory the PDF is written to.

    Returns:
        A ``(success, message)`` tuple. ``success`` is True when the file was
        downloaded or was already present; ``message`` describes the outcome.
    """
    if not pdf_url:
        return False, "No PDF URL provided"

    # Create filename using speech ID
    filename = f"{speech_id}.pdf"
    filepath = os.path.join(pdf_directory, filename)
    partial_path = f"{filepath}.part"

    # Skip if file already exists
    if os.path.exists(filepath):
        return True, f"File already exists: {filename}"

    # Attempt download with retries
    for attempt in range(MAX_RETRIES):
        failure = None
        try:
            print(f"    Downloading PDF (attempt {attempt + 1}/{MAX_RETRIES}): {filename}")
            # The context manager releases the streamed connection on every
            # path (success, non-200 status, or exception).
            with requests.get(pdf_url, timeout=30, stream=True) as response:
                if response.status_code == 200:
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)

                    # Publish the file only after the full body was written.
                    os.replace(partial_path, filepath)
                    file_size = os.path.getsize(filepath)
                    return True, f"Downloaded successfully: {filename} ({file_size:,} bytes)"

                failure = f"HTTP {response.status_code} after {MAX_RETRIES} attempts"

        except requests.exceptions.RequestException as e:
            failure = f"Network error after {MAX_RETRIES} attempts: {str(e)}"

        except Exception as e:
            failure = f"Unexpected error after {MAX_RETRIES} attempts: {str(e)}"

        # Drop whatever was written during the failed attempt.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written (or it is already gone)

        if attempt == MAX_RETRIES - 1:  # Last attempt
            return False, failure
        time.sleep(1)  # Wait before retry

    return False, "Download failed"
def get_file_size_estimate(url):
    """Return the size in bytes reported by a HEAD request, or None.

    This is a best-effort estimate: no body is downloaded, and any ordinary
    failure (network error, non-200 status, missing or malformed
    ``content-length`` header) yields ``None`` instead of raising.

    Args:
        url: URL to probe.

    Returns:
        The ``content-length`` header as an int, or ``None`` if unavailable.
    """
    # NOTE(review): requests.head() does not follow redirects unless asked to,
    # so a redirected URL would yield None here - confirm that is intended.
    try:
        response = requests.head(url, timeout=10)
        if response.status_code == 200:
            content_length = response.headers.get('content-length')
            if content_length:
                return int(content_length)
    except Exception:
        # Deliberately swallow ordinary errors, but (unlike a bare ``except:``)
        # let KeyboardInterrupt / SystemExit through so a long scraping run
        # can still be stopped while a request is in flight.
        pass
    return None

def extract_speech_metadata(item, official_name, official_info, speech_counter=None):
    """Build the metadata record for one speech item - NO TEXT CONTENT.

    Reads the title (``titleInfo``), date (``originInfo``) and the text / PDF /
    HTML URLs (``location``) from ``item``. When a PDF URL is present its size
    is estimated via ``get_file_size_estimate``.

    Args:
        item: Dict describing one record; every key read here is optional.
        official_name: Speaker name, stored as ``speaker`` and passed to
            ``generate_speech_id``.
        official_info: Dict providing the ``'role'`` and ``'years'`` entries.
        speech_counter: Optional counter forwarded to ``generate_speech_id``
            to keep IDs unique.

    Returns:
        Dict with the keys ``id``, ``date``, ``speaker``, ``role``,
        ``years_served``, ``title``, ``text_url``, ``pdf_url``, ``html_url``,
        ``estimated_pdf_size_bytes``, ``pdf_downloaded`` and
        ``pdf_download_status``. The last two are placeholders that the caller
        fills in after a download attempt.
    """

    def _first_or_value(value):
        # URL fields may be a single value or a list; take the first entry of
        # a list and fall back to "" for an empty list instead of raising.
        if isinstance(value, list):
            return value[0] if value else ""
        return value

    def _date_text(entry):
        # A date entry is either a dict carrying a 'date' key or a plain value.
        if isinstance(entry, dict):
            return entry.get('date', str(entry))
        return str(entry)

    title = "Untitled"
    date = "Unknown"
    text_url = ""
    pdf_url = ""
    html_url = ""
    estimated_pdf_size = None

    # Extract title
    if 'titleInfo' in item:
        title_info = item['titleInfo']
        if isinstance(title_info, list) and title_info:
            title = title_info[0].get('title', 'Untitled')
        elif isinstance(title_info, dict):
            title = title_info.get('title', 'Untitled')

    # Extract date: prefer dateIssued, fall back to sortDate
    if 'originInfo' in item:
        origin = item['originInfo']
        if 'dateIssued' in origin:
            date_issued = origin['dateIssued']
            if isinstance(date_issued, list):
                date = _date_text(date_issued[0]) if date_issued else 'Unknown'
            else:
                date = _date_text(date_issued)
        elif 'sortDate' in origin:
            date = origin['sortDate']

    # Generate speech ID with counter to ensure uniqueness
    speech_id = generate_speech_id(official_name, date, speech_counter)

    # Extract URLs from location
    if 'location' in item:
        location = item['location']

        # Get text URL (FRASER's extracted text)
        if 'textUrl' in location:
            text_url = _first_or_value(location['textUrl'])

        # Get PDF URL
        if 'pdfUrl' in location:
            pdf_url = _first_or_value(location['pdfUrl'])

            # Get estimated PDF size
            if pdf_url:
                estimated_pdf_size = get_file_size_estimate(pdf_url)

        # Get HTML URL (if different from text URL)
        if 'url' in location:
            html_url = _first_or_value(location['url'])

    return {
        'id': speech_id,
        'date': date,
        'speaker': official_name,
        'role': official_info['role'],
        'years_served': official_info['years'],
        'title': title,
        'text_url': text_url,
        'pdf_url': pdf_url,
        'html_url': html_url,
        'estimated_pdf_size_bytes': estimated_pdf_size,
        'pdf_downloaded': False,  # Will be updated after download attempt
        'pdf_download_status': ''  # Will contain download result message
    }

def main():
    """Collect speech metadata for every official in OFFICIALS and save one CSV.

    For each official: look up their speech items, build a metadata record per
    item, keep only those accepted by ``is_speech_after_2000`` and, when
    ``DOWNLOAD_PDFS`` is enabled, download the PDF. All kept records are then
    sorted newest first, written to ``OUTPUT_DIR/OUTPUT_FILE`` and summarised
    on stdout.
    """
    print("Federal Reserve Officials Speech Links Scraper")
    print("=" * 60)

    # Create output directory and PDF subdirectory
    pdf_directory = create_output_directory()

    print(f"Output directory: {OUTPUT_DIR}")
    print(f"PDF directory: {pdf_directory}")
    print(f"Output file: {OUTPUT_FILE}")
    print(f"PDF downloads enabled: {DOWNLOAD_PDFS}")
    print(f"Searching for speeches from {len(OFFICIALS)} Fed officials...")

    all_speeches = []
    total_speeches = 0
    official_counts = {}
    download_stats = {'attempted': 0, 'successful': 0, 'failed': 0, 'skipped': 0}

    for official_name, official_info in OFFICIALS.items():
        print(f"\n{'='*50}")
        print(f"Processing: {official_name} ({official_info['role']}, {official_info['years']})")

        # Search for this official's speeches
        speech_items = search_for_official_speeches(official_name, official_info)

        if not speech_items:
            print(f"  No speeches found for {official_name}")
            official_counts[official_name] = 0
            continue

        # Process each speech
        official_speeches = []
        for i, item in enumerate(speech_items, 1):
            print(f"    Processing speech {i}/{len(speech_items)}...")

            speech_metadata = extract_speech_metadata(item, official_name, official_info, i)

            # Only add speeches from 2000 or later
            date_ok = is_speech_after_2000(speech_metadata['date'])

            if date_ok:
                official_speeches.append(speech_metadata)
                all_speeches.append(speech_metadata)

                # Download PDF if enabled and URL is available
                if DOWNLOAD_PDFS and speech_metadata['pdf_url']:
                    download_stats['attempted'] += 1
                    success, message = download_pdf(
                        speech_metadata['pdf_url'],
                        speech_metadata['id'],
                        pdf_directory
                    )

                    # Update speech metadata with download results
                    speech_metadata['pdf_downloaded'] = success
                    speech_metadata['pdf_download_status'] = message

                    if success:
                        download_stats['successful'] += 1
                        print(f"      ‚úì {message}")
                    else:
                        download_stats['failed'] += 1
                        print(f"      ‚úó {message}")

                    # Rate limiting for downloads
                    time.sleep(DOWNLOAD_DELAY)

                elif not speech_metadata['pdf_url']:
                    download_stats['skipped'] += 1
                    speech_metadata['pdf_download_status'] = 'No PDF URL available'
                else:
                    speech_metadata['pdf_download_status'] = 'PDF downloads disabled'

            else:
                print(f"      Skipped: Date {speech_metadata['date']} before 2000 - '{speech_metadata['title'][:50]}...'")

            time.sleep(0.1)  # Be nice to API

        print(f"   Collected {len(official_speeches)} speeches for {official_name}")
        total_speeches += len(official_speeches)
        official_counts[official_name] = len(official_speeches)

        # Longer pause between officials
        time.sleep(1)

    # Sort all speeches by date (newest first).
    # Dates arrive both in ISO form and as "September 9, 1987"; the latter
    # must be parsed too, otherwise every such record collapses to
    # datetime.min and the sort leaves them in arrival order.
    date_formats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%B %d, %Y')

    def sort_key(x):
        raw_date = str(x.get('date', 'Unknown')).strip()
        for date_format in date_formats:
            try:
                return datetime.strptime(raw_date, date_format)
            except ValueError:
                continue
        # Unparseable dates sort last when ordering newest first.
        return datetime.min

    all_speeches.sort(key=sort_key, reverse=True)

    # Save to single CSV file
    output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
    print(f"\nSaving {len(all_speeches)} speech records to {output_path}...")

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['id', 'date', 'speaker', 'role', 'years_served', 'title',
                     'text_url', 'pdf_url', 'html_url', 'estimated_pdf_size_bytes',
                     'pdf_downloaded', 'pdf_download_status']
        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(all_speeches)

    # Calculate PDF storage estimates
    total_pdf_size = 0
    pdfs_with_size = 0
    for speech in all_speeches:
        if speech['estimated_pdf_size_bytes']:
            total_pdf_size += speech['estimated_pdf_size_bytes']
            pdfs_with_size += 1

    # Summary
    print(f"\nüìà COLLECTION SUMMARY:")
    print(f"Total speech records: {total_speeches:,}")
    print(f"Officials with speeches found: {sum(1 for count in official_counts.values() if count > 0)}/{len(OFFICIALS)}")
    print(f"Speech records with PDF URLs: {sum(1 for s in all_speeches if s['pdf_url'])}")
    print(f"Speech records with estimated PDF sizes: {pdfs_with_size}")

    if DOWNLOAD_PDFS:
        print(f"\nüì• PDF DOWNLOAD SUMMARY:")
        print(f"Download attempts: {download_stats['attempted']}")
        print(f"Successful downloads: {download_stats['successful']}")
        print(f"Failed downloads: {download_stats['failed']}")
        print(f"Skipped (no PDF URL): {download_stats['skipped']}")
        if download_stats['attempted'] > 0:
            success_rate = (download_stats['successful'] / download_stats['attempted']) * 100
            print(f"Success rate: {success_rate:.1f}%")

    if pdfs_with_size > 0:
        # Extrapolate the average known size to every record that has a PDF URL.
        avg_pdf_size = total_pdf_size / pdfs_with_size
        estimated_total_size = avg_pdf_size * sum(1 for s in all_speeches if s['pdf_url'])
        print(f"\nüìä STORAGE ESTIMATES:")
        print(f"Average PDF size: {avg_pdf_size/1024/1024:.1f} MB")
        print(f"Estimated total PDF storage needed: {estimated_total_size/1024/1024/1024:.1f} GB")

        if DOWNLOAD_PDFS and download_stats['successful'] > 0:
            # Calculate actual downloaded size
            actual_size = 0
            for speech in all_speeches:
                if speech['pdf_downloaded']:
                    pdf_path = os.path.join(pdf_directory, f"{speech['id']}.pdf")
                    if os.path.exists(pdf_path):
                        actual_size += os.path.getsize(pdf_path)
            print(f"Actual downloaded size: {actual_size/1024/1024:.1f} MB")

    print(f"\nTop 10 Officials by Speech Count:")
    sorted_officials = sorted(official_counts.items(), key=lambda x: x[1], reverse=True)
    for i, (name, count) in enumerate(sorted_officials[:10], 1):
        if count > 0:
            print(f"{i:2d}. {name}: {count} speeches")

    print(f"\nResults saved to: {output_path}")
    if DOWNLOAD_PDFS:
        print(f"PDFs saved to: {pdf_directory}")

# Run the scraper only when this code executes as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Federal Reserve Officials Speech Links Scraper
Created output directory: /content/drive/MyDrive/Speeches
Created PDF directory: /content/drive/MyDrive/Speeches/pdfs
Output directory: /content/drive/MyDrive/Speeches
PDF directory: /content/drive/MyDrive/Speeches/pdfs
Output file: speechlinks.csv
PDF downloads enabled: True
Searching for speeches from 27 Fed officials...

Processing: Alan Greenspan (Chair, 1987-2006)
 Searching for Alan Greenspan speeches...
  Getting speeches from title ID 452...
    Found 505 items in title collection
  Found 505 speeches from title collection
    Processing speech 1/505...
      Skipped: Date September 9, 1987 before 2000 - 'Jacksonville Branch Dedication...'
    Processing speech 2/505...
      Skipped: Date October 5, 1987 before 2000 - 'Testimony before the Subcommittee on Telecommunica...'
    Processing speech 3/505...
      Skipped: Date November 2, 1987 before 2000 - 'Introductory Remarks at a Dinner Commemorating the...'
    Processing speech 

KeyboardInterrupt: 