In [1]:
!cd /tmp && pip install beautifulsoup4 html5lib python-dateutil requests pandas



In [None]:
!pip install pdfplumber -q

import pandas as pd
import requests
import pdfplumber
from io import BytesIO
from datetime import datetime
import os
import time
from google.colab import drive
import re

# --- Google Drive setup ---
# The scraped CSV lives on Drive, so Drive is (re)mounted before anything else.
print("Mounting Google Drive...")
drive.mount("/content/drive", force_remount=True)

# Create the output folder on first use and make it the working directory.
output_dir = '/content/drive/MyDrive/PressConferences'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
os.chdir(output_dir)
print(f"Current working directory: {os.getcwd()}")

# Probe for write access: create a scratch file, then delete it again.
# Any exception along the way (create, write or remove) is reported and
# followed by exit().
test_file = os.path.join(output_dir, 'test_write.txt')
try:
    with open(test_file, 'w') as probe:
        probe.write('Test write to Google Drive')
    print(f"Write test successful: {test_file}")
    os.remove(test_file)
except Exception as write_error:
    print(f"Error writing to Google Drive: {write_error}")
    exit()

# Hardcoded list of FOMC meeting dates from April 2011 to July 2025
fomc_dates = [
    # 2011
    '2011-04-27', '2011-06-22', '2011-08-09', '2011-09-21', '2011-11-02', '2011-12-13',
    # 2012
    '2012-01-25', '2012-03-13', '2012-04-25', '2012-06-20', '2012-08-01', '2012-09-13', '2012-12-12',
    # 2013
    '2013-03-20', '2013-05-01', '2013-06-19', '2013-07-31', '2013-09-18', '2013-10-30', '2013-12-18',
    # 2014
    '2014-01-29', '2014-03-19', '2014-04-30', '2014-06-18', '2014-07-30', '2014-09-17', '2014-10-29', '2014-12-17',
    # 2015
    '2015-01-28', '2015-03-18', '2015-04-29', '2015-06-17', '2015-07-29', '2015-09-17', '2015-10-28', '2015-12-16',
    # 2016
    '2016-01-27', '2016-03-16', '2016-04-27', '2016-06-15', '2016-07-27', '2016-09-21', '2016-11-02', '2016-12-14',
    # 2017
    '2017-02-01', '2017-03-15', '2017-05-03', '2017-06-14', '2017-07-26', '2017-09-20', '2017-11-01', '2017-12-13',
    # 2018
    '2018-01-31', '2018-03-21', '2018-05-02', '2018-06-13', '2018-08-01', '2018-09-26', '2018-11-08', '2018-12-19',
    # 2019
    '2019-01-30', '2019-03-20', '2019-05-01', '2019-06-19', '2019-07-31', '2019-09-18', '2019-10-30', '2019-12-11',
    # 2020
    '2020-01-29', '2020-03-15', '2020-04-29', '2020-06-10', '2020-07-29', '2020-09-16', '2020-11-05', '2020-12-16',
    # 2021
    '2021-01-27', '2021-03-17', '2021-04-28', '2021-06-16', '2021-07-28', '2021-09-22', '2021-11-03', '2021-12-15',
    # 2022
    '2022-01-26', '2022-03-16', '2022-05-04', '2022-06-15', '2022-07-27', '2022-09-21', '2022-11-02', '2022-12-14',
    # 2023
    '2023-02-01', '2023-03-22', '2023-05-03', '2023-06-14', '2023-07-26', '2023-09-20', '2023-11-01', '2023-12-13',
    # 2024
    '2024-01-31', '2024-03-20', '2024-05-01', '2024-06-12', '2024-07-31', '2024-09-18', '2024-11-07', '2024-12-18',
    # 2025
    '2025-01-29', '2025-03-19', '2025-05-07', '2025-06-18', '2025-07-30'
]

# Keep only meetings that have already taken place. The cutoff is the actual
# run time rather than a frozen literal, so dates appended to the list ahead
# of time start being processed automatically once they have passed.
current_date = datetime.now()
fomc_dates = [date for date in fomc_dates if datetime.strptime(date, '%Y-%m-%d') <= current_date]

# DEBUG MODE: flip to True to restrict the run to the newest conference only
DEBUG_MODE = False

if not DEBUG_MODE:
    print(f"Processing {len(fomc_dates)} total dates")
else:
    # Collapse the schedule to its final (most recent) entry for a quick test.
    fomc_dates = [fomc_dates[-1]]
    print(f"DEBUG MODE: Only processing {fomc_dates[0]}")

# Resume support: collect the ids already stored in the CSV by earlier runs
csv_path = os.path.join(output_dir, 'fomc_press_conferences.csv')
existing_dates, existing_ids = set(), set()
try:
    existing_df = pd.read_csv(csv_path)
    existing_ids = set(existing_df['id'])
    print(f"Loaded existing CSV with {len(existing_ids)} records")
except FileNotFoundError:
    # Nothing has been saved at csv_path yet.
    print("No existing CSV found, starting fresh")
except Exception as csv_error:
    # Any other failure (e.g. unreadable file, missing 'id' column): report it
    # and continue with an empty frame; existing_ids stays empty.
    print(f"Error reading existing CSV: {csv_error}")
    existing_df = pd.DataFrame(columns=['id', 'date', 'source_url', 'text', 'speaker'])

# Function to determine speaker based on date
def get_speaker(date_str):
    """Return the chair surname associated with a 'YYYY-MM-DD' date string.

    Dates before 2014-02-01 map to 'Bernanke', dates before 2018-02-01 map to
    'Yellen', and every later date maps to 'Powell'.
    """
    meeting_day = datetime.strptime(date_str, '%Y-%m-%d')
    # (exclusive upper bound, chair) pairs in chronological order
    tenure_cutoffs = (
        (datetime(2014, 2, 1), 'Bernanke'),
        (datetime(2018, 2, 1), 'Yellen'),
    )
    for cutoff, chair in tenure_cutoffs:
        if meeting_day < cutoff:
            return chair
    return 'Powell'

# Function to clean text - removes page numbers and fixes spacing issues
def clean_text(text):
    """Normalise text extracted from a transcript PDF and return the result.

    Applied in order: strip page headers/footers, rejoin a fixed set of words
    that were split after their first letter, collapse runs of spaces/tabs,
    squeeze runs of blank lines, then drop every non-blank line that already
    appeared earlier in the text (blank lines are kept, at most one in a row).
    """
    # Each (pattern, replacement) pair runs on the output of the previous one,
    # so the order of this table is significant.
    substitutions = (
        # Page headers and footers
        (r'Page\s+\d+\s+of\s+\d+', ''),
        (r'\d+\s+of\s+\d+', ''),
        (r'Transcript of Chair.*?Press Conference.*?\d{4}', ''),
        (r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+Chairman.*?Press Conference.*?FINAL', ''),
        (r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+Chair.*?Press Conference.*?FINAL', ''),
        (r'FINAL\s*Page', ''),
        # Known words that PDF extraction splits after the first letter
        (r'\bFederal\s+Open\s+M\s+arket\b', 'Federal Open Market'),
        (r'\bFederal\s+R\s+eserve\b', 'Federal Reserve'),
        (r'\bC\s+ommittee\b', 'Committee'),
        (r'\bM\s+arket\b', 'Market'),
        (r'\bR\s+eserve\b', 'Reserve'),
        (r'\bP\s+owell\b', 'Powell'),
        (r'\bY\s+ellen\b', 'Yellen'),
        (r'\bB\s+ernanke\b', 'Bernanke'),
        # Runs of spaces/tabs become a single space
        (r'[ \t]+', ' '),
        # Runs of blank lines become exactly one blank line
        (r'\n\s*\n+', '\n\n'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    already_kept = set()
    kept = []
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if not candidate:
            # Blank line: allow it only if the previous kept line is not blank.
            if not kept or kept[-1] != '':
                kept.append('')
            continue
        if candidate in already_kept:
            # Repeated line (compared after stripping): drop it.
            continue
        already_kept.add(candidate)
        kept.append(candidate)

    return '\n'.join(kept).strip()

# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_content):
    """Return the text of every page of a PDF, given the PDF as raw bytes.

    Pages for which pdfplumber yields no text are skipped; each remaining
    page's text is followed by a newline.
    """
    with pdfplumber.open(BytesIO(pdf_content)) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Consume the generator while the PDF is still open.
        return ''.join(f'{page_text}\n' for page_text in page_texts if page_text)

# Function to extract both chair's text and other speakers' text from PDF
def extract_text_by_speaker(pdf_content, date_str):
    """Split a press-conference transcript PDF into chair and non-chair text.

    Args:
        pdf_content: Raw bytes of the transcript PDF.
        date_str: Meeting date as 'YYYY-MM-DD'; selects the expected chair
            via get_speaker() and labels the debug output.

    Returns:
        A (chair_text, other_text) tuple of strings, each passed through
        clean_text(). Lines that appear before the first recognised speaker
        label are discarded.
    """
    full_text = extract_text_from_pdf(pdf_content)

    # Log first 500 characters of raw text for debugging
    print(f"Raw text preview for {date_str} (first 500 chars): {full_text[:500]}")

    speaker = get_speaker(date_str).upper()

    # Line prefixes that mark the chair taking the floor.
    chair_labels = (
        f'CHAIR {speaker}.',
        f'CHAIRMAN {speaker}.',
        'CHAIR POWELL.',
        'CHAIR BERNANKE.',
        'CHAIR YELLEN.',
        'CHAIRWOMAN YELLEN.',
    )
    # Fixed prefixes that mark anyone else taking the floor.
    other_labels = ('QUESTION.', 'MS.', 'MR.', 'QUESTIONER.')
    # Generic all-caps name label such as "STEVE LIESMAN.". Hyphens and
    # apostrophes are allowed inside the name so that labels like
    # "NANCY MARSHALL-GENZER." also start a new non-chair turn instead of
    # being appended to whatever the chair was saying.
    name_label = re.compile(r"^[A-Z][A-Z\s'\u2019-]+[A-Z]\.")

    lines_by_speaker = {'chair': [], 'other': []}
    current_speaker = None

    for line in full_text.split('\n'):
        line = line.strip()
        if not line:
            continue

        # Chair labels are tested first because they would also match the
        # generic all-caps name pattern.
        if line.startswith(chair_labels):
            current_speaker = 'chair'
        elif line.startswith(other_labels) or name_label.match(line):
            current_speaker = 'other'
        elif current_speaker is None:
            # Preamble before the first speaker label: not attributed to anyone.
            continue

        lines_by_speaker[current_speaker].append(line)

    # Clean both texts
    chair_text = clean_text('\n'.join(lines_by_speaker['chair']))
    other_text = clean_text('\n'.join(lines_by_speaker['other']))

    print(f"Chair text length for {date_str}: {len(chair_text)}")
    print(f"Other text length for {date_str}: {len(other_text)}")
    print(f"Other text preview for {date_str} (first 200 chars): {other_text[:200]}")

    return chair_text, other_text

# Function to scrape press conference for a given date
def scrape_press_conf(date):
    """Download and parse the press-conference transcript for one meeting date.

    Args:
        date: Meeting date as 'YYYY-MM-DD'.

    Returns:
        A list of zero, one or two record dicts with the keys 'id', 'date',
        'source_url', 'text' and 'speaker'. The list is empty when both
        records are already in existing_ids, when the download or parsing
        raises, or when no sufficiently long text was extracted.
    """
    date_obj = datetime.strptime(date, '%Y-%m-%d')
    date_str = date_obj.strftime('%Y%m%d')

    # Check if both records already exist
    chair_id = f"presschair_{date_str}"
    other_id = f"pressother_{date_str}"

    if chair_id in existing_ids and other_id in existing_ids:
        print(f"Both records for {date} already exist, skipping")
        return []

    # URL format
    url = f'https://www.federalreserve.gov/mediacenter/files/FOMCpresconf{date_str}.pdf'

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        print(f"Response status for {date} at {url}: {response.status_code}, Content length: {len(response.content)}")

        chair_text, other_text = extract_text_by_speaker(response.content, date)

        results = []
        speaker_name = get_speaker(date)

        # Add chair record if it has meaningful content and doesn't exist
        if len(chair_text) > 50 and "CHAIR" in chair_text and chair_id not in existing_ids:
            results.append({
                'id': chair_id,
                'date': date,
                'source_url': url,
                'text': chair_text,
                'speaker': speaker_name
            })
            print(f"Chair record: {len(chair_text)} characters for {date}")

        # Add other speakers record if it has meaningful content and doesn't exist
        if len(other_text) > 50 and other_id not in existing_ids:
            results.append({
                'id': other_id,
                'date': date,
                'source_url': url,
                'text': other_text,
                'speaker': 'Other'
            })
            print(f"Other speakers record: {len(other_text)} characters for {date}")

        if results:
            print(f"Success: Created {len(results)} records for {date}")
            return results
        else:
            print(f"No new records needed for {date}")
            return []

    except requests.exceptions.HTTPError as e:
        # A Response is falsy whenever its status is 4xx/5xx (its truth value
        # mirrors Response.ok), so the check must be against None; otherwise
        # the status code of a failed request would never be reported.
        status = e.response.status_code if e.response is not None else 'No response'
        print(f"HTTP Error for {date} at {url}: {e} (Status: {status})")
    except requests.exceptions.RequestException as e:
        print(f"Request error for {date} at {url}: {e}")
    except Exception as e:
        print(f"General error for {date} at {url}: {e}")

    print(f"Failed to scrape press conference for {date}")
    return []

# Scrape and save data
data = []
successful_scrapes = 0
failed_scrapes = 0
successes_by_year = {}

for date in fomc_dates:
    year = date.split('-')[0]
    if year not in successes_by_year:
        successes_by_year[year] = 0

    results = scrape_press_conf(date)
    if results:
        data.extend(results)
        successful_scrapes += len(results)
        successes_by_year[year] += len(results)
    else:
        failed_scrapes += 1

    # Save incrementally every 10 dates or at the end
    if len(data) >= 20 or (date == fomc_dates[-1] and data):
        new_df = pd.DataFrame(data)
        try:
            if os.path.exists(csv_path):
                existing_df = pd.read_csv(csv_path)
                combined_df = pd.concat([existing_df, new_df], ignore_index=True)
            else:
                combined_df = new_df
            combined_df.to_csv(csv_path, index=False)
            print(f"Saved {len(data)} new records to {csv_path}")
            data = []  # Clear data after saving
        except Exception as e:
            print(f"Error saving to CSV: {e}")

    time.sleep(1)  # Avoid overwhelming the server

# Per-year tally of the records created during this run
print("\nSuccessful scrapes by year:")
for scrape_year in successes_by_year:
    print(f"{scrape_year}: {successes_by_year[scrape_year]} records")

# Run totals, followed by a look at what is now stored on disk
print(f"\nScraping complete: {successful_scrapes} successful records, {failed_scrapes} failed dates")
if os.path.exists(csv_path):
    try:
        final_df = pd.read_csv(csv_path)
        print(f"Final CSV contains {len(final_df)} records")
        size_mb = os.path.getsize(csv_path) / (1024 * 1024)
        print(f"File size: {size_mb:.2f} MB")

        # Count stored records per value of the 'speaker' column
        speaker_counts = final_df['speaker'].value_counts()
        print("\nRecords by speaker:")
        for speaker_label, record_count in speaker_counts.items():
            print(f"{speaker_label}: {record_count} records")

    except Exception as read_error:
        print(f"Error reading final CSV: {read_error}")

Mounting Google Drive...
Mounted at /content/drive
Current working directory: /content/drive/MyDrive/PressConferences
Write test successful: /content/drive/MyDrive/PressConferences/test_write.txt
Processing 113 total dates
No existing CSV found, starting fresh
Response status for 2011-04-27 at https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20110427.pdf: 200, Content length: 76622
