In [None]:
# Colab-only setup: mount Google Drive and make the statements folder the
# working directory for the rest of the notebook.
import os

from google.colab import drive

drive.mount("/content/drive", force_remount=True)

output_dir = '/content/drive/MyDrive/Statements'

# Create the directory if it doesn't exist. exist_ok=True makes this a no-op
# when the folder is already there, so no separate existence check (and no
# check-then-create race) is needed.
os.makedirs(output_dir, exist_ok=True)

os.chdir(output_dir)

# Verify the current working directory
print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Statements


In [None]:
!cd /tmp && pip install beautifulsoup4 html5lib python-dateutil requests pandas
!pip install dropbox

Collecting dropbox
  Downloading dropbox-12.0.2-py3-none-any.whl.metadata (4.3 kB)
Collecting stone<3.3.3,>=2 (from dropbox)
  Downloading stone-3.3.1-py3-none-any.whl.metadata (8.0 kB)
Downloading dropbox-12.0.2-py3-none-any.whl (572 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.1/572.1 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stone-3.3.1-py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.3/162.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stone, dropbox
Successfully installed dropbox-12.0.2 stone-3.3.1


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import time
from google.colab import drive
import re
import uuid
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


# Mount Google Drive
print("Mounting Google Drive...")
drive.mount("/content/drive", force_remount=True)

# Set output directory. exist_ok=True creates the folder when missing and is a
# no-op otherwise, which avoids a separate check-then-create step.
output_dir = '/content/drive/MyDrive/Statements'
os.makedirs(output_dir, exist_ok=True)
os.chdir(output_dir)
print(f"Current working directory: {os.getcwd()}")

# Verify write access to output directory before any scraping is attempted
test_file = os.path.join(output_dir, 'test_write.txt')
try:
    with open(test_file, 'w') as f:
        f.write('Test write to Google Drive')
    print(f"Write test successful: {test_file}")
    os.remove(test_file)
except Exception as e:
    print(f"Error writing to Google Drive: {e}")
    # Re-raise so this cell stops with a traceback. Calling exit() inside a
    # notebook terminates the kernel instead of just aborting the cell.
    raise

# Hardcoded list of FOMC meeting dates from 2000 to January 2025.
# Each entry is the statement release date as 'YYYY-MM-DD', in chronological
# order. 2012-10-24 and 2013-01-30 were missing from the original list and
# have been added so that every listed year through 2024 has eight entries.
fomc_dates = [
    # 2000
    '2000-02-02', '2000-03-21', '2000-05-16', '2000-06-28', '2000-08-22', '2000-10-03', '2000-11-15', '2000-12-19',
    # 2001
    '2001-01-31', '2001-03-20', '2001-05-15', '2001-06-27', '2001-08-21', '2001-10-02', '2001-11-06', '2001-12-11',
    # 2002
    '2002-01-30', '2002-03-19', '2002-05-07', '2002-06-26', '2002-08-13', '2002-09-24', '2002-11-06', '2002-12-10',
    # 2003
    '2003-01-29', '2003-03-18', '2003-05-06', '2003-06-25', '2003-08-12', '2003-09-16', '2003-10-28', '2003-12-09',
    # 2004
    '2004-01-28', '2004-03-16', '2004-05-04', '2004-06-30', '2004-08-10', '2004-09-21', '2004-11-10', '2004-12-14',
    # 2005
    '2005-02-02', '2005-03-22', '2005-05-03', '2005-06-30', '2005-08-09', '2005-09-20', '2005-11-01', '2005-12-13',
    # 2006
    '2006-01-31', '2006-03-28', '2006-05-10', '2006-06-29', '2006-08-08', '2006-09-20', '2006-10-25', '2006-12-12',
    # 2007
    '2007-01-31', '2007-03-21', '2007-05-09', '2007-06-28', '2007-08-07', '2007-09-18', '2007-10-31', '2007-12-11',
    # 2008
    '2008-01-30', '2008-03-18', '2008-04-30', '2008-06-25', '2008-08-05', '2008-09-16', '2008-10-29', '2008-12-16',
    # 2009
    '2009-01-28', '2009-03-18', '2009-04-29', '2009-06-24', '2009-08-12', '2009-09-23', '2009-11-04', '2009-12-16',
    # 2010
    '2010-01-27', '2010-03-16', '2010-04-28', '2010-06-23', '2010-08-10', '2010-09-21', '2010-11-03', '2010-12-14',
    # 2011
    '2011-01-26', '2011-03-15', '2011-04-27', '2011-06-22', '2011-08-09', '2011-09-21', '2011-11-02', '2011-12-13',
    # 2012
    '2012-01-25', '2012-03-13', '2012-04-25', '2012-06-20', '2012-08-01', '2012-09-13', '2012-10-24', '2012-12-12',
    # 2013
    '2013-01-30', '2013-03-20', '2013-05-01', '2013-06-19', '2013-07-31', '2013-09-18', '2013-10-30', '2013-12-18',
    # 2014
    '2014-01-29', '2014-03-19', '2014-04-30', '2014-06-18', '2014-07-30', '2014-09-17', '2014-10-29', '2014-12-17',
    # 2015
    '2015-01-28', '2015-03-18', '2015-04-29', '2015-06-17', '2015-07-29', '2015-09-17', '2015-10-28', '2015-12-16',
    # 2016
    '2016-01-27', '2016-03-16', '2016-04-27', '2016-06-15', '2016-07-27', '2016-09-21', '2016-11-02', '2016-12-14',
    # 2017
    '2017-02-01', '2017-03-15', '2017-05-03', '2017-06-14', '2017-07-26', '2017-09-20', '2017-11-01', '2017-12-13',
    # 2018
    '2018-01-31', '2018-03-21', '2018-05-02', '2018-06-13', '2018-08-01', '2018-09-26', '2018-11-08', '2018-12-19',
    # 2019
    '2019-01-30', '2019-03-20', '2019-05-01', '2019-06-19', '2019-07-31', '2019-09-18', '2019-10-30', '2019-12-11',
    # 2020
    '2020-01-29', '2020-03-15', '2020-04-29', '2020-06-10', '2020-07-29', '2020-09-16', '2020-11-05', '2020-12-16',
    # 2021
    '2021-01-27', '2021-03-17', '2021-04-28', '2021-06-16', '2021-07-28', '2021-09-22', '2021-11-03', '2021-12-15',
    # 2022
    '2022-01-26', '2022-03-16', '2022-05-04', '2022-06-15', '2022-07-27', '2022-09-21', '2022-11-02', '2022-12-14',
    # 2023
    '2023-02-01', '2023-03-22', '2023-05-03', '2023-06-14', '2023-07-26', '2023-09-20', '2023-11-01', '2023-12-13',
    # 2024
    '2024-01-31', '2024-03-20', '2024-05-01', '2024-06-12', '2024-07-31', '2024-09-18', '2024-11-07', '2024-12-18',
    # 2025
    '2025-01-29'
]

# Filter dates to ensure they are on or before the current date, so entries
# that lie in the future are never requested. The clock is read at run time
# (instead of pinning a fixed day) so that dates added to the list later are
# picked up once they have passed.
current_date = datetime.now()
fomc_dates = [date for date in fomc_dates if datetime.strptime(date, '%Y-%m-%d') <= current_date]

# Resume support: gather the dates already stored by a previous run so the
# scraping loop can skip them.
csv_path = os.path.join(output_dir, 'fomc_statements.csv')
existing_dates = set()
try:
    existing_df = pd.read_csv(csv_path)
    existing_dates.update(existing_df['date'])
    print(f"Loaded existing CSV with {len(existing_dates)} records")
except FileNotFoundError:
    # First run: there is nothing to resume from.
    print("No existing CSV found, starting fresh")
except Exception as read_error:
    # Any other failure: report it and continue with an empty frame that has
    # the expected columns; existing_dates stays empty.
    print(f"Error reading existing CSV: {read_error}")
    existing_df = pd.DataFrame(columns=['id', 'date', 'source_url', 'text'])

# Shared HTTP session for all requests: up to 3 retries with a backoff factor
# of 1 on the listed status codes, applied to every https:// URL.
retries = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retries))

# Function to clean text and remove duplicates
def clean_text(text):
    """Normalize scraped page text and drop repeated lines.

    Steps, in order:
      1. Remove the "Please enable JavaScript..." notice up to and including
         the next newline.
      2. Strip the text and collapse runs of blank/whitespace-only lines into
         a single newline.
      3. Keep only the first occurrence of each line, comparing lines with
         surrounding whitespace stripped. Kept lines retain their original
         whitespace; lines that are empty after stripping are always kept.

    Returns the remaining lines joined with newlines.
    """
    without_js_notice = re.sub(
        r'Please enable JavaScript if it is disabled in your browser.*?\n',
        '',
        text,
        flags=re.DOTALL,
    )
    collapsed = re.sub(r'\n\s*\n', '\n', without_js_notice.strip())

    already_emitted = set()
    kept = []
    for raw_line in collapsed.split('\n'):
        key = raw_line.strip()
        if not key:
            # Blank lines are never treated as duplicates.
            kept.append(raw_line)
            continue
        if key in already_emitted:
            continue
        already_emitted.add(key)
        kept.append(raw_line)
    return '\n'.join(kept)

# Function to scrape statements for a given date
def scrape_statements(date):
    """Download and clean the FOMC statement for one meeting date.

    Candidate federalreserve.gov URLs are built for the date and requested in
    order through the module-level ``session``. The first page whose cleaned
    text is longer than 50 characters and mentions
    "Federal Open Market Committee" is returned.

    Args:
        date: Meeting date as a 'YYYY-MM-DD' string.

    Returns:
        A dict with keys 'id', 'date', 'source_url' and 'text' on success, or
        None when no candidate URL yields usable text. Request and parsing
        errors are printed and the next candidate URL is tried.
    """
    date_obj = datetime.strptime(date, '%Y-%m-%d')
    year = date_obj.year
    date_str = date_obj.strftime('%Y%m%d')
    statement_id = f"statement_{date_str}"  # Create ID in format statement_YYYYMMDD

    # Define URL formats based on year and specific dates
    url_monetary = f'https://www.federalreserve.gov/boarddocs/press/monetary/{year}/{date_str}/'
    url_newsevents = f'https://www.federalreserve.gov/newsevents/pressreleases/monetary{date_str}a.htm'
    url_general = f'https://www.federalreserve.gov/boarddocs/press/general/{year}/{date_str}/'

    # Specific URLs for known cases
    specific_urls = {
        '2000-02-02': 'https://www.federalreserve.gov/boarddocs/press/general/2000/20000202/',
        '2002-03-19': 'https://www.federalreserve.gov/boarddocs/press/general/2002/20020319/',
        '2005-05-03': 'https://www.federalreserve.gov/boarddocs/press/monetary/2005/20050503/',
        '2007-06-28': 'https://www.federalreserve.gov/newsevents/pressreleases/monetary20070618a.htm'
    }

    # Select URLs to try based on year and specific cases
    if date in specific_urls:
        urls_to_try = [specific_urls[date]]
    elif year <= 2002:
        urls_to_try = [url_general, url_monetary, url_newsevents]
    elif year <= 2005:
        urls_to_try = [url_monetary, url_newsevents, url_general]
    else:
        urls_to_try = [url_newsevents, url_monetary]

    # Selectors tried in order for each page; the list is the same for every
    # URL, so it is built once outside the loop.
    content_selectors = [
        ('div', {'id': 'content'}),
        ('div', {'class': 'article'}),
        ('div', {'class': 'col-xs-12 col-sm-12 col-md-10'}),
        ('div', {'class': 'panel-body'}),
        ('div', {'class': 'panel panel-default'}),
        ('body', {})
    ]

    for url in urls_to_try:
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            print(f"Response status for {date} at {url}: {response.status_code}, Content length: {len(response.text)}")

            soup = BeautifulSoup(response.text, 'html.parser')

            # Try multiple selectors to extract content
            text = None
            for tag, attrs in content_selectors:
                content = soup.find(tag, attrs)
                if content:
                    text = content.get_text(separator='\n', strip=True)
                    text = clean_text(text)
                    if len(text) > 50 and "Federal Open Market Committee" in text:
                        print(f"Success: Scraped {len(text)} characters for {date} from {url}")
                        return {'id': statement_id, 'date': date, 'source_url': url, 'text': text}

            # Fallback to the whole document text if no selector produced any text
            if not text:
                text = soup.get_text(separator='\n', strip=True)
                text = clean_text(text)
                if len(text) > 50 and "Federal Open Market Committee" in text:
                    print(f"Success (fallback): Scraped {len(text)} characters for {date} from {url}")
                    return {'id': statement_id, 'date': date, 'source_url': url, 'text': text}

            print(f"No meaningful content found for {date} at {url} (text length: {len(text) if text else 0})")
        except requests.exceptions.HTTPError as e:
            # A requests.Response is falsy for 4xx/5xx statuses, so test for
            # None explicitly; a plain truthiness check would always report
            # 'No response' for the errors raised by raise_for_status().
            status = e.response.status_code if e.response is not None else 'No response'
            print(f"HTTP Error for {date} at {url}: {e} (Status: {status})")
        except requests.exceptions.RequestException as e:
            print(f"Request error for {date} at {url}: {e}")
        except Exception as e:
            print(f"General error for {date} at {url}: {e}")

    print(f"Failed to scrape statement for {date}")
    return None

# Scrape and save data
data = []  # records scraped but not yet written to the CSV
successful_scrapes = 0
failed_scrapes = 0

for date in fomc_dates:
    already_saved = date in existing_dates
    if already_saved:
        print(f"Skipping {date}: already in CSV")
    else:
        result = scrape_statements(date)
        if result:
            data.append(result)
            successful_scrapes += 1
        else:
            failed_scrapes += 1

    # Save incrementally every 10 dates or at the end. This check also runs
    # for skipped dates: if the final date is already in the CSV, any records
    # still pending from earlier dates must be flushed here or they are lost.
    if len(data) >= 10 or (date == fomc_dates[-1] and data):
        new_df = pd.DataFrame(data)
        try:
            if os.path.exists(csv_path):
                existing_df = pd.read_csv(csv_path)
                combined_df = pd.concat([existing_df, new_df], ignore_index=True)
            else:
                combined_df = new_df
            combined_df.to_csv(csv_path, index=False)
            print(f"Saved {len(data)} new records to {csv_path}")
            data = []  # Clear data after saving
        except Exception as e:
            # Pending records are kept, so the next iteration retries the save.
            print(f"Error saving to CSV: {e}")

    if not already_saved:
        time.sleep(1)  # Avoid overwhelming the server

# Print summary
print(f"\nScraping complete: {successful_scrapes} successful, {failed_scrapes} failed")
if os.path.exists(csv_path):
    try:
        final_df = pd.read_csv(csv_path)
        print(f"Final CSV contains {len(final_df)} records")
        print(f"File size: {os.path.getsize(csv_path) / (1024 * 1024):.2f} MB")
    except Exception as e:
        print(f"Error reading final CSV: {e}")

In [None]:
#@title Additional Clean
import pandas as pd
import re
import os

# Define paths
input_file = '/content/drive/MyDrive/FedComs/Statements/fomc_statements.csv'
output_dir = '/content/drive/MyDrive/Statements'

# The scraping cell of this notebook writes fomc_statements.csv into
# output_dir, not into the FedComs folder above. When the FedComs copy is
# absent, fall back to the scraper's output so the notebook still runs top to
# bottom; when it exists, it is used exactly as before.
if not os.path.exists(input_file):
    input_file = os.path.join(output_dir, 'fomc_statements.csv')

# Read the original statements
print("Reading FOMC statements...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} statements")

# Delete statement_20081216
# NOTE(review): the reason for excluding this statement is not recorded in
# this notebook - confirm and document it.
print("\nDeleting statement_20081216...")
df = df[df['id'] != 'statement_20081216'].copy()
print(f"Statements remaining: {len(df)}")

# Convert date to datetime for easier filtering
df['date'] = pd.to_datetime(df['date'])

def clean_text_old(text, date):
    """Trim surrounding page text from statements dated 2005-12-13 or earlier.

    Removes everything up to and including the first "For immediate release",
    then everything from the first "<year> Monetary policy" onward, where
    <year> is ``date.year``. Both markers are matched case-insensitively; a
    missing marker leaves that side of the text untouched.

    Args:
        text: Statement text to trim.
        date: Object with a ``year`` attribute for the statement's date.

    Returns:
        The trimmed text with leading/trailing whitespace stripped.
    """
    # maxsplit=1 yields [before, after] when the marker is present, else [text].
    header_split = re.split(r'For immediate release', text, maxsplit=1, flags=re.IGNORECASE)
    if len(header_split) == 2:
        text = header_split[1]

    footer_pattern = rf'{date.year}\s+Monetary policy'
    # Element 0 is the part before the marker, or the whole text if absent.
    text = re.split(footer_pattern, text, maxsplit=1, flags=re.IGNORECASE)[0]

    return text.strip()

def clean_text_middle(text):
    """Trim surrounding page text from statements dated 2006-01-31 to 2020-03-05.

    Removes everything up to and including the first occurrence of "share",
    then everything from the first "Last Update" onward. Both markers are
    matched case-insensitively as plain substrings; a missing marker leaves
    that side of the text untouched.

    Returns:
        The trimmed text with leading/trailing whitespace stripped.
    """
    # maxsplit=1 yields [before, after] when the marker is present, else [text].
    leading_split = re.split(r'share', text, maxsplit=1, flags=re.IGNORECASE)
    if len(leading_split) == 2:
        text = leading_split[1]

    # Element 0 is the part before the marker, or the whole text if absent.
    text = re.split(r'Last Update', text, maxsplit=1, flags=re.IGNORECASE)[0]

    return text.strip()

def clean_text_recent(text):
    """Trim surrounding page text from statements dated after 2020-03-05.

    Leading cut: if "For release" occurs, cut through the first "share" that
    follows it (no leading cut when none follows); if "For release" does not
    occur, cut through the first "share" anywhere in the text.
    Trailing cuts: drop everything from the first "Last Update" onward, then
    everything from the first "For media inquiries" onward.
    All markers are matched case-insensitively as plain substrings.

    Returns:
        The trimmed text with leading/trailing whitespace stripped.
    """
    share_re = re.compile(r'share', re.IGNORECASE)
    release = re.search(r'For release', text, re.IGNORECASE)

    if release is None:
        share = share_re.search(text)
    else:
        # Start scanning right after "For release"; offsets stay absolute.
        share = share_re.search(text, release.end())

    if share is not None:
        text = text[share.end():]

    # Each split keeps only the part before the marker (or all of it if absent).
    for trailing_marker in (r'Last Update', r'For media inquiries'):
        text = re.split(trailing_marker, text, maxsplit=1, flags=re.IGNORECASE)[0]

    return text.strip()

# Apply cleaning based on date
print("\nCleaning statements...")
cleaned_texts = []

# Progress is counted by position via enumerate. The index labels returned by
# iterrows() are not guaranteed to be contiguous once rows have been filtered
# out of the frame, so they would misreport how many rows have been processed.
for position, (_, row) in enumerate(df.iterrows()):
    if position % 20 == 0:
        print(f"Processing statement {position+1}/{len(df)}...")

    date = row['date']
    text = row['text']

    # Apply appropriate cleaning function based on date
    if date <= pd.Timestamp('2005-12-13'):
        cleaned_text = clean_text_old(text, date)
    elif date <= pd.Timestamp('2020-03-05'):
        cleaned_text = clean_text_middle(text)
    else:
        cleaned_text = clean_text_recent(text)

    cleaned_texts.append(cleaned_text)

# Create new dataframe with cleaned text (cleaned_texts is in row order)
df_cleaned = df.copy()
df_cleaned['text'] = cleaned_texts

# Convert date back to string format to match original
df_cleaned['date'] = df_cleaned['date'].dt.strftime('%Y-%m-%d')

# Save cleaned statements
output_file = os.path.join(output_dir, 'fomc_statements_cleaned.csv')
df_cleaned.to_csv(output_file, index=False)

print(f"\nCleaned statements saved to: {output_file}")
print(f"Total statements: {len(df_cleaned)}")

# Display some statistics about the cleaning
print("\n" + "="*70)
print("CLEANING STATISTICS")
print("="*70)

# Calculate average text length before and after
original_lengths = df['text'].str.len()
cleaned_lengths = df_cleaned['text'].str.len()

print(f"\nAverage original text length: {original_lengths.mean():.0f} characters")
print(f"Average cleaned text length: {cleaned_lengths.mean():.0f} characters")
print(f"Average reduction: {(original_lengths.mean() - cleaned_lengths.mean()):.0f} characters ({((1 - cleaned_lengths.mean()/original_lengths.mean())*100):.1f}%)")


def _print_period_example(title, period_rows, position):
    """Print the date and first 200 characters of one row of ``period_rows``.

    Args:
        title: Heading printed before the example.
        period_rows: DataFrame slice with 'date' and 'text' columns.
        position: Positional index of the row to show (e.g. 0 or -1).

    Returns:
        The selected row, or None when ``period_rows`` is empty (in which
        case a short notice is printed instead of raising IndexError).
    """
    print(f"\n{title}")
    if period_rows.empty:
        print("No statements in this period")
        return None
    example = period_rows.iloc[position]
    print(f"Date: {example['date']}")
    print(f"First 200 characters: {example['text'][:200]}...")
    return example


# Show examples from each period
print("\n" + "="*70)
print("EXAMPLES OF CLEANED TEXT")
print("="*70)

# 'date' holds 'YYYY-MM-DD' strings here, so string comparison orders them
# chronologically.

# Example from old period (2005 and prior): last row of the period
old_example = _print_period_example(
    "OLD PERIOD (2005 and prior)",
    df_cleaned[df_cleaned['date'] <= '2005-12-13'],
    -1,
)

# Example from middle period (2006-2020-03-05): first row of the period
middle_example = _print_period_example(
    "MIDDLE PERIOD (2006 to March 2020)",
    df_cleaned[(df_cleaned['date'] > '2005-12-13') & (df_cleaned['date'] <= '2020-03-05')],
    0,
)

# Example from recent period (after 2020-03-05): first row of the period
recent_example = _print_period_example(
    "RECENT PERIOD (after March 2020)",
    df_cleaned[df_cleaned['date'] > '2020-03-05'],
    0,
)

print("\n" + "="*70)
print("Cleaning complete!")

Reading FOMC statements...
Loaded 199 statements

Deleting statement_20081216...
Statements remaining: 198

Cleaning statements...
Processing statement 1/198...
Processing statement 21/198...
Processing statement 41/198...
Processing statement 61/198...
Processing statement 81/198...
Processing statement 101/198...
Processing statement 121/198...
Processing statement 141/198...
Processing statement 161/198...
Processing statement 181/198...

Cleaned statements saved to: /content/drive/MyDrive/Statements/fomc_statements_cleaned.csv
Total statements: 198

CLEANING STATISTICS

Average original text length: 4247 characters
Average cleaned text length: 2621 characters
Average reduction: 1627 characters (38.3%)

EXAMPLES OF CLEANED TEXT

OLD PERIOD (2005 and prior)
Date: 2005-12-13
First 200 characters: The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/4 percent.
Despite elevated energy prices and hurricane-related disrup