In [None]:
import os

from google.colab import drive

# Mount Google Drive, remounting even if it is already mounted.
drive.mount("/content/drive", force_remount=True)

output_dir = '/content/drive/MyDrive/Minutes'

# Create the output folder when it is missing (no-op when it already exists).
os.makedirs(output_dir, exist_ok=True)

# Work from inside the output folder.
os.chdir(output_dir)

# Verify the current working directory
print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Minutes


In [None]:
!cd /tmp && pip install beautifulsoup4 html5lib python-dateutil requests pandas



In [None]:
# Scrape minutes
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import time
from google.colab import drive
import re
import uuid
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import csv
import sys

# Mount Google Drive and make sure the folder that receives the scraped CSV exists.
drive.mount('/content/drive')
output_dir = '/content/drive/MyDrive/Minutes'
os.makedirs(output_dir, exist_ok=True)

# Increase CSV field size limit for very long FOMC minutes
# (a single minutes text runs to tens of thousands of characters).
csv.field_size_limit(sys.maxsize)

# Hardcoded list of FOMC meeting dates from 2000 to January 2025.
# Each entry is the final day of the meeting in ISO 'YYYY-MM-DD' form; the
# scraper turns it into the YYYYMMDD token used in the minutes URL.
# Every full year has eight scheduled meetings (2020 lists the 15 March
# emergency meeting in the March slot).
fomc_dates = [
    # 2000
    '2000-02-02', '2000-03-21', '2000-05-16', '2000-06-28', '2000-08-22', '2000-10-03', '2000-11-15', '2000-12-19',
    # 2001
    '2001-01-31', '2001-03-20', '2001-05-15', '2001-06-27', '2001-08-21', '2001-10-02', '2001-11-06', '2001-12-11',
    # 2002
    '2002-01-30', '2002-03-19', '2002-05-07', '2002-06-26', '2002-08-13', '2002-09-24', '2002-11-06', '2002-12-10',
    # 2003
    '2003-01-29', '2003-03-18', '2003-05-06', '2003-06-25', '2003-08-12', '2003-09-16', '2003-10-28', '2003-12-09',
    # 2004
    '2004-01-28', '2004-03-16', '2004-05-04', '2004-06-30', '2004-08-10', '2004-09-21', '2004-11-10', '2004-12-14',
    # 2005
    '2005-02-02', '2005-03-22', '2005-05-03', '2005-06-30', '2005-08-09', '2005-09-20', '2005-11-01', '2005-12-13',
    # 2006
    '2006-01-31', '2006-03-28', '2006-05-10', '2006-06-29', '2006-08-08', '2006-09-20', '2006-10-25', '2006-12-12',
    # 2007
    '2007-01-31', '2007-03-21', '2007-05-09', '2007-06-28', '2007-08-07', '2007-09-18', '2007-10-31', '2007-12-11',
    # 2008
    '2008-01-30', '2008-03-18', '2008-04-30', '2008-06-25', '2008-08-05', '2008-09-16', '2008-10-29', '2008-12-16',
    # 2009
    '2009-01-28', '2009-03-18', '2009-04-29', '2009-06-24', '2009-08-12', '2009-09-23', '2009-11-04', '2009-12-16',
    # 2010
    '2010-01-27', '2010-03-16', '2010-04-28', '2010-06-23', '2010-08-10', '2010-09-21', '2010-11-03', '2010-12-14',
    # 2011
    '2011-01-26', '2011-03-15', '2011-04-27', '2011-06-22', '2011-08-09', '2011-09-21', '2011-11-02', '2011-12-13',
    # 2012 (the October 23-24 meeting was previously missing)
    '2012-01-25', '2012-03-13', '2012-04-25', '2012-06-20', '2012-08-01', '2012-09-13', '2012-10-24', '2012-12-12',
    # 2013 (the January 29-30 meeting was previously missing)
    '2013-01-30', '2013-03-20', '2013-05-01', '2013-06-19', '2013-07-31', '2013-09-18', '2013-10-30', '2013-12-18',
    # 2014
    '2014-01-29', '2014-03-19', '2014-04-30', '2014-06-18', '2014-07-30', '2014-09-17', '2014-10-29', '2014-12-17',
    # 2015
    '2015-01-28', '2015-03-18', '2015-04-29', '2015-06-17', '2015-07-29', '2015-09-17', '2015-10-28', '2015-12-16',
    # 2016
    '2016-01-27', '2016-03-16', '2016-04-27', '2016-06-15', '2016-07-27', '2016-09-21', '2016-11-02', '2016-12-14',
    # 2017
    '2017-02-01', '2017-03-15', '2017-05-03', '2017-06-14', '2017-07-26', '2017-09-20', '2017-11-01', '2017-12-13',
    # 2018
    '2018-01-31', '2018-03-21', '2018-05-02', '2018-06-13', '2018-08-01', '2018-09-26', '2018-11-08', '2018-12-19',
    # 2019
    '2019-01-30', '2019-03-20', '2019-05-01', '2019-06-19', '2019-07-31', '2019-09-18', '2019-10-30', '2019-12-11',
    # 2020
    '2020-01-29', '2020-03-15', '2020-04-29', '2020-06-10', '2020-07-29', '2020-09-16', '2020-11-05', '2020-12-16',
    # 2021
    '2021-01-27', '2021-03-17', '2021-04-28', '2021-06-16', '2021-07-28', '2021-09-22', '2021-11-03', '2021-12-15',
    # 2022
    '2022-01-26', '2022-03-16', '2022-05-04', '2022-06-15', '2022-07-27', '2022-09-21', '2022-11-02', '2022-12-14',
    # 2023
    '2023-02-01', '2023-03-22', '2023-05-03', '2023-06-14', '2023-07-26', '2023-09-20', '2023-11-01', '2023-12-13',
    # 2024
    '2024-01-31', '2024-03-20', '2024-05-01', '2024-06-12', '2024-07-31', '2024-09-18', '2024-11-07', '2024-12-18',
    # 2025
    '2025-01-29'
]

# TOGGLE TO DEBUG
years_to_process = None  # Change to desired year(s), e.g., [2000] or [2024, 2025], or None for all
if years_to_process is not None:
    # Dates are 'YYYY-MM-DD' strings, so the part before the first '-' is the year.
    fomc_dates = [date for date in fomc_dates if int(date.split('-')[0]) in years_to_process]
    print(f"Filtering to process only years: {years_to_process}")
else:
    print("Processing all available years")

# Keep only meetings dated on or before "now". The cut-off is read from the
# clock at run time: a hard-coded date would silently drop any meeting that is
# appended to fomc_dates after that date.
current_date = datetime.now()
fomc_dates = [date for date in fomc_dates if datetime.strptime(date, '%Y-%m-%d') <= current_date]

# Load existing CSV if it exists, so already-scraped dates can be skipped.
csv_path = os.path.join(output_dir, 'fomc_minutes.csv')
existing_dates = set()
existing_df = None
try:
    existing_df = pd.read_csv(csv_path)
    existing_dates = set(existing_df['date'])
    print(f"Loaded existing CSV with {len(existing_dates)} records")
except FileNotFoundError:
    print("No existing CSV found, starting fresh")
    existing_df = pd.DataFrame(columns=['id', 'date', 'source_url', 'text'])
except Exception as e:
    print(f"Error reading existing CSV: {e}")
    # The file exists but could not be used. The next save rewrites csv_path
    # from scratch, which would destroy whatever is in it, so move the
    # unreadable file aside first instead of letting it be overwritten.
    backup_path = f"{csv_path}.unreadable-{datetime.now():%Y%m%d%H%M%S}"
    try:
        os.replace(csv_path, backup_path)
        print(f"Moved unreadable CSV to {backup_path}; starting fresh")
    except OSError as backup_error:
        print(f"Could not move unreadable CSV aside ({backup_error}); it will be overwritten on save")
    existing_dates = set()
    existing_df = pd.DataFrame(columns=['id', 'date', 'source_url', 'text'])

# Shared HTTP session: sends a browser-like User-Agent and retries https://
# requests (up to 3 times, with backoff) on 429 and 5xx responses.
retries = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
session.mount('https://', HTTPAdapter(max_retries=retries))

# Function to clean text and preserve sentence structure
def clean_text(text):
    """Strip site boilerplate from scraped page text and normalise its whitespace.

    Args:
        text: Raw text extracted from a minutes page. May be empty or None.

    Returns:
        The cleaned text: boilerplate fragments removed, runs of spaces/tabs
        collapsed to one space, every line stripped and empty lines dropped
        (remaining lines are joined with a single newline). Returns "" when
        ``text`` is falsy.
    """
    if not text:
        return ""

    # Remove common boilerplate patterns but preserve paragraph structure.
    # Each pattern deletes from its marker to the end of that line.
    boilerplate_patterns = [
        r'Home \| FOMC.*?(?=\n|$)',
        r'Accessibility.*?(?=\n|$)',
        r'To comment on this site.*?(?=\n|$)',
        r'Last update:.*?(?=\n|$)',
        r'Return to top.*?(?=\n|$)',
        r'Back to Top.*?(?=\n|$)',
        r'PDF.*?FOMC.*?Minutes.*?(?=\n|$)',
        r'Federal Reserve System.*?Board of Governors.*?(?=\n|$)',
        r'Skip to main content.*?(?=\n|$)',
        r'Print\s*Email\s*Share.*?(?=\n|$)',
    ]

    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Replace multiple newlines with double newline (paragraph break)
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    # Insert the missing space between two sentences that were glued together
    # ("rose.Growth" -> "rose. Growth"). The period must follow a lowercase
    # letter, digit or closing bracket and be immediately followed by a capital,
    # so abbreviations such as "U.S." are left intact and existing whitespace
    # (including line breaks) is never rewritten.
    text = re.sub(r'(?<=[a-z0-9\)\]])([.!?])(?=[A-Z])', r'\1 ', text)

    # Clean up excessive spaces while preserving single spaces
    text = re.sub(r'[ \t]+', ' ', text)

    # Remove leading/trailing whitespace from each line but preserve line breaks
    lines = [line.strip() for line in text.split('\n')]
    text = '\n'.join(line for line in lines if line)

    return text.strip()

# Robust CSV saving function
def save_data_robust(data, csv_path, existing_df=None):
    """Append newly scraped records to the minutes CSV and rewrite the file.

    The file is first written to ``<csv_path>.tmp`` and then moved over
    ``csv_path`` with ``os.replace``, so a failure part-way through a write
    leaves the previous CSV in place instead of a truncated one.

    Args:
        data: List of dicts with the keys 'id', 'date', 'source_url', 'text'.
        csv_path: Destination CSV path.
        existing_df: DataFrame of records already saved, or None.

    Returns:
        The combined DataFrame after a successful save. ``existing_df``
        unchanged when ``data`` is empty or when both save attempts fail.
    """
    if not data:
        return existing_df

    tmp_path = f"{csv_path}.tmp"

    try:
        # Convert to DataFrame
        new_df = pd.DataFrame(data)

        # Handle text length - warn about very long entries
        text_lengths = new_df['text'].map(lambda value: len(str(value)) if pd.notna(value) else 0)
        for date, text_len in zip(new_df['date'], text_lengths):
            if text_len > 50000:
                print(f"Processing large text for {date}: {text_len} characters")

        # Combine with existing data
        if existing_df is not None and len(existing_df) > 0:
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        else:
            combined_df = new_df

        # Quote every field and rely on the default quote-doubling. No
        # escapechar is set: with one, backslashes in the text are doubled on
        # write and a plain pd.read_csv does not turn them back.
        combined_df.to_csv(tmp_path, index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
        os.replace(tmp_path, csv_path)
        print(f"Successfully saved {len(new_df)} new records to {csv_path} (total: {len(combined_df)})")

        return combined_df

    except Exception as e:
        print(f"Error saving to CSV with pandas: {e}")
        print("Attempting fallback CSV save method...")

        # Fallback: save using pure CSV writer
        try:
            with open(tmp_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f, quoting=csv.QUOTE_ALL)
                writer.writerow(['id', 'date', 'source_url', 'text'])

                if existing_df is not None and len(existing_df) > 0:
                    for _, row in existing_df.iterrows():
                        writer.writerow([row['id'], row['date'], row['source_url'], row['text']])

                for item in data:
                    writer.writerow([item['id'], item['date'], item['source_url'], item['text']])

            os.replace(tmp_path, csv_path)
            print(f"Fallback save successful: {len(data)} new records")
            # Read back to return as DataFrame
            return pd.read_csv(csv_path)
        except Exception as e2:
            print(f"Fallback save also failed: {e2}")
            # Best-effort removal of a half-written temporary file.
            try:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
            except OSError:
                pass
            return existing_df

# Enhanced function to extract main content
def extract_main_content(soup):
    """Extract the main body text of a minutes page.

    Three strategies are tried in order and the first one that yields more
    than 500 characters wins:
      1. a list of known content containers;
      2. the div/article/section holding the most text;
      3. all <p> elements joined together (only when there are more than 5).

    Note that nav/header/footer/script/style elements are removed from
    ``soup`` in place while searching.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The extracted text, or None if no strategy produced enough text.
    """

    # Strategy 1: Try specific content containers (most reliable)
    content_selectors = [
        ('div', {'id': 'article'}),
        ('div', {'class': 'col-xs-12 col-sm-8 col-md-8'}),
        ('div', {'class': 'panel panel-default'}),
        ('div', {'class': 'row'}),
        ('article', {}),
        ('main', {}),
    ]

    for tag, attrs in content_selectors:
        content = soup.find(tag, attrs)
        if content:
            # Remove navigation, footer, and other non-content elements
            for element in content.find_all(['nav', 'header', 'footer', 'script', 'style']):
                element.decompose()

            text = content.get_text(separator=' ', strip=False)
            if len(text) > 500:  # Meaningful content threshold
                return text

    # Strategy 2: Find the largest text block (for cases where structure varies)
    all_divs = soup.find_all(['div', 'article', 'section'])
    max_text = ""
    max_length = 0

    for div in all_divs:
        # The candidate list was collected before any clean-up, so an entry
        # that sat inside a nav/header/footer removed below has already been
        # destroyed by decompose(). Its attributes can no longer be read
        # safely, so skip it.
        if getattr(div, 'decomposed', False):
            continue

        # Skip navigation and footer areas
        div_class = ' '.join(div.get('class', [])).lower()
        div_id = div.get('id', '').lower()

        if any(skip in div_class + div_id for skip in ['nav', 'header', 'footer', 'sidebar', 'menu']):
            continue

        # Remove scripts and styles
        for element in div.find_all(['script', 'style', 'nav', 'header', 'footer']):
            element.decompose()

        text = div.get_text(separator=' ', strip=False)
        if len(text) > max_length:
            max_length = len(text)
            max_text = text

    if max_length > 500:
        return max_text

    # Strategy 3: Get all paragraphs (fallback)
    paragraphs = soup.find_all('p')
    if len(paragraphs) > 5:  # If there are enough paragraphs
        text = ' '.join([p.get_text(separator=' ', strip=False) for p in paragraphs])
        if len(text) > 500:
            return text

    return None

# Function to scrape minutes for a given date
def scrape_minutes(date):
    """Download, extract and clean the FOMC minutes for one meeting date.

    Args:
        date: Meeting date as a 'YYYY-MM-DD' string.

    Returns:
        A dict with the keys 'id', 'date', 'source_url' and 'text' for the
        first candidate URL that yields more than 500 characters of cleaned
        text, or None when every candidate URL fails. Request and parsing
        errors are printed, not raised.
    """
    date_obj = datetime.strptime(date, '%Y-%m-%d')
    year = date_obj.year
    date_str = date_obj.strftime('%Y%m%d')

    # Define URL formats
    url_old = f'https://www.federalreserve.gov/fomc/minutes/{date_str}.htm'
    url_new = f'https://www.federalreserve.gov/monetarypolicy/fomcminutes{date_str}.htm'
    url_alt = f'https://www.federalreserve.gov/monetarypolicy/fomc{date_str}.htm'

    # For 2000-2003, use url_old; for 2004-2007, try url_old then url_new;
    # for 2008, try url_alt, url_new, then url_old; for >2008, use url_new
    if year < 2004:
        urls_to_try = [url_old]
    elif year <= 2007:
        urls_to_try = [url_old, url_new]
    elif year == 2008:
        urls_to_try = [url_alt, url_new, url_old]
    else:
        urls_to_try = [url_new]

    for url in urls_to_try:
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()

            # Hand the parser the raw bytes rather than response.text.
            # response.text is decoded with the charset taken from the HTTP
            # header (falling back to ISO-8859-1 for text responses that
            # declare none), which turns UTF-8 dashes and quotes into
            # sequences like "â..." whenever that guess is wrong. From bytes
            # the parser can use the encoding the document itself declares.
            soup = BeautifulSoup(response.content, 'html5lib')  # Use html5lib for better parsing

            # Extract main content using enhanced function
            text = extract_main_content(soup)

            if text:
                # Clean the text while preserving structure
                text = clean_text(text)

                if len(text) > 500:  # Reasonable minimum for FOMC minutes
                    print(f"✓ Success: Scraped {len(text)} characters for {date} from {url}")
                    return {
                        'id': f"minutes_{date.replace('-', '')}",
                        'date': date,
                        'source_url': url,
                        'text': text
                    }
                else:
                    print(f"⚠ Warning: Text too short ({len(text)} chars) for {date} at {url}")
            else:
                print(f"⚠ No content extracted for {date} at {url}")

        except requests.exceptions.HTTPError as e:
            # e.response is normally set by raise_for_status(); guard anyway so
            # the error report itself cannot raise.
            status_code = e.response.status_code if e.response is not None else 'unknown'
            if status_code == 404:
                print(f"✗ 404 Not Found: {url}")
            else:
                print(f"✗ HTTP Error {status_code}: {url}")
        except requests.exceptions.RequestException as e:
            print(f"✗ Request error for {date} at {url}: {e}")
        except Exception as e:
            print(f"✗ General error for {date} at {url}: {e}")

    print(f"✗ Failed to scrape minutes for {date} (tried {len(urls_to_try)} URLs)")
    return None

# Scrape and save data
data = []  # Records scraped since the last save
successful_scrapes = 0
failed_scrapes = 0
scrape_sizes = []  # Track sizes for reporting

print(f"\nStarting scrape of {len(fomc_dates)} FOMC meeting dates...")
print(f"{'=' * 60}\n")

try:
    for idx, date in enumerate(fomc_dates, 1):
        print(f"[{idx}/{len(fomc_dates)}] Processing {date}...")

        if date in existing_dates:
            print(f"→ Skipping {date}: already in CSV\n")
            continue

        result = scrape_minutes(date)
        if result:
            data.append(result)
            successful_scrapes += 1
            scrape_sizes.append({'date': date, 'size': len(result['text'])})
        else:
            failed_scrapes += 1

        print()  # Blank line for readability

        # Save incrementally every 10 new records
        if len(data) >= 10:
            existing_df = save_data_robust(data, csv_path, existing_df)
            data = []  # Clear data after saving

        time.sleep(1)  # Avoid overwhelming the server
finally:
    # Flush whatever is still pending. This must not be tied to reaching the
    # last date inside the loop: when that date is already in the CSV the loop
    # hits `continue` first and the remaining (<10) records would never be
    # written. Running it in `finally` also keeps them if the run is interrupted.
    if data:
        existing_df = save_data_robust(data, csv_path, existing_df)
        data = []

# Report the outcome of this run, then inspect the CSV that is now on disk.
divider = '=' * 60
print(f"\n{divider}")
print("SCRAPING COMPLETE")
print(divider)
print(f"✓ Successful: {successful_scrapes}")
print(f"✗ Failed: {failed_scrapes}")
print(f"Total processed: {successful_scrapes + failed_scrapes}")

if os.path.exists(csv_path):
    try:
        final_df = pd.read_csv(csv_path)
        size_mb = os.path.getsize(csv_path) / (1024 * 1024)
        print(f"\nFinal CSV contains {len(final_df)} records")
        print(f"File size: {size_mb:.2f} MB")

        # The shortest texts scraped in this session are the likeliest bad extractions.
        if scrape_sizes:
            print(f"\n{divider}")
            print("10 SHORTEST SCRAPES FROM THIS SESSION")
            print(divider)
            sorted_sizes = sorted(scrape_sizes, key=lambda entry: entry['size'])[:10]
            for rank, entry in enumerate(sorted_sizes, 1):
                print(f"{rank:2}. {entry['date']}: {entry['size']:,} characters")

        # Requested dates that are still absent from the CSV.
        scraped_dates = set(final_df['date'])
        missing_dates = [wanted for wanted in fomc_dates if wanted not in scraped_dates]
        if missing_dates:
            print(f"\n⚠ Still missing {len(missing_dates)} dates:")
            for missing in missing_dates[:10]:  # Show first 10
                print(f"  - {missing}")
            not_shown = len(missing_dates) - 10
            if not_shown > 0:
                print(f"  ... and {not_shown} more")
    except Exception as e:
        print(f"Error reading final CSV: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing all available years
No existing CSV found, starting fresh

Starting scrape of 199 FOMC meeting dates...

[1/199] Processing 2000-02-02...
✓ Success: Scraped 42297 characters for 2000-02-02 from https://www.federalreserve.gov/fomc/minutes/20000202.htm

[2/199] Processing 2000-03-21...
✓ Success: Scraped 28608 characters for 2000-03-21 from https://www.federalreserve.gov/fomc/minutes/20000321.htm

[3/199] Processing 2000-05-16...
✓ Success: Scraped 25577 characters for 2000-05-16 from https://www.federalreserve.gov/fomc/minutes/20000516.htm

[4/199] Processing 2000-06-28...
✓ Success: Scraped 27408 characters for 2000-06-28 from https://www.federalreserve.gov/fomc/minutes/20000628.htm

[5/199] Processing 2000-08-22...
✓ Success: Scraped 26877 characters for 2000-08-22 from https://www.federalreserve.gov/fomc/minutes/20000822.htm

[6/199] Processing 2

In [None]:
#@title Additional Clean
import os
import re

import pandas as pd

# `os` is imported here because this cell calls os.path.join when it writes the
# cleaned file; without it the cell only runs if an earlier cell imported `os`.

# Folder holding both the scraped input and the cleaned output.
# NOTE(review): the scraping cell writes to /content/drive/MyDrive/Minutes, not
# to this FedComs folder - confirm which location is intended.
output_dir = '/content/drive/MyDrive/FedComs/Minutes'
input_file = os.path.join(output_dir, 'fomc_minutes.csv')

print("Reading FOMC minutes...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} minutes")

# Parse the date column so the cleaning rules can compare against timestamps.
df['date'] = pd.to_datetime(df['date'])

def fix_text_encoding(text):
    """Fix common text encoding issues from web scraping.

    Repairs UTF-8 punctuation that was decoded with a single-byte charset
    (the sequences starting with U+00E2), converts curly quotes to straight
    quotes and the ellipsis character to '...', and strips control characters
    other than tab, newline and carriage return. En and em dashes are kept
    as they are.

    Args:
        text: Text to repair. Anything that is not a str (e.g. NaN from an
            empty CSV cell) is treated as empty.

    Returns:
        The repaired string ("" for non-string input).
    """
    if not isinstance(text, str):
        return ""

    # Each entry maps one full three-character mojibake sequence to the
    # character it stands for. Matching the whole sequence - instead of the
    # leading U+00E2 alone - keeps quotes from turning into dashes and leaves
    # a genuine "a with circumflex" untouched. Written with escapes so the
    # invisible C1 control characters cannot be lost when the file is edited.
    mojibake_replacements = (
        # UTF-8 bytes decoded as ISO-8859-1
        ('\u00e2\u0080\u0093', '\u2013'),  # en dash
        ('\u00e2\u0080\u0094', '\u2014'),  # em dash
        ('\u00e2\u0080\u0098', "'"),       # left single quote
        ('\u00e2\u0080\u0099', "'"),       # right single quote
        ('\u00e2\u0080\u009c', '"'),       # left double quote
        ('\u00e2\u0080\u009d', '"'),       # right double quote
        ('\u00e2\u0080\u00a6', '...'),     # ellipsis
        # UTF-8 bytes decoded as Windows-1252
        ('\u00e2\u20ac\u201c', '\u2013'),  # en dash
        ('\u00e2\u20ac\u201d', '\u2014'),  # em dash
        ('\u00e2\u20ac\u02dc', "'"),       # left single quote
        ('\u00e2\u20ac\u2122', "'"),       # right single quote
        ('\u00e2\u20ac\u0153', '"'),       # left double quote
        ('\u00e2\u20ac\u009d', '"'),       # right double quote
        ('\u00e2\u20ac\u00a6', '...'),     # ellipsis
    )
    for broken, fixed in mojibake_replacements:
        text = text.replace(broken, fixed)

    # Normalise typographic punctuation to plain ASCII equivalents.
    text = text.replace('\u2018', "'")
    text = text.replace('\u2019', "'")
    text = text.replace('\u201c', '"')
    text = text.replace('\u201d', '"')
    text = text.replace('\u2026', '...')

    # Drop C0/C1 control characters, keeping \t, \n and \r.
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    return text

def is_in_footnote(text, match_position):
    """Heuristically decide whether ``match_position`` lies inside a footnote.

    Looks at up to 500 characters before the position and 200 characters from
    the position onward. Returns True when any of these holds:
      * "Return to text" occurs before the position but not in the text after it;
      * a bracketed number such as "[3]" occurs in the text before it;
      * a numbered item ("12. Xyz") occurs in the last 100 characters before it.
    Otherwise returns False.
    """
    window_start = max(0, match_position - 500)
    preceding = text[window_start:match_position]
    following = text[match_position:match_position + 200]

    return_link = re.compile(r'Return to text', re.IGNORECASE)
    if return_link.search(preceding) and not return_link.search(following):
        return True

    if re.search(r'\[\d+\]', preceding):
        return True

    numbered_item = re.search(r'\d+\.\s+[A-Z]', preceding[-100:])
    return numbered_item is not None

# A period that really ends a sentence: it is followed by whitespace or the end
# of the text, and it is not the period of a single-letter abbreviation such as
# the ones in "U.S." / "U. S.".
_SENTENCE_END = re.compile(r'(?<!\b[A-Z])\.(?=\s|$)')


def _after_sentence_end(text, start):
    """Return the text after the first sentence end at or after ``start``.

    The text is returned unchanged when no sentence end is found.
    """
    found = _SENTENCE_END.search(text, start)
    if found is None:
        return text
    return text[found.end():].strip()


def clean_text_minutes(text, date):
    """Clean text for FOMC minutes based on date.

    Front matter is trimmed with a rule chosen by meeting date:
      * 2021-11-03: keep the text from the sentence containing
        "The manager turned first to a discussion";
      * before 2019: drop everything up to the end of the sentence containing
        the first "Manager of the System Open";
      * otherwise: drop everything up to the end of the first sentence after
        the "Developments in Financial Markets and Open Market Operations"
        heading (preferring an occurrence that is not inside a footnote).
    The tail is then cut at the sentence containing "meeting adjourned".

    Phrases are matched with flexible whitespace so that a phrase broken
    across a line is still recognised.

    Args:
        text: Raw minutes text. Non-string values (e.g. NaN) yield "".
        date: Meeting date as a pandas Timestamp.

    Returns:
        The trimmed text.
    """
    if not isinstance(text, str):
        return ""

    text = fix_text_encoding(text)

    if date == pd.Timestamp('2021-11-03'):
        start_pattern = r'The\s+manager\s+turned\s+first\s+to\s+a\s+discussion'
        match = re.search(start_pattern, text, re.IGNORECASE)
        if match:
            # rfind returns -1 when no period precedes the match; +1 then
            # yields 0, i.e. keep the text from its beginning.
            sentence_start = text.rfind('.', 0, match.start()) + 1
            text = text[sentence_start:].strip()
    elif date < pd.Timestamp('2019-01-01'):
        pattern = r'Manager\s+of\s+the\s+System\s+Open'
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            text = _after_sentence_end(text, match.end())
    else:
        pattern = r'Developments\s+in\s+Financial\s+Markets\s+and\s+Open\s+Market\s+Operations'
        matches = list(re.finditer(pattern, text, re.IGNORECASE))
        if matches:
            # Prefer the first occurrence outside a footnote; fall back to the
            # very first occurrence when all of them look like footnotes.
            selected_match = next(
                (m for m in matches if not is_in_footnote(text, m.start())),
                matches[0],
            )
            # Using a real sentence end here matters: the first plain '.' after
            # the heading is often the one inside "U.S.", which used to leave
            # cleaned texts starting with "S. and global ...".
            text = _after_sentence_end(text, selected_match.end())

    end_pattern = r'meeting\s+adjourned'
    match = re.search(end_pattern, text, re.IGNORECASE)
    if match:
        # Cut just after the period that closes the previous sentence, so the
        # kept text still ends with its final period. With no earlier period
        # (rfind == -1) the slice is empty, as the adjournment sentence then
        # starts at the very beginning.
        sentence_start = text.rfind('.', 0, match.start())
        text = text[:sentence_start + 1].strip()

    return text.strip()

print("\nCleaning minutes...")
cleaned_texts = []

# Walk the date and text columns in step, reporting progress every 20 rows.
for idx, date, text in zip(df.index, df['date'], df['text']):
    if idx % 20 == 0:
        print(f"Processing minutes {idx+1}/{len(df)}...")
    cleaned_texts.append(clean_text_minutes(text, date))

# New frame with the cleaned text and the date rendered back to 'YYYY-MM-DD'.
df_cleaned = df.assign(
    text=cleaned_texts,
    date=df['date'].dt.strftime('%Y-%m-%d'),
)

output_file = os.path.join(output_dir, 'fomc_minutes_cleaned.csv')
df_cleaned.to_csv(output_file, index=False)

print(f"\nCleaned minutes saved to: {output_file}")
print(f"Total minutes: {len(df_cleaned)}")

def _print_banner(title):
    """Print a section title framed by two 70-character rules."""
    print("\n" + "="*70)
    print(title)
    print("="*70)


_print_banner("CLEANING STATISTICS")

original_lengths = df['text'].str.len()
cleaned_lengths = df_cleaned['text'].str.len()
avg_original = original_lengths.mean()
avg_cleaned = cleaned_lengths.mean()

print(f"\nAverage original text length: {avg_original:.0f} characters")
print(f"Average cleaned text length: {avg_cleaned:.0f} characters")
print(f"Average reduction: {(avg_original - avg_cleaned):.0f} characters ({((1 - avg_cleaned/avg_original)*100):.1f}%)")

_print_banner("EXAMPLES OF CLEANED TEXT")

# Dates are 'YYYY-MM-DD' strings at this point, so string comparison orders them.
pre_2019_rows = df_cleaned[df_cleaned['date'] < '2019-01-01']
pre_2019_example = pre_2019_rows.iloc[-1] if len(pre_2019_rows) > 0 else None
if pre_2019_example is not None:
    print(f"\nPRE-2019 PERIOD")
    print(f"Date: {pre_2019_example['date']}")
    print(f"First 200 characters: {pre_2019_example['text'][:200]}...")

post_2019_rows = df_cleaned[df_cleaned['date'] >= '2019-01-01']
post_2019_example = post_2019_rows.iloc[0] if len(post_2019_rows) > 0 else None
if post_2019_example is not None:
    print(f"\n2019 ONWARD PERIOD")
    print(f"Date: {post_2019_example['date']}")
    print(f"First 200 characters: {post_2019_example['text'][:200]}...")

_print_banner("SHORTEST 5 ENTRIES (FOR INSPECTION)")

# The shortest cleaned texts are the likeliest to have been over-trimmed.
df_cleaned['text_length'] = df_cleaned['text'].str.len()
shortest_5 = df_cleaned.nsmallest(5, 'text_length')[['id', 'date', 'text_length', 'text']]

for idx, row in shortest_5.iterrows():
    overflow = row['text_length'] - 1000
    print(f"\n{'-'*70}")
    print(f"ID: {row['id']}")
    print(f"Date: {row['date']}")
    print(f"Length: {row['text_length']} characters")
    print(f"\nFull text:")
    print(row['text'][:1000])
    if overflow > 0:
        print(f"\n... (truncated, {overflow} more characters)")

print("\n" + "="*70)
print("Cleaning complete!")

Reading FOMC minutes...
Loaded 199 minutes

Cleaning minutes...
Processing minutes 1/199...
Processing minutes 21/199...
Processing minutes 41/199...
Processing minutes 61/199...
Processing minutes 81/199...
Processing minutes 101/199...
Processing minutes 121/199...
Processing minutes 141/199...
Processing minutes 161/199...
Processing minutes 181/199...

Cleaned minutes saved to: /content/drive/MyDrive/FedComs/Minutes/fomc_minutes_cleaned.csv
Total minutes: 199

CLEANING STATISTICS

Average original text length: 47193 characters
Average cleaned text length: 39751 characters
Average reduction: 7442 characters (15.8%)

EXAMPLES OF CLEANED TEXT

PRE-2019 PERIOD
Date: 2018-12-19
First 200 characters: Minutes of the Federal Open Market Committee
December 18-19, 2018
A joint meeting of the Federal Open Market Committee and the Board of Governors was held in the offices of the Board of Governors of t...

2019 ONWARD PERIOD
Date: 2019-01-30
First 200 characters: S. and global financial marke