In [14]:
from google.colab import drive
drive.mount('/content/drive')

import os
output_dir = '/content/drive/MyDrive/Transcripts'

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

os.chdir(output_dir)

# Verify the current working directory
print(f"Current working directory: {os.getcwd()}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content/drive/MyDrive/Transcripts


In [None]:
!pip install pdfplumber pandas requests beautifulsoup4 python-dateutil PyPDF2 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
#@ Updated version with improved text cleaning and contamination prevention

import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from urllib.parse import urljoin
import datetime
import os
from collections import defaultdict
import io
import PyPDF2
import pdfplumber
from difflib import SequenceMatcher

# Updated URLs and constants
FED_BASE_URL = "https://www.federalreserve.gov"
FOMC_HISTORICAL_BASE = "https://www.federalreserve.gov/monetarypolicy/fomchistorical"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/100.0.4896.127 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
}

# Define officials with their roles and tenures (EXPANDED - includes Laurence Meyer and others)
officials_text = """Donald Kohn, Governor, 2002-2006
Donald Kohn, Vice Chair, 2006-2010
Alan Greenspan, Chair, 1987-2006
Ben Bernanke, Chair, 2006-2014
Janet Yellen, Vice Chair, 2010-2014
Janet Yellen, Chair, 2014-2018
Jerome Powell, Governor, 2012-2018
Jerome Powell, Chair, 2018-2024
Randall Kroszner, Governor, 2006-2009
Kevin Warsh, Governor, 2006-2011
Elizabeth Duke, Governor, 2008-2013
Daniel Tarullo, Governor, 2009-2017
Sarah Bloom Raskin, Governor, 2010-2014
Lael Brainard, Governor, 2014-2022
Lael Brainard, Vice Chair, 2022-2023
Stanley Fischer, Vice Chair, 2014-2017
Jeremy Stein, Governor, 2012-2014
Michelle Bowman, Governor, 2018-2024
Richard Clarida, Vice Chair, 2018-2022
Randal Quarles, Governor, 2017-2021
Christopher Waller, Governor, 2020-2024
Michael Barr, Governor, 2022-2024
Lisa Cook, Governor, 2022-2024
Adriana Kugler, Governor, 2023-2025
Philip Jefferson, Governor, 2022-2023
Philip Jefferson, Vice Chair, 2023-2025
Susan S. Bies, Governor, 2001-2007
Mark W. Olson, Governor, 2001-2006
Roger W. Ferguson, Vice Chair, 1997-2006
Edward M. Gramlich, Governor, 1997-2005
Frederic S. Mishkin, Governor, 2006-2008
Laurence Meyer, Governor, 1996-2002
Alice Rivlin, Vice Chair, 1996-1999
Cathy E. Minehan, President, Boston, 1994-2007
Eric S. Rosengren, Boston, 2007-2021
Susan M. Collins, Boston, 2022-2024
William McDonough, New York, 1993-2003
William C. Dudley, New York, 2009-2018
Timothy Geithner, New York, 2003-2009
John C. Williams, New York, 2011-2024
Edward G. Boehne, Philadelphia, 1981-2000
Anthony M. Santomero, Philadelphia, 2000-2006
Charles I. Plosser, Philadelphia, 2006-2015
Patrick T. Harker, Philadelphia, 2015-2024
Jerry Jordan, Cleveland, 1992-2003
Sandra Pianalto, Cleveland, 2003-2014
Loretta Mester, Cleveland, 2014-2024
Alfred Broaddus, Richmond, 1993-2004
Jeffrey M. Lacker, Richmond, 2004-2017
Thomas I. Barkin, Richmond, 2018-2024
Jack Guynn, Atlanta, 1996-2006
Dennis P. Lockhart, Atlanta, 2007-2017
Raphael Bostic, Atlanta, 2017-2024
Michael Moskow, Chicago, 1994-2007
Charles L. Evans, Chicago, 2007-2023
Austan D. Goolsbee, Chicago, 2023-2024
William Poole, St. Louis, 1998-2008
James Bullard, St. Louis, 2008-2023
Alberto G. Musalem, St. Louis, 2024-2024
Gary H. Stern, Minneapolis, 1985-2009
Narayana Kocherlakota, Minneapolis, 2009-2015
Neel Kashkari, Minneapolis, 2016-2024
Thomas M. Hoenig, Kansas City, 1991-2011
Esther L. George, Kansas City, 2011-2023
Jeffrey R. Schmid, Kansas City, 2023-2024
Robert D. McTeer, Dallas, 1991-2004
Richard W. Fisher, Dallas, 2005-2015
Robert Steven Kaplan, Dallas, 2015-2021
Robert T. Parry, San Francisco, 1986-2004
Janet Yellen, San Francisco, 2004-2010
John Williams, San Francisco, 2011-2018
Mary C. Daly, San Francisco, 2018-2024
Lorie K. Logan, Dallas, 2022-2024"""

def parse_officials(officials_text):
    """Parse the officials text into a structured dictionary"""
    officials = []

    for line in officials_text.strip().split('\n'):
        line = line.strip()
        if not line:
            continue

        parts = [p.strip() for p in line.split(',')]

        if len(parts) >= 3:
            name = parts[0]
            role = parts[1]
            years = parts[2]

            year_match = re.search(r'(\d{4})-(\d{4})', years)
            if year_match:
                start_year = int(year_match.group(1))
                end_year = int(year_match.group(2))

                name_parts = name.split()
                last_name = name_parts[-1].upper() if name_parts else ""
                first_name = name_parts[0].upper() if name_parts else ""

                officials.append({
                    'name': name,
                    'role': role,
                    'start_year': start_year,
                    'end_year': end_year,
                    'last_name': last_name,
                    'first_name': first_name,
                    'full_name_upper': name.upper()
                })

    print(f"DEBUG: Total officials parsed: {len(officials)}")
    return officials

def generate_transcript_id(official_name, date_str, sequence_num):
    """Generate transcript ID in format: FirstInitialLastName_YYYYMMDD_transcript_N"""
    name_parts = official_name.split()
    if len(name_parts) >= 2:
        first_initial = name_parts[0][0].upper()
        last_name = name_parts[-1]
        name_part = f"{first_initial}{last_name}"
    else:
        name_part = official_name.replace(' ', '')

    try:
        if isinstance(date_str, str):
            date_obj = parser.parse(date_str)
        else:
            date_obj = date_str
        date_part = date_obj.strftime('%Y%m%d')
    except:
        date_part = "00000000"

    return f"{name_part}_{date_part}_transcript_{sequence_num}"

def fetch_page(url, headers, max_retries=3, timeout=30):
    """Fetch a web page with retry logic"""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.ok:
                return response.content
            else:
                print(f"Failed to fetch {url}: Status code {response.status_code}")
                time.sleep(1)
        except (requests.RequestException, ConnectionError) as e:
            print(f"Attempt {attempt+1}/{max_retries} failed for {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)

    print(f"Request failed after multiple retries for {url}")
    return None

def find_transcript_pdfs_for_year(year):
    """Find all transcript PDF links for a given year - meeting transcripts AND conference calls"""
    transcript_links = []
    year_url = f"{FOMC_HISTORICAL_BASE}{year}.htm"

    print(f"Searching for transcripts on {year_url}")

    html_content = fetch_page(year_url, HEADERS)
    if not html_content:
        print(f"Could not fetch content for {year}")
        return transcript_links

    soup = BeautifulSoup(html_content, "html.parser")

    for link in soup.find_all('a'):
        href = link.get('href', '')
        text = link.text.lower()

        # Include: meeting transcripts OR confcall documents
        # Exclude: support documents (beige book, greenbook, etc.)
        if (href.lower().endswith('.pdf') and
            ('meeting' in href.lower() or 'confcall' in href.lower()) and
            not any(doc_type in href.lower() for doc_type in
                   ['beige', 'bluebook', 'greenbook', 'gbpt', 'gbsup', 'agenda', 'material'])):

            if not href.startswith('http'):
                href = urljoin(FED_BASE_URL, href)

            meeting_date = extract_date_from_pdf_url(href)
            if meeting_date:
                # Determine document type
                is_confcall = 'confcall' in href.lower()
                doc_type = "Unscheduled (Conference Call)" if is_confcall else "Regular"

                transcript_links.append({
                    'url': href,
                    'date': meeting_date,
                    'meeting_type': doc_type  # Add meeting type
                })
                print(f"Found {doc_type} transcript for {meeting_date.strftime('%Y-%m-%d')}: {href}")

    print(f"Total transcripts (meetings + confcalls) found for {year}: {len(transcript_links)}")
    return transcript_links

def extract_date_from_pdf_url(url):
    """Extract meeting date from PDF URL with multiple patterns"""
    date_match = re.search(r'FOMC(\d{8})', url)
    if date_match:
        date_str = date_match.group(1)
        try:
            return datetime.datetime.strptime(date_str, '%Y%m%d')
        except ValueError:
            pass

    date_patterns = [
        r'(\d{4})(\d{2})(\d{2})',
        r'(\d{2})-(\d{2})-(\d{4})',
        r'(\d{4})-(\d{2})-(\d{2})',
    ]

    for pattern in date_patterns:
        match = re.search(pattern, url)
        if match:
            try:
                groups = match.groups()
                if len(groups) == 3:
                    if len(groups[0]) == 4:
                        year, month, day = int(groups[0]), int(groups[1]), int(groups[2])
                    else:
                        month, day, year = int(groups[0]), int(groups[1]), int(groups[2])
                    return datetime.datetime(year, month, day)
            except ValueError:
                continue

    year_match = re.search(r'/(\d{4})/', url)
    if year_match:
        year = int(year_match.group(1))
        return datetime.datetime(year, 1, 1)

    return None

def extract_text_from_pdf(pdf_content):
    """Extract text from PDF using available libraries"""
    try:
        with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
            text = ""
            for page in pdf.pages:
                extracted = page.extract_text()
                if extracted:
                    text += extracted + "\n"
            return text
    except Exception as e:
        print(f"Error with pdfplumber extraction: {e}")

    try:
        pdf_file = io.BytesIO(pdf_content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

        return text
    except Exception as e:
        print(f"Error extracting PDF text with PyPDF2: {e}")

    return None

def clean_extracted_text(text):
    """
    COMPREHENSIVE: Removes ALL page numbers and fixes PDF extraction artifacts
    Fixes: Page numbers (9 patterns), mid-sentence line breaks, preserves data
    """
    if not text:
        return text

    # Fix common encoding issues
    text = text.replace('â€"', '—')
    text = text.replace('â€™', "'")
    text = text.replace('â€œ', '"')
    text = text.replace('â€', '"')

    # Protect "U.S." temporarily
    text = re.sub(r'U\.S\.', 'U_S_', text)

    # ===== REMOVE ALL PAGE NUMBER PATTERNS =====

    # 1. Standalone in middle (between newlines)
    text = re.sub(r'\n\s*\d{1,3}\s*\n', '\n', text)

    # 2. At start of line before capital letter
    text = re.sub(r'\n\s*\d{1,3}\s+(?=[A-Z])', '\n', text)

    # 3. At very beginning of text
    text = re.sub(r'^\s*\d{1,3}\s*\n', '', text)

    # 4. At end of sentences
    text = re.sub(r'\.\s*\d{1,3}\s*\n', '.\n', text)

    # 5. "Page X of Y" patterns
    text = re.sub(r'\bPage\s+\d+\s+of\s+\d+\b', '', text, flags=re.IGNORECASE)

    # 6. "X of Y" standalone
    text = re.sub(r'\n\s*\d+\s+of\s+\d+\s*\n', '\n', text)

    # 7. After commas/semicolons at line breaks
    text = re.sub(r'([,;])\s*\n\s*\d{1,3}\s+', r'\1 ', text)

    # 8. DASH-FORMATTED page numbers (e.g., "\n- 19 -\n")
    text = re.sub(r'\n\s*-\s*\d{1,3}\s*-\s*\n', '\n', text)

    # 9. At END of entire text (critical!)
    text = re.sub(r'\n\s*\d{1,3}\s*$', '', text)

    # ===== FIX MID-SENTENCE LINE BREAKS (PDF ARTIFACTS) =====

    # Remove line breaks between lowercase and any letter (lowercase OR uppercase)
    # Pattern 1: lowercase to lowercase (e.g., "develop\ning")
    text = re.sub(r'([a-z])\n([a-z])', r'\1 \2', text)

    # Pattern 2: lowercase to uppercase (e.g., "a\nTreasuries")
    # This is common with proper nouns mid-sentence
    text = re.sub(r'([a-z])\n([A-Z])', r'\1 \2', text)

    # Pattern 3: uppercase to lowercase (e.g., "A\nfew", "I\ncan")
    # This is common with single-letter words like "A" or "I"
    text = re.sub(r'([A-Z])\n([a-z])', r'\1 \2', text)

    # Pattern 4: period/digit followed by newline then lowercase (mid-sentence)
    # e.g., "3.\npercent" → "3. percent"
    # If lowercase follows, it's NOT a new sentence
    text = re.sub(r'([.0-9])\n([a-z])', r'\1 \2', text)

    # Remove line breaks after commas/semicolons when followed by lowercase
    text = re.sub(r'([,;])\n(?=[a-z])', r'\1 ', text)

    # ===== REMOVE DATE/TIME HEADERS =====

    # Date headers with page numbers (handle en dash, em dash, and hyphen in date ranges)
    text = re.sub(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:[-–—]\d{1,2})?,\s+\d{4}\s+\d+\s+of\s+\d+',
        '', text
    )
    text = re.sub(r'\d{1,2}/\d{1,2}(?:[-–—]\d{1,2})?/\d{2}\s+\d+', '', text)
    text = re.sub(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}--(?:Morning|Afternoon|Evening)\s+Session',
        '', text, flags=re.IGNORECASE
    )

    # ===== CLEAN UP SPEAKER ARTIFACTS =====

    text = re.sub(r'SPEAKER\(\?\)\.?\s*', '', text, flags=re.IGNORECASE)

    # ===== CLEAN UP PDF ARTIFACTS =====

    text = re.sub(r'\bFOMC\s+Meeting\s+Transcript\b', '', text, flags=re.IGNORECASE)

    # ===== FIX SENTENCE FORMATTING =====

    # Add line break after periods followed by capital letters
    text = re.sub(r'\.(?!\s*U_S_)(?!\s*Mr)(?!\s*Ms)(?!\s*Dr)([A-Z])', r'.\n\1', text)

    # Restore U.S.
    text = text.replace('U_S_', 'U.S.')

    # ===== CLEAN UP WHITESPACE =====

    text = re.sub(r'\n +', '\n', text)
    text = re.sub(r' +\n', '\n', text)
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


def clean_page_numbers_from_text(text):
    """
    IMPROVED: Remove ALL page number patterns including inline numbers
    Supplemental cleaning after extraction
    """
    if not text:
        return text

    # Pattern 1: Standalone digits between newlines
    text = re.sub(r'\n\s*\d{1,2}\s*\n', '\n', text)

    # Pattern 2: CRITICAL - Digit at start of line followed by space and text (inline page numbers)
    # This catches: "\n7 to fill some orders" → "\nto fill some orders"
    text = re.sub(r'\n\s*\d{1,2}\s+(?=[a-z])', '\n', text)

    # Pattern 3: Digit at start of line followed by capital letter
    text = re.sub(r'\n\s*\d{1,2}\s+(?=[A-Z])', '\n', text)

    # Pattern 4: At very beginning/end
    text = re.sub(r'^\s*\d{1,2}\s*\n', '', text)
    text = re.sub(r'\n\s*\d{1,2}\s*$', '', text)

    # Pattern 5: Dash-formatted page numbers
    text = re.sub(r'\s*-\s*\d{1,3}\s*-\s*', ' ', text)
    text = re.sub(r'\n\s*-\s*\d{1,3}\s*-\s*\n', '\n', text)

    # Pattern 6: "X of Y" patterns
    text = re.sub(r'\b\d+\s+of\s+\d+\b', '', text, flags=re.IGNORECASE)

    # Pattern 7: "Page X" patterns
    text = re.sub(r'\bPage\s+\d+', '', text, flags=re.IGNORECASE)

    return text


def validate_and_truncate_at_embedded_speaker(text, current_speaker):
    """
    IMPROVED: Detect embedded speakers including inline formats

    Uses ALL CAPS detection to find when another speaker's text begins.
    This prevents Speaker A's text from containing Speaker B's speech.

    Now catches:
    - Speakers on their own line: "\nMR. KOS.1\nThank you..."
    - Speakers inline: "\nMR. KOS.1 Thank you..." (NO newline after name)
    """

    # IMPROVED patterns that don't require newline at the end
    embedded_patterns = [
        # Pattern 1: Title + ALL CAPS name with footnote, followed by space OR newline
        # Matches: "\nMR. KOS.1 Thank you" OR "\nMR. KOS.1\nThank you"
        r'\n\s*((?:MR|MS|DR|CHAIRMAN|CHAIR|VICE CHAIRMAN|VICE CHAIR|GOVERNOR|PRESIDENT)\.?\s+[A-Z]{2,}(?:\s+[A-Z]+)*)\.?\d*\.?\s',

        # Pattern 2: Just ALL CAPS name with footnote, followed by space OR newline
        # Matches: "\nKOS.1 Thank you" OR "\nREINHART.8 I'm fairly"
        r'\n\s*([A-Z]{2,}(?:\s+[A-Z]+)*)\.?\d*\.?\s',
    ]

    # Normalize current speaker name for comparison
    current_upper = current_speaker.upper()
    for prefix in ['CHAIRMAN ', 'VICE CHAIR ', 'VICE CHAIRMAN ', 'GOVERNOR ', 'PRESIDENT ', 'MR. ', 'MS. ', 'DR. ']:
        current_upper = current_upper.replace(prefix, '')
    current_upper = current_upper.strip()

    earliest_contamination = None
    contaminating_speaker = None

    for pattern in embedded_patterns:
        matches = re.finditer(pattern, text, re.MULTILINE)

        for match in matches:
            embedded_speaker_raw = match.group(1).strip()

            # Clean footnotes: "KOS.1" → "KOS"
            embedded_speaker = re.sub(r'\.?\d+$', '', embedded_speaker_raw).strip().rstrip('.')

            embedded_upper = embedded_speaker.upper()

            # Skip if same speaker
            if embedded_upper in current_upper or current_upper in embedded_upper:
                continue

            # Skip common non-speakers
            non_speakers = [
                'FEDERAL RESERVE', 'BOARD', 'MEETING', 'TRANSCRIPT', 'PAGE',
                'APPENDIX', 'EXHIBIT', 'TABLE', 'JANUARY', 'FEBRUARY', 'MARCH',
                'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER',
                'NOVEMBER', 'DECEMBER', 'FOMC', 'GDP', 'CPI', 'NOTES', 'NOTE',
                'SEVERAL'  # Common in transcripts when multiple people speak
            ]

            if any(word in embedded_upper for word in non_speakers):
                continue

            # Check if at start of line (not mid-sentence reference)
            before_match = text[:match.start()]
            last_newline = before_match.rfind('\n')

            if last_newline != -1:
                text_on_line = before_match[last_newline+1:].strip()
            else:
                text_on_line = before_match.strip()

            # If substantial text before on same line, it's likely a reference
            # But be more lenient - even short text like "please." before speaker is OK
            if len(text_on_line) > 30:
                continue

            # Found genuine speaker transition
            if earliest_contamination is None or match.start() < earliest_contamination:
                earliest_contamination = match.start()
                contaminating_speaker = embedded_speaker
                print(f"⚠ WARNING: Found '{embedded_speaker}' in '{current_speaker}' segment at position {match.start()}")

    # Truncate if contamination found
    if earliest_contamination is not None:
        original_length = len(text)
        text = text[:earliest_contamination].strip()
        removed = original_length - len(text)
        print(f"  → Truncated {removed} chars from '{current_speaker}' (contaminated by '{contaminating_speaker}')")

    return text


def extract_speakers_and_statements(transcript_text):
    """
    IMPROVED: Extract speaker segments with better inline speaker detection

    Key improvements:
    - Handles footnote markers (REINHART.8 → REINHART)
    - Uses ALL CAPS for reliable speaker detection
    - Detects inline speakers (no newline required after name)
    - Validates segments to prevent contamination
    - Better page number cleaning
    """

    # IMPROVED patterns - removed requirement for newline at end
    patterns = [
        # Pattern 1: Title + Name with optional footnote, followed by space/newline
        # Matches: "CHAIRMAN GREENSPAN." OR "MR. REINHART.8 I'm..."
        r'\n\s*((?:MR|MS|DR|CHAIRMAN|CHAIR|VICE CHAIRMAN|VICE CHAIR|GOVERNOR|PRESIDENT)\.?\s+[A-Z]+(?:\s+[A-Z]+)*)\.?\d*\.?\s+',

        # Pattern 2: ALL CAPS name with optional footnote, followed by space/newline
        # Matches: "GREENSPAN." OR "REINHART.8 Thank you"
        r'\n\s*([A-Z]{2,}(?:\s+[A-Z]{2,})*)\.?\d*\.?\s+',

        # Pattern 3: Colon format
        r'\n\s*([A-Z]{2,}(?:\s+[A-Z]{2,})*):?\s+',
    ]

    all_matches = []

    for i, pattern in enumerate(patterns):
        matches = re.finditer(pattern, transcript_text, re.MULTILINE)
        for match in matches:
            speaker_raw = match.group(1).strip()

            # CRITICAL: Remove footnote numbers (e.g., "REINHART.8" → "REINHART")
            speaker_name = re.sub(r'\.?\d+$', '', speaker_raw).strip().rstrip('.')

            # Exclude non-speaker phrases
            excluded_phrases = [
                'FEDERAL RESERVE', 'BOARD OF GOVERNORS', 'MEETING', 'TRANSCRIPT',
                'PAGE', 'APPENDIX', 'EXHIBIT', 'TABLE', 'CHART', 'FIGURE',
                'JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE',
                'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER',
                'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY',
                'FOMC', 'GDP', 'CPI', 'NOTE', 'NOTES', 'ADDENDA', 'CLASS', 'SEVERAL'
            ]

            if (len(speaker_name) > 1 and
                not any(phrase in speaker_name for phrase in excluded_phrases) and
                not speaker_name.isdigit()):

                all_matches.append((match.start(), speaker_name, f"Pattern_{i+1}"))

    # Sort by position and remove duplicates
    all_matches = sorted(list(set(all_matches)), key=lambda x: x[0])

    print(f"DEBUG: Found {len(all_matches)} potential speakers")

    speakers_text = []

    for i in range(len(all_matches)):
        start_pos, speaker, pattern_used = all_matches[i]

        # Find where speaker marker ends
        speaker_end = transcript_text.find(speaker, start_pos) + len(speaker)
        text_start = speaker_end

        # Skip punctuation/whitespace/digits after speaker marker
        while text_start < len(transcript_text) and transcript_text[text_start] in '.:;\s\n\d':
            text_start += 1

        # Determine segment end
        end_pos = all_matches[i+1][0] if i < len(all_matches) - 1 else len(transcript_text)

        # Extract raw text
        text = transcript_text[text_start:end_pos].strip()

        # CRITICAL: Validate and truncate at embedded speakers
        text = validate_and_truncate_at_embedded_speaker(text, speaker)

        # Clean page numbers from extracted text
        text = clean_page_numbers_from_text(text)

        # Only keep substantial segments
        if speaker and text and len(text) > 20:
            speakers_text.append((speaker, text))

    return speakers_text


def similarity_score(str1, str2):
    """Calculate similarity between two strings (0.0 to 1.0)"""
    return SequenceMatcher(None, str1.upper(), str2.upper()).ratio()


def fuzzy_match_name(name1, name2, threshold=0.85):
    """Check if two names match with fuzzy tolerance for typos"""
    if name1.upper() == name2.upper():
        return True
    score = similarity_score(name1, name2)
    return score >= threshold


def match_speaker_to_official(speaker, officials, meeting_date):
    """
    IMPROVED: Enhanced speaker matching with fuzzy matching for typos
    Handles cases like GREENPAN → GREENSPAN, and "MR [LASTNAME]" patterns
    """
    speaker = speaker.strip().upper()
    cleaned_speaker = speaker

    # Remove common prefixes to get to the actual name
    prefixes_to_remove = [
        "CHAIRMAN ", "CHAIR ", "VICE CHAIRMAN ", "VICE CHAIR ",
        "GOVERNOR ", "PRESIDENT ", "MR. ", "MS. ", "DR. ", "THE ", "MR "
    ]

    for prefix in prefixes_to_remove:
        if cleaned_speaker.startswith(prefix):
            cleaned_speaker = cleaned_speaker[len(prefix):].strip()
            break

    # Try exact and fuzzy matching
    for official in officials:
        # Check if official was serving at this meeting date
        if not (official['start_year'] <= meeting_date.year <= official['end_year']):
            continue

        official_last = official['last_name']
        official_first = official['first_name']
        official_full = official['full_name_upper']

        # Strategy 1: FUZZY match on last name (handles typos like GREENPAN → GREENSPAN)
        if fuzzy_match_name(cleaned_speaker, official_last, threshold=0.85):
            print(f"✓ Matched: {speaker} -> {official['name']} (fuzzy: {similarity_score(cleaned_speaker, official_last):.2f})")
            return official

        # Strategy 2: Exact full name match
        official_full_clean = official_full.replace(',', '').replace('.', '')
        if cleaned_speaker == official_full_clean:
            print(f"✓ Matched: {speaker} -> {official['name']}")
            return official

        # Strategy 3: First + Last combinations
        name_combinations = [
            f"{official_first} {official_last}",
            f"{official_first[0]} {official_last}" if official_first else "",
            f"{official_first[0]}. {official_last}" if official_first else "",
        ]

        for combo in name_combinations:
            if combo and cleaned_speaker == combo:
                print(f"✓ Matched: {speaker} -> {official['name']}")
                return official

        # Strategy 4: Fuzzy match on full name (handles typos in full names)
        if fuzzy_match_name(cleaned_speaker, official_full_clean, threshold=0.85):
            print(f"✓ Matched: {speaker} -> {official['name']} (fuzzy full)")
            return official

        # Strategy 5: Handle compound names - check if last word matches
        speaker_parts = cleaned_speaker.split()
        if len(speaker_parts) >= 2:
            speaker_last_word = speaker_parts[-1]

            # Exact match on last word
            if speaker_last_word == official_last:
                speaker_first_part = speaker_parts[0]
                if (speaker_first_part == official_first or
                    speaker_first_part == official_first[0] if official_first else False):
                    print(f"✓ Matched: {speaker} -> {official['name']}")
                    return official

            # Fuzzy match on last word
            if fuzzy_match_name(speaker_last_word, official_last, threshold=0.85):
                print(f"✓ Matched: {speaker} -> {official['name']} (fuzzy last word)")
                return official

    # Strategy 6: Handle periods in names
    if '.' in cleaned_speaker:
        no_periods = cleaned_speaker.replace('.', ' ').strip()
        result = match_speaker_to_official(no_periods, officials, meeting_date)
        if result:
            return result

    return None


def process_transcripts(from_year=2000, to_year=2024, skip_existing=True):
    """Process all transcripts and extract speeches by officials"""
    officials = parse_officials(officials_text)

    os.makedirs('official_transcripts', exist_ok=True)

    existing_officials = set()
    if skip_existing:
        print("Checking for existing official files...")
        for official in officials:
            filename = official['name'].replace(' ', '_').replace(',', '').replace('.', '')
            output_path = f"official_transcripts/{filename}.csv"
            if os.path.exists(output_path):
                existing_officials.add(official['name'])
                print(f"  ✓ Found existing file for {official['name']} - will skip")

        if existing_officials:
            print(f"\nSkipping {len(existing_officials)} officials with existing files\n")

    official_data = defaultdict(list)
    official_date_sequences = defaultdict(lambda: defaultdict(int))

    total_speakers_found = 0
    matched_speakers = 0
    unmatched_speakers = set()

    for year in range(from_year, to_year + 1):
        if year > datetime.datetime.now().year:
            continue

        print(f"\nProcessing year {year}...")

        transcript_pdfs = find_transcript_pdfs_for_year(year)

        for transcript in transcript_pdfs:
            pdf_url = transcript['url']
            meeting_date = transcript['date']
            meeting_type = transcript['meeting_type']

            print(f"Processing transcript from {meeting_date.strftime('%Y-%m-%d')}: {pdf_url}")

            pdf_content = fetch_page(pdf_url, HEADERS)
            if not pdf_content:
                continue

            transcript_text = extract_text_from_pdf(pdf_content)
            if not transcript_text:
                continue

            speaker_segments = extract_speakers_and_statements(transcript_text)
            print(f"Found {len(speaker_segments)} speaker segments")

            for speaker, text in speaker_segments:
                total_speakers_found += 1

                matched_official = match_speaker_to_official(speaker, officials, meeting_date)

                if matched_official:
                    official_name = matched_official['name']

                    if skip_existing and official_name in existing_officials:
                        continue

                    matched_speakers += 1

                    date_key = meeting_date.strftime('%Y-%m-%d')
                    official_date_sequences[official_name][date_key] += 1
                    sequence_num = official_date_sequences[official_name][date_key]

                    transcript_id = generate_transcript_id(official_name, meeting_date, sequence_num)

                    official_data[official_name].append({
                        'id': transcript_id,
                        'Date': meeting_date.strftime('%Y-%m-%d'),
                        'Meeting_Type': meeting_type,
                        'Name': official_name,
                        'Role': matched_official['role'],
                        'Original_Speaker': speaker,
                        'Text': text,
                        'Transcript_URL': pdf_url
                    })
                else:
                    unmatched_speakers.add(speaker)
                    print(f"✗ No match for: {speaker}")

            time.sleep(2)

    # Save results with improved cleaning
    for official_name, entries in official_data.items():
        if entries:
            # Apply improved cleaning to each entry
            for entry in entries:
                entry['Text'] = clean_extracted_text(entry['Text'])

            df = pd.DataFrame(entries)
            filename = official_name.replace(' ', '_').replace(',', '').replace('.', '')
            output_path = f"official_transcripts/{filename}.csv"

            df.to_csv(output_path, index=False, encoding='utf-8', escapechar='\\')
            print(f"Saved {len(entries)} entries for {official_name} to {output_path}")

    # Print statistics
    print(f"\n=== MATCHING STATISTICS ===")
    print(f"Total speaker segments found: {total_speakers_found}")
    print(f"Successfully matched: {matched_speakers}")
    print(f"Unmatched: {len(unmatched_speakers)}")
    if total_speakers_found > 0:
        print(f"Match rate: {matched_speakers/total_speakers_found*100:.1f}%")

    print(f"\nFiles created for {len(official_data)} officials")
    return official_data


def main():
    from_year = 2000
    to_year = 2002

    skip_existing_files = False

    print(f"Extracting FOMC transcript data for officials from {from_year} to {to_year}")
    print("Using IMPROVED text cleaning, FUZZY MATCHING, and CONTAMINATION PREVENTION")
    if skip_existing_files:
        print("Will skip officials with existing files")
    else:
        print("Will re-scrape ALL officials")

    process_transcripts(from_year, to_year, skip_existing=skip_existing_files)


if __name__ == "__main__":
    main()

  while text_start < len(transcript_text) and transcript_text[text_start] in '.:;\s\n\d':


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
✗ No match for: CHAIRMAN
✗ No match for: MR
✓ Matched: CHAIRMAN GREENSPAN -> Alan Greenspan (fuzzy: 1.00)
✓ Matched: CHAIRMAN GREENSPAN -> Alan Greenspan (fuzzy: 1.00)
✗ No match for: MR
✓ Matched: CHAIRMAN GREENSPAN -> Alan Greenspan (fuzzy: 1.00)
✗ No match for: MR
✓ Matched: CHAIRMAN GREENSPAN -> Alan Greenspan (fuzzy: 1.00)
✗ No match for: MR. PRELL
✓ Matched: CHAIRMAN GREENSPAN -> Alan Greenspan (fuzzy: 1.00)
✓ Matched: MR. JORDAN -> Jerry Jordan (fuzzy: 1.00)
✗ No match for: MR
✓ Matched: CHAIRMAN GREENSPAN -> Alan Greenspan (fuzzy: 1.00)
✗ No match for: MR
✗ No match for: MR. KOHN
✗ No match for: MR
✗ No match for: MR. KOHN
✗ No match for: MR
✗ No match for: MR
✓ Matched: MR. GRAMLICH -> Edward M. Gramlich (fuzzy: 1.00)
✗ No match for: MR
✗ No match for: CHAIRMAN
✓ Matched: MR. PARRY -> Robert T. Parry (fuzzy: 1.00)
✗ No match for: MR. KOHN
✓ Matched: MR. PARRY -> Robert T. Parry (fuzzy: 1.00)
✗ No match for: CHAIR

In [18]:
"""
BATCH FIX ALL CSV FILES
Applies contamination fix + page number cleaning to entire directory
"""

import pandas as pd
import re
import glob
import os
from datetime import datetime


def clean_page_numbers_comprehensive(text):
    """
    IMPROVED: Remove ALL page number patterns including inline numbers
    """
    if not text:
        return text

    # Standalone digits between newlines
    text = re.sub(r'\n\s*\d{1,2}\s*\n', '\n', text)

    # CRITICAL: Digit at start of line followed by space and text (inline page numbers)
    # This catches: "\n7 to fill some orders" → "\nto fill some orders"
    text = re.sub(r'\n\s*\d{1,2}\s+(?=[a-z])', '\n', text)
    text = re.sub(r'\n\s*\d{1,2}\s+(?=[A-Z])', '\n', text)

    # At very beginning/end
    text = re.sub(r'^\s*\d{1,2}\s*\n', '', text)
    text = re.sub(r'\n\s*\d{1,2}\s*$', '', text)

    # Dash-formatted page numbers
    text = re.sub(r'\s*-\s*\d{1,3}\s*-\s*', ' ', text)
    text = re.sub(r'\n\s*-\s*\d{1,3}\s*-\s*\n', '\n', text)

    # "X of Y" patterns
    text = re.sub(r'\b\d+\s+of\s+\d+\b', '', text, flags=re.IGNORECASE)

    # "Page X" patterns
    text = re.sub(r'\bPage\s+\d+', '', text, flags=re.IGNORECASE)

    return text


def detect_speaker_transition(text, assigned_speaker):
    """
    IMPROVED: Detect if another speaker appears using ALL CAPS detection
    Now handles inline speakers (no newline required after name)
    Returns: (has_transition, position, speaker_name)
    """

    # IMPROVED patterns that don't require newline at the end
    patterns = [
        # Title + ALL CAPS name with footnote, followed by space OR newline
        # Matches: "\nMR. KOS.1 Thank you" OR "\nMR. KOS.1\nThank you"
        r'\n\s*((?:MR|MS|DR|CHAIRMAN|CHAIR|VICE CHAIRMAN|VICE CHAIR|GOVERNOR|PRESIDENT)\.?\s+[A-Z][A-Z]+(?:\s+[A-Z]+)*)\.?\d*\.?\s',

        # Just ALL CAPS name with footnote, followed by space OR newline
        r'\n\s*([A-Z][A-Z]+(?:\s+[A-Z]+)*)\.?\d*\.?\s',

        # Simple ALL CAPS (2+ letters)
        r'\n\s*([A-Z]{2,}(?:\s+[A-Z]{2,})*)\.?\d*\.?\s',
    ]

    # Normalize assigned speaker name
    assigned_upper = assigned_speaker.upper()
    # Remove common prefixes
    for prefix in ['CHAIRMAN ', 'VICE CHAIR ', 'VICE CHAIRMAN ', 'GOVERNOR ', 'PRESIDENT ', 'MR. ', 'MS. ', 'DR. ']:
        assigned_upper = assigned_upper.replace(prefix, '')
    assigned_upper = assigned_upper.strip()

    earliest_match = None
    earliest_pos = float('inf')
    earliest_speaker = None

    for pattern in patterns:
        matches = re.finditer(pattern, text, re.MULTILINE)

        for match in matches:
            embedded_speaker_raw = match.group(1).strip()

            # Clean footnotes
            embedded_speaker = re.sub(r'\.?\d+$', '', embedded_speaker_raw).strip().rstrip('.')
            embedded_upper = embedded_speaker.upper()

            # Skip if same speaker (flexible matching)
            if (embedded_upper in assigned_upper or
                assigned_upper in embedded_upper or
                embedded_upper == assigned_upper):
                continue

            # Skip non-speakers
            non_speakers = [
                'FEDERAL RESERVE', 'BOARD', 'MEETING', 'TRANSCRIPT', 'PAGE',
                'APPENDIX', 'EXHIBIT', 'TABLE', 'JANUARY', 'FEBRUARY', 'MARCH',
                'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER',
                'NOVEMBER', 'DECEMBER', 'FOMC', 'GDP', 'CPI', 'NOTES', 'NOTE',
                'SEVERAL'  # Common when multiple people speak
            ]

            if any(word in embedded_upper for word in non_speakers):
                continue

            # Check if at start of line
            before_match = text[:match.start()]
            last_newline = before_match.rfind('\n')

            if last_newline != -1:
                text_after_newline = before_match[last_newline+1:].strip()
            else:
                text_after_newline = before_match.strip()

            # Skip mid-sentence references (but be more lenient)
            if len(text_after_newline) > 30:
                continue

            # Found genuine speaker transition
            if match.start() < earliest_pos:
                earliest_pos = match.start()
                earliest_match = match
                earliest_speaker = embedded_speaker

    if earliest_match:
        return True, earliest_pos, earliest_speaker

    return False, None, None


def fix_file(filepath, verbose=True):
    """
    Fix contamination in a single CSV file
    Returns: (fixed_count, total_entries)
    """
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        print(f"✗ Error reading {filepath}: {e}")
        return 0, 0

    fixed_count = 0

    for idx, row in df.iterrows():
        text = str(row['Text'])
        assigned_speaker = str(row.get('Original_Speaker', row.get('Name', 'Unknown')))

        # Clean page numbers
        text_cleaned = clean_page_numbers_comprehensive(text)

        # Check for contamination
        has_transition, position, embedded_speaker = detect_speaker_transition(text_cleaned, assigned_speaker)

        if has_transition:
            # Truncate at transition
            cleaned_text = text_cleaned[:position].strip()

            if verbose:
                print(f"  Fixed row {idx}: {assigned_speaker} (removed {len(text_cleaned) - len(cleaned_text)} chars containing '{embedded_speaker}')")

            df.at[idx, 'Text'] = cleaned_text
            fixed_count += 1
        elif text != text_cleaned:
            # Just page number cleaning
            df.at[idx, 'Text'] = text_cleaned

    # Save if any changes made
    if fixed_count > 0:
        df.to_csv(filepath, index=False, encoding='utf-8', escapechar='\\')

    return fixed_count, len(df)


def batch_fix_all_files(directory='official_transcripts', backup=True):
    """
    Fix all CSV files in directory
    """
    files = glob.glob(f'{directory}/*.csv')

    if not files:
        print(f"No CSV files found in {directory}")
        return

    print("="*80)
    print("BATCH FIX - ALL CSV FILES")
    print("="*80)
    print(f"Directory: {directory}")
    print(f"Files found: {len(files)}")
    print(f"Backup: {'YES' if backup else 'NO'}")
    print("="*80)

    # Create backup
    if backup:
        backup_dir = f"{directory}_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        os.makedirs(backup_dir, exist_ok=True)
        print(f"Backups: {backup_dir}\n")

    total_fixed = 0
    total_entries = 0
    files_with_fixes = 0

    for filepath in sorted(files):
        filename = os.path.basename(filepath)

        print(f"\nProcessing: {filename}")

        # Backup
        if backup:
            import shutil
            backup_path = os.path.join(backup_dir, filename)
            shutil.copy2(filepath, backup_path)

        # Fix
        fixed_count, entry_count = fix_file(filepath, verbose=False)

        total_fixed += fixed_count
        total_entries += entry_count

        if fixed_count > 0:
            files_with_fixes += 1
            print(f"  ✓ Fixed {fixed_count}/{entry_count} entries")
        else:
            print(f"  ✓ Clean ({entry_count} entries)")

    # Summary
    print("\n" + "="*80)
    print("BATCH FIX SUMMARY")
    print("="*80)
    print(f"Files processed: {len(files)}")
    print(f"Files with fixes: {files_with_fixes}")
    print(f"Total entries: {total_entries}")
    print(f"Fixed entries: {total_fixed}")

    if total_entries > 0:
        contamination_rate = (total_fixed / total_entries) * 100
        print(f"Contamination rate: {contamination_rate:.2f}%")

    if backup:
        print(f"\nBackups saved to: {backup_dir}")

    print("="*80)


if __name__ == "__main__":
    import sys

    directory = 'official_transcripts'

    # Check if directory exists
    if not os.path.exists(directory):
        print(f"Error: Directory '{directory}' not found")
        print("Update the 'directory' variable in this script to point to your CSV files")
        sys.exit(1)

    # Confirm before running
    print("\nBATCH FIX ALL CSV FILES")
    print(f"Directory: {directory}")
    print("\nThis will:")
    print("1. Create backups of all CSV files")
    print("2. Remove speaker contamination")
    print("3. Clean page numbers")
    print("4. Overwrite original files with fixed versions")

    response = input("\nContinue? (yes/no): ").strip().lower()

    if response not in ['yes', 'y']:
        print("Cancelled.")
        sys.exit(0)

    print("\n")
    batch_fix_all_files(directory, backup=True)
    print("\nDone!")


BATCH FIX ALL CSV FILES
Directory: official_transcripts

This will:
1. Create backups of all CSV files
2. Remove speaker contamination
3. Clean page numbers
4. Overwrite original files with fixed versions

Continue? (yes/no): yes


BATCH FIX - ALL CSV FILES
Directory: official_transcripts
Files found: 55
Backup: YES
Backups: official_transcripts_backup_20260120_183834


Processing: Alan_Greenspan.csv
  ✓ Clean (863 entries)

Processing: Alfred_Broaddus.csv
  ✓ Clean (23 entries)

Processing: Anthony_M_Santomero.csv
  ✓ Clean (19 entries)

Processing: Ben_Bernanke.csv
  ✓ Fixed 43/6838 entries

Processing: Charles_I_Plosser.csv
  ✓ Fixed 3/827 entries

Processing: Charles_L_Evans.csv
  ✓ Fixed 1/1183 entries

Processing: Daniel_Tarullo.csv
  ✓ Fixed 1/668 entries

Processing: Dennis_P_Lockhart.csv
  ✓ Clean (482 entries)

Processing: Donald_Kohn.csv
  ✓ Clean (15 entries)

Processing: Edward_G_Boehne.csv
  ✓ Clean (3 entries)

Processing: Edward_M_Gramlich.csv
  ✓ Clean (49 entries)

P

In [16]:
"""
IMPROVED TRANSCRIPT CLEANER
"""

import pandas as pd
import glob
import os
import re
import shutil
from datetime import datetime


def get_speaker_marker_pattern():
    """Return regex pattern that matches speaker markers"""
    return r'\b((?:MR|MS|DR|CHAIRMAN|CHAIR|VICE CHAIRMAN|VICE CHAIR|GOVERNOR|PRESIDENT)\.?\s+[A-Z]{2,}(?:\s+[A-Z]{2,})*)\.'


def protect_speaker_markers(text):
    """Replace speaker markers with placeholders to protect them during cleaning"""
    speaker_pattern = get_speaker_marker_pattern()
    markers = {}
    counter = [0]

    def replacer(match):
        marker = match.group(0)
        placeholder = f"__SPEAKER_MARKER_{counter[0]}__"
        markers[placeholder] = marker
        counter[0] += 1
        return placeholder

    # Protect speaker markers
    text = re.sub(r'\n\s*' + speaker_pattern, replacer, text)
    # Also protect markers at start of text
    text = re.sub(r'^\s*' + speaker_pattern, replacer, text)

    return text, markers


def restore_speaker_markers(text, markers):
    """Restore protected speaker markers"""
    for placeholder, marker in markers.items():
        text = text.replace(placeholder, marker)
    return text


def clean_text_comprehensive(text):
    """
    Comprehensive cleaning that PROTECTS speaker markers
    """
    if not text or pd.isna(text):
        return text

    # Step 1: Protect speaker markers
    text, speaker_markers = protect_speaker_markers(text)

    # Fix encoding issues
    text = text.replace('â€"', '—')
    text = text.replace('â€™', "'")
    text = text.replace('â€œ', '"')
    text = text.replace('â€', '"')

    # Protect U.S.
    text = re.sub(r'U\.S\.', 'U_S_', text)

    # ===== AGGRESSIVE PAGE NUMBER REMOVAL =====

    # 1. CRITICAL: Dash-formatted page numbers like "- 49 -" or "-49-"
    text = re.sub(r'\s*-\s*\d{1,3}\s*-\s*', ' ', text)
    text = re.sub(r'\n\s*-\s*\d{1,3}\s*-\s*\n', '\n', text)
    text = re.sub(r'^\s*-\s*\d{1,3}\s*-\s*', '', text)  # At start
    text = re.sub(r'-\s*\d{1,3}\s*-\s*$', '', text)  # At end

    # 2. Standalone numbers between newlines
    text = re.sub(r'\n\s*\d{1,3}\s*\n', '\n', text)

    # 3. At start of line before text
    text = re.sub(r'\n\s*\d{1,3}\s+(?=[A-Z])', '\n', text)

    # 4. At beginning of text
    text = re.sub(r'^\s*\d{1,3}\s*\n', '', text)

    # 5. At end of text
    text = re.sub(r'\n\s*\d{1,3}\s*$', '', text)

    # 6. After punctuation
    text = re.sub(r'([.!?])\s*\d{1,3}\s*\n', r'\1\n', text)

    # 7. "Page X of Y" patterns
    text = re.sub(r'\b(?:Page\s+)?\d+\s+of\s+\d+\b', '', text, flags=re.IGNORECASE)

    # 8. After commas/semicolons at line breaks
    text = re.sub(r'([,;])\s*\n\s*\d{1,3}\s+', r'\1 ', text)

    # 9. Within parentheses or brackets (sometimes page refs)
    text = re.sub(r'\(\s*\d{1,3}\s*\)', '', text)
    text = re.sub(r'\[\s*\d{1,3}\s*\]', '', text)

    # ===== DATE/HEADER REMOVAL =====

    # Full date headers with various dash types and page numbers
    text = re.sub(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:[-–—]\d{1,2})?,?\s+\d{4}\s*(?:\d+\s+of\s+\d+)?',
        '', text, flags=re.IGNORECASE
    )

    # Date with dash followed by number (like "March 19, 2003 150 of 255")
    text = re.sub(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:[-–—]\d{1,2})?,?\s+\d{4}[-–—]\d+',
        '', text, flags=re.IGNORECASE
    )

    # Session markers
    text = re.sub(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}--(?:Morning|Afternoon|Evening)\s+Session',
        '', text, flags=re.IGNORECASE
    )

    # Standalone session markers
    text = re.sub(r'\b(?:Morning|Afternoon|Evening)\s+Session\b', '', text, flags=re.IGNORECASE)

    # Numeric dates
    text = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}\s*\d*', '', text)

    # ===== FIX LINE BREAKS (BUT PROTECT SPEAKER MARKERS) =====

    # Pattern 1: lowercase to lowercase
    text = re.sub(r'([a-z])\n(?!__SPEAKER_MARKER_)([a-z])', r'\1 \2', text)

    # Pattern 2: lowercase to uppercase (but not speaker markers)
    # Only join if it's likely a continuation, not a new sentence
    text = re.sub(r'([a-z])\n(?!__SPEAKER_MARKER_)([A-Z][a-z])', r'\1 \2', text)

    # Pattern 3: uppercase to lowercase (single letter words like "I", "A")
    text = re.sub(r'([A-Z])\n(?!__SPEAKER_MARKER_)([a-z])', r'\1 \2', text)

    # Pattern 4: period/digit to lowercase (mid-sentence continuations)
    text = re.sub(r'([.0-9])\n(?!__SPEAKER_MARKER_)([a-z])', r'\1 \2', text)

    # After commas/semicolons
    text = re.sub(r'([,;])\n(?!__SPEAKER_MARKER_)(?=[a-z])', r'\1 ', text)

    # ===== REMOVE COMMON ARTIFACTS =====

    text = re.sub(r'SPEAKER\(\?\)\.?\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\bFOMC\s+Meeting\s+Transcript\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\bMeeting\s+of\s+the\s+Federal\s+Open\s+Market\s+Committee\b', '', text, flags=re.IGNORECASE)

    # Remove "The materials used by..." footnote references
    text = re.sub(r'The materials used by (?:Mr|Ms|Dr|Chairman|Chair)\.?\s+\w+\s+are appended[^.]*\.', '', text, flags=re.IGNORECASE)

    # Remove appendix references
    text = re.sub(r'\(appendix \d+\)', '', text, flags=re.IGNORECASE)

    # ===== SENTENCE FORMATTING =====

    # Add line break after periods followed by capital letters (but not protected markers or abbreviations)
    text = re.sub(r'\.(?!\s*U_S_)(?!\s*Mr)(?!\s*Ms)(?!\s*Dr)(?!__SPEAKER_MARKER_)(?!\s*\d)([A-Z])', r'.\n\1', text)

    # Restore U.S.
    text = text.replace('U_S_', 'U.S.')

    # ===== WHITESPACE CLEANUP =====

    text = re.sub(r'\n +', '\n', text)
    text = re.sub(r' +\n', '\n', text)
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Remove leading/trailing whitespace from each line
    lines = text.split('\n')
    lines = [line.strip() for line in lines if line.strip()]
    text = '\n'.join(lines)

    # Step 2: Restore speaker markers
    text = restore_speaker_markers(text, speaker_markers)

    return text.strip()


def analyze_file_issues(filepath):
    """Analyze a file for remaining issues"""
    try:
        df = pd.read_csv(filepath)
    except:
        return None

    issues = {
        'dash_page_nums': 0,
        'standalone_page_nums': 0,
        'page_of_patterns': 0,
        'date_headers': 0,
        'embedded_speaker_markers': 0,
        'line_breaks_to_fix': 0,
        'total_entries': len(df)
    }

    for idx, row in df.iterrows():
        text = str(row['Text'])

        # Count dash-formatted page numbers
        issues['dash_page_nums'] += len(re.findall(r'-\s*\d{1,3}\s*-', text))

        # Count standalone page numbers
        issues['standalone_page_nums'] += len(re.findall(r'\n\s*\d{1,3}\s*\n', text))

        # Count "X of Y" patterns
        issues['page_of_patterns'] += len(re.findall(r'\d+\s+of\s+\d+', text))

        # Count date headers
        issues['date_headers'] += len(re.findall(
            r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}[-–—,]\s*\d{4}',
            text
        ))

        # Count embedded speaker markers (speaker markers that appear mid-text)
        # Look for patterns like "text MS. JOHNSON" or "text\nMR. SMITH"
        speaker_pattern = r'\b((?:MR|MS|DR|CHAIRMAN|CHAIR|VICE CHAIRMAN|VICE CHAIR|GOVERNOR|PRESIDENT)\.?\s+[A-Z]{2,}(?:\s+[A-Z]{2,})*)\.'
        speaker_matches = list(re.finditer(speaker_pattern, text))

        # If there are speaker markers NOT at the beginning of the text, count them
        for match in speaker_matches:
            # Check if this marker is NOT at the start (first 20 chars)
            if match.start() > 20:
                issues['embedded_speaker_markers'] += 1

        # Count problematic line breaks
        issues['line_breaks_to_fix'] += len(re.findall(r'([a-z])\n([a-z])', text))
        issues['line_breaks_to_fix'] += len(re.findall(r'([a-z])\n([A-Z])', text))

    return issues


def clean_all_transcript_files(directory='official_transcripts', backup=True, test_mode=False):
    """
    Clean all transcript CSV files in a directory
    """

    files = glob.glob(f'{directory}/*.csv')

    if not files:
        print(f"No CSV files found in {directory}")
        return

    print("="*80)
    print("IMPROVED TRANSCRIPT CLEANER")
    print("="*80)
    print(f"Found {len(files)} CSV files")
    print(f"Backup: {'YES' if backup else 'NO'}")
    print(f"Test mode: {'YES - no changes will be made' if test_mode else 'NO - files will be modified'}")
    print("="*80)

    # Create backup directory if needed
    if backup and not test_mode:
        backup_dir = f"{directory}_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        os.makedirs(backup_dir, exist_ok=True)
        print(f"Backups will be saved to: {backup_dir}\n")

    # Statistics
    total_stats = {
        'files_processed': 0,
        'files_with_issues': 0,
        'total_issues_found': 0,
        'total_issues_fixed': 0,
        'total_entries_cleaned': 0
    }

    # Process each file
    for filepath in sorted(files):
        filename = os.path.basename(filepath)

        try:
            # Analyze before cleaning
            issues_before = analyze_file_issues(filepath)

            if issues_before is None:
                print(f"✗ {filename}: Error reading file")
                continue

            total_issues = (issues_before['dash_page_nums'] +
                          issues_before['standalone_page_nums'] +
                          issues_before['page_of_patterns'] +
                          issues_before['date_headers'] +
                          issues_before['embedded_speaker_markers'] +
                          issues_before['line_breaks_to_fix'])

            if total_issues == 0:
                print(f"✓ {filename}: Already clean (skipping)")
                continue

            print(f"\n{'='*80}")
            print(f"Processing: {filename}")
            print(f"{'='*80}")
            print(f"Entries: {issues_before['total_entries']}")
            print(f"Issues found:")
            print(f"  - Dash page numbers (-49-): {issues_before['dash_page_nums']}")
            print(f"  - Standalone page numbers: {issues_before['standalone_page_nums']}")
            print(f"  - 'X of Y' patterns: {issues_before['page_of_patterns']}")
            print(f"  - Date headers: {issues_before['date_headers']}")
            print(f"  - Embedded speaker markers: {issues_before['embedded_speaker_markers']}")
            print(f"  - Line breaks to fix: {issues_before['line_breaks_to_fix']}")
            print(f"  TOTAL: {total_issues}")

            if test_mode:
                print("TEST MODE: Would clean this file")
                total_stats['files_with_issues'] += 1
                total_stats['total_issues_found'] += total_issues
                continue

            # Load and clean
            df = pd.read_csv(filepath)

            # Apply cleaning to Text column
            print("Cleaning...", end=" ", flush=True)
            df['Text'] = df['Text'].apply(clean_text_comprehensive)
            print("✓")

            # Create backup
            if backup:
                backup_path = os.path.join(backup_dir, filename)
                shutil.copy2(filepath, backup_path)
                print(f"Backup created: {backup_path}")

            # Save cleaned version
            df.to_csv(filepath, index=False, encoding='utf-8', escapechar='\\')

            # Verify improvements
            issues_after = analyze_file_issues(filepath)
            total_issues_after = (issues_after['dash_page_nums'] +
                                issues_after['standalone_page_nums'] +
                                issues_after['page_of_patterns'] +
                                issues_after['date_headers'] +
                                issues_after['embedded_speaker_markers'] +
                                issues_after['line_breaks_to_fix'])

            issues_fixed = total_issues - total_issues_after

            # Update statistics
            total_stats['files_processed'] += 1
            total_stats['files_with_issues'] += 1
            total_stats['total_issues_found'] += total_issues
            total_stats['total_issues_fixed'] += issues_fixed
            total_stats['total_entries_cleaned'] += issues_before['total_entries']

            print(f"✓ Cleaned successfully!")
            print(f"  Issues remaining: {total_issues_after} (fixed {issues_fixed}/{total_issues})")

        except Exception as e:
            print(f"✗ Error processing {filename}: {e}")

    # Print summary
    print("\n" + "="*80)
    print("CLEANING SUMMARY")
    print("="*80)

    if test_mode:
        print(f"TEST MODE - No changes made")
        print(f"Files that need cleaning: {total_stats['files_with_issues']}")
        print(f"Total issues found: {total_stats['total_issues_found']}")
    else:
        print(f"Files processed: {total_stats['files_processed']}")
        print(f"Files with issues: {total_stats['files_with_issues']}")
        print(f"Total issues found: {total_stats['total_issues_found']}")
        print(f"Total issues fixed: {total_stats['total_issues_fixed']}")
        print(f"Success rate: {total_stats['total_issues_fixed']/total_stats['total_issues_found']*100:.1f}%")
        print(f"Total entries cleaned: {total_stats['total_entries_cleaned']}")

        if backup:
            print(f"\nBackups saved to: {backup_dir}")

    print("="*80)


if __name__ == "__main__":
    import sys

    # Configuration
    directory = 'official_transcripts'  # Change if your files are elsewhere
    create_backup = True                # Set to False if you don't want backups
    test_mode = False                   # Set to True to see what would be cleaned

    # Check if directory exists
    if not os.path.exists(directory):
        print(f"\nError: Directory '{directory}' not found!")
        print("Please update the 'directory' variable in this script.")
        sys.exit(1)

    print("\nIMPROVED TRANSCRIPT CLEANER")
    print("="*80)
    print(f"Target directory: {directory}")
    print(f"Create backups: {create_backup}")
    print(f"Test mode: {test_mode}")
    print("="*80)

    if not test_mode:
        # Confirm before proceeding
        response = input("\nProceed with cleaning? (yes/no): ").strip().lower()

        if response not in ['yes', 'y']:
            print("Operation cancelled.")
            sys.exit(0)

    print("\nStarting cleaning process...\n")
    clean_all_transcript_files(directory, backup=create_backup, test_mode=test_mode)
    print("\nDone!")


IMPROVED TRANSCRIPT CLEANER
Target directory: official_transcripts
Create backups: True
Test mode: False

Proceed with cleaning? (yes/no): yes

Starting cleaning process...

IMPROVED TRANSCRIPT CLEANER
Found 55 CSV files
Backup: YES
Test mode: NO - files will be modified
Backups will be saved to: official_transcripts_backup_20260120_181816


Processing: Alan_Greenspan.csv
Entries: 1881
Issues found:
  - Dash page numbers (-49-): 0
  - Standalone page numbers: 0
  - 'X of Y' patterns: 0
  - Date headers: 304
  - Embedded speaker markers: 44
  - Line breaks to fix: 94
  TOTAL: 442
Cleaning... ✓
Backup created: official_transcripts_backup_20260120_181816/Alan_Greenspan.csv
✓ Cleaned successfully!
  Issues remaining: 48 (fixed 394/442)

Processing: Alfred_Broaddus.csv
Entries: 172
Issues found:
  - Dash page numbers (-49-): 0
  - Standalone page numbers: 0
  - 'X of Y' patterns: 0
  - Date headers: 50
  - Embedded speaker markers: 0
  - Line breaks to fix: 27
  TOTAL: 77
Cleaning... ✓
Bac

In [None]:
import pandas as pd
import os
from collections import defaultdict

def clean_duplicate_transcripts(input_dir='official_transcripts', output_dir='cleaned_transcripts'):
    """
    Aggregates multiple transcript entries for the same official on the same date
    into a single entry by concatenating the text fields.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get list of all CSV files in the input directory
    csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]

    print(f"Found {len(csv_files)} CSV files to process")

    for csv_file in csv_files:
        input_path = os.path.join(input_dir, csv_file)
        print(f"Processing {csv_file}...")

        # Read the CSV file
        try:
            df = pd.read_csv(input_path, encoding='utf-8')
        except Exception as e:
            print(f"Error reading {csv_file}: {e}")
            continue

        if df.empty:
            print(f"  Skipping empty file: {csv_file}")
            continue

        # Ensure required columns are present
        required_columns = ['id', 'Date', 'Name', 'Role', 'Text', 'Transcript_URL']
        if not all(col in df.columns for col in required_columns):
            print(f"  Skipping {csv_file}: Missing required columns")
            continue

        # Group by Name and Date, aggregating other fields
        aggregated_data = []
        grouped = df.groupby(['Name', 'Date'])

        for (name, date), group in grouped:
            # Concatenate all text entries for this official-date pair
            combined_text = ' '.join(group['Text'].astype(str))

            # Take the first entry's values for other fields
            first_entry = group.iloc[0]

            # Generate a new ID without sequence number (e.g., ABroaddus_20000202_transcript)
            name_parts = name.split()
            if len(name_parts) >= 2:
                first_initial = name_parts[0][0].upper()
                last_name = name_parts[-1]
                name_part = f"{first_initial}{last_name}"
            else:
                name_part = name.replace(' ', '')
            date_part = date.replace('-', '')
            new_id = f"{name_part}_{date_part}_transcript"

            # Create the aggregated entry
            aggregated_data.append({
                'id': new_id,
                'Date': date,
                'Name': name,
                'Role': first_entry['Role'],
                'Original_Speaker': first_entry['Original_Speaker'],
                'Text': combined_text,
                'Transcript_URL': first_entry['Transcript_URL']
            })

        # Create a new DataFrame with aggregated data
        cleaned_df = pd.DataFrame(aggregated_data)

        # Save to new CSV in output directory
        output_filename = csv_file  # Keep the same filename
        output_path = os.path.join(output_dir, output_filename)
        cleaned_df.to_csv(output_path, index=False, encoding='utf-8', escapechar='\\')
        print(f"  Saved cleaned data to {output_path}: {len(cleaned_df)} entries (reduced from {len(df)})")

    print("\nCleaning complete!")

if __name__ == "__main__":
    clean_duplicate_transcripts()

Found 54 CSV files to process
Processing Alan_Greenspan.csv...
  Saved cleaned data to cleaned_transcripts/Alan_Greenspan.csv: 50 entries (reduced from 2750)
Processing Roger_W_Ferguson.csv...
  Saved cleaned data to cleaned_transcripts/Roger_W_Ferguson.csv: 50 entries (reduced from 401)
Processing William_McDonough.csv...
  Saved cleaned data to cleaned_transcripts/William_McDonough.csv: 27 entries (reduced from 271)
Processing Alfred_Broaddus.csv...
  Saved cleaned data to cleaned_transcripts/Alfred_Broaddus.csv: 36 entries (reduced from 177)
Processing Edward_G_Boehne.csv...
  Saved cleaned data to cleaned_transcripts/Edward_G_Boehne.csv: 2 entries (reduced from 11)
Processing Jerry_Jordan.csv...
  Saved cleaned data to cleaned_transcripts/Jerry_Jordan.csv: 23 entries (reduced from 159)
Processing Robert_T_Parry.csv...
  Saved cleaned data to cleaned_transcripts/Robert_T_Parry.csv: 35 entries (reduced from 275)
Processing Gary_H_Stern.csv...
  Saved cleaned data to cleaned_transcrip

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import os
import pandas as pd
import numpy as np
import re
import json
import random
import time
from glob import glob

# Set random seed for reproducibility
seed = int(time.time())
random.seed(seed)
np.random.seed(seed)

# Directory paths
input_dir = '/content/drive/MyDrive/FedComs/Transcripts/cleaned_transcripts'
cleaned_output_dir = '/content/drive/MyDrive/FedComs/Transcripts/final_transcripts'


# Create output directories if they don't exist
for directory in [cleaned_output_dir, validation_output_dir, summary_output_dir]:
    if not os.path.exists(directory):
        os.makedirs(directory)

os.chdir(summary_output_dir)
print(f"Current working directory: {os.getcwd()}")

# ============================================================================
# STEP 1: CLEAN TRANSCRIPTS
# ============================================================================

def clean_page_numbers(text):
    """Remove page number patterns from transcript text."""
    # Pattern 1: "April 26–27, 2011 52 of 244" or "April 26-27, 2011 52 of 244"
    # Handles both en-dash (–), em-dash (—), and regular dash (-)
    text = re.sub(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+'
        r'\d{1,2}(?:[-–—]\d{1,2})?,\s+\d{4}\s+\d+\s+of\s+\d+',
        '',
        text
    )

    # Pattern 2: Date ranges that may be incomplete/cut off
    # "April 30–May 1" or "April 30–" or similar with various dash types
    text = re.sub(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+'
        r'\d{1,2}[-–—]+'
        r'(?:(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2})?',
        '',
        text
    )

    # Pattern 3: Standalone page numbers like "52 of 244" that might be on their own line
    text = re.sub(r'\n\s*\d+\s+of\s+\d+\s*\n', '\n', text)

    # Pattern 4: Date formats with forward slashes: "6/26-27/01 87"
    text = re.sub(
        r'\d{1,2}/\d{1,2}(?:[-–—]\d{1,2})?/\d{2,4}\s+\d+',
        '',
        text
    )

    return text

def fix_text_encoding(text):
    """Fix common text encoding issues."""
    text = text.replace('â€"', '—')
    text = text.replace('â€"', '—')
    text = text.replace('â€œ', '"')
    text = text.replace('â€', '"')
    text = text.replace('\u2013', '–')
    text = text.replace('\u2014', '—')
    text = text.replace('\u2018', "'")
    text = text.replace('\u2019', "'")
    text = text.replace('\u201c', '"')
    text = text.replace('\u201d', '"')
    text = text.replace('\u2026', '...')
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    return text

def clean_transcript_text(text):
    """Clean transcript text by fixing encoding and removing page numbers."""
    text = fix_text_encoding(text)
    text = clean_page_numbers(text)

    # Clean up excessive whitespace
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines to double
    text = re.sub(r' +', ' ', text)  # Multiple spaces to single

    return text.strip()

print("\nCleaning transcript files...")
print(f"Reading from: {input_dir}")

# Get all CSV files in the input directory
csv_files = glob(os.path.join(input_dir, '*.csv'))
print(f"Found {len(csv_files)} transcript files")

if len(csv_files) == 0:
    print("ERROR: No CSV files found in input directory!")
    print(f"Please check that files exist in: {input_dir}")
else:
    # Process each file
    cleaned_count = 0
    for csv_file in csv_files:
        filename = os.path.basename(csv_file)
        print(f"Processing {filename}...")

        try:
            # Read the transcript file
            df = pd.read_csv(csv_file)

            if 'Text' not in df.columns:
                print(f"  Warning: No 'Text' column found in {filename}, skipping...")
                continue

            # Clean the text
            df['Text'] = df['Text'].apply(lambda x: clean_transcript_text(str(x)) if pd.notna(x) else '')

            # Save cleaned version
            output_file = os.path.join(cleaned_output_dir, filename.replace('cleaned_', ''))
            df.to_csv(output_file, index=False)
            cleaned_count += 1

        except Exception as e:
            print(f"  Error processing {filename}: {e}")
            continue

    print(f"\nCleaned {cleaned_count} transcript files")
    print(f"Cleaned files saved to: {cleaned_output_dir}")