In [13]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import os
import pandas as pd
import numpy as np
import re
import json
import random
import time
from glob import glob

# Seed Python's and NumPy's global RNGs from the current wall-clock time.
# NOTE: the seed is different on every run, so a run can only be reproduced
# if this `seed` value is recorded and reused.
seed = int(time.time())
random.seed(seed)
np.random.seed(seed)

# Directory paths (under the Google Drive mount at /content/drive)
# input_dir: folder searched for the raw speech CSV files
input_dir = '/content/drive/MyDrive/FedComs/Speeches/fed_speeches'
# cleaned_output_dir: folder the cleaned CSV copies are written to
cleaned_output_dir = '/content/drive/MyDrive/FedComs/Speeches/fed_speeches_clean'

# ============================================================================
# STEP 1: CLEAN SPEECHES
# ============================================================================

def remove_video_controls_text(text):
    """Strip the video-player keyboard-shortcut boilerplate from *text*.

    Two case-insensitive passes are made. The first deletes the exact known
    wording of the instructions. The second deletes any remaining span that
    starts at "[Space Bar]" and runs to the nearest following
    "caption on/off." (plus an optional ';' and trailing whitespace), which
    catches variants with different spacing or line breaks.
    """
    exact_fragments = [
        r'\[Space Bar\] toggles play/pause;',
        r'\[Right/Left Arrows\] seeks the video forwards and back \(5 sec \);',
        r'\[Up/Down Arrows\] increase/decrease volume;',
        r'\[F\] toggles fullscreen on/off \(Except IE 11\);',
        r'The \[Tab\] key may be used in combination with the \[Enter/Return\] key '
        r'to navigate and activate control buttons, such as caption on/off\.;',
    ]
    exact_notice = re.compile(''.join(exact_fragments), re.IGNORECASE)
    loose_notice = re.compile(
        r'\[Space Bar\].*?caption on/off\.;?\s*', re.IGNORECASE | re.DOTALL
    )

    without_exact = exact_notice.sub('', text)
    return loose_notice.sub('', without_exact)

def remove_references_section(text):
    """
    Cut a trailing bibliography off *text*.

    A line consisting solely of "Reference" or "References" (any case,
    surrounding whitespace ignored) is treated as the section header only if
    the next three lines contain something that looks like a citation (a
    year, "Name, X.", a journal/working-paper phrase, a URL or a DOI). The
    first header that passes this check wins: it and everything after it is
    dropped. If no header qualifies, *text* is returned unchanged.
    """
    citation_hints = (
        r'\d{4}[a-z]?\.?\s',  # Year followed by period/space
        r'\([12]\d{3}[a-z]?\)',  # Year in parentheses
        r'[A-Z][a-z]+,\s+[A-Z]\.',  # Last name, First initial
        r'Journal of',
        r'Review of',
        r'Federal Reserve',
        r'Working Paper',
        r'https?://',
        r'doi:',
    )
    all_lines = text.split('\n')

    for position, raw_line in enumerate(all_lines):
        if not re.match(r'^references?$', raw_line.strip(), re.IGNORECASE):
            continue

        # Slicing clamps at the end of the list, so a header near the end
        # simply gets a shorter look-ahead window.
        lookahead = '\n'.join(all_lines[position + 1:position + 4])
        for hint in citation_hints:
            if re.search(hint, lookahead, re.IGNORECASE):
                return '\n'.join(all_lines[:position])

    return text

def fix_text_encoding(text):
    """Repair common mojibake and normalise typographic punctuation.

    Steps, in order:
      1. Replace the three-character sequences produced when UTF-8
         punctuation is decoded as Windows-1252 (they all start with
         a-circumflex followed by the euro sign) with the character they
         stood for. Specific sequences are handled before the bare
         two-character prefix; otherwise the prefix rule would turn a
         broken apostrophe into a double quote followed by a stray
         trademark sign.
      2. Replace curly single/double quotes with straight ones and the
         ellipsis character with three periods.
      3. Delete C0/C1 control characters, keeping tab, newline and
         carriage return.

    Genuine en and em dashes are left untouched.

    Args:
        text: The string to repair.

    Returns:
        The repaired string.
    """
    prefix = '\u00e2\u20ac'  # a-circumflex + euro sign
    mojibake_fixes = (
        (prefix + '\u201d', '\u2014'),  # em dash
        (prefix + '\u201c', '\u2013'),  # en dash
        (prefix + '"', '\u2014'),       # dash whose last char was already flattened to an ASCII quote
        (prefix + '\u2122', "'"),       # right single quote / apostrophe
        (prefix + '\u02dc', "'"),       # left single quote
        (prefix + '\u00a6', '...'),     # ellipsis
        (prefix + '\u0153', '"'),       # left double quote
        (prefix, '"'),                  # right double quote (third byte has no cp1252 glyph); must stay last
    )
    for broken, fixed in mojibake_fixes:
        text = text.replace(broken, fixed)

    straight_punctuation = str.maketrans({
        '\u2018': "'",
        '\u2019': "'",
        '\u201c': '"',
        '\u201d': '"',
        '\u2026': '...',
    })
    text = text.translate(straight_punctuation)

    return re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)

def clean_speech_text(text):
    """Return *text* with encoding repaired, boilerplate removed and whitespace tidied.

    The pipeline is: encoding fixes, then video-control notice removal, then
    references-section removal. Afterwards runs of three or more line breaks
    (possibly with whitespace in between) collapse to one blank line, runs of
    spaces collapse to a single space, and the result is stripped.
    """
    content = remove_references_section(
        remove_video_controls_text(fix_text_encoding(text))
    )

    # Clean up excessive whitespace
    blank_lines_collapsed = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
    spaces_collapsed = re.sub(r' +', ' ', blank_lines_collapsed)

    return spaces_collapsed.strip()

# Clean every speech CSV found in input_dir and write the result, under the
# same file name, to cleaned_output_dir.
print("\nCleaning speech files...")
print(f"Reading from: {input_dir}")

# Get all CSV files in the input directory
csv_files = glob(os.path.join(input_dir, '*.csv'))
print(f"Found {len(csv_files)} speech files")

if not csv_files:
    print("ERROR: No CSV files found in input directory!")
    print(f"Please check that files exist in: {input_dir}")
else:
    # Nothing in this cell creates the output folder. Without it every
    # to_csv call below would fail and only be reported file by file.
    os.makedirs(cleaned_output_dir, exist_ok=True)

    # Process each file
    cleaned_count = 0
    for csv_file in csv_files:
        filename = os.path.basename(csv_file)
        print(f"Processing {filename}...")

        try:
            # Read the speech file; undecodable bytes become U+FFFD instead of raising
            df = pd.read_csv(csv_file, encoding='utf-8', encoding_errors='replace')

            # The text column is the first one whose name contains "text"
            text_col = next((col for col in df.columns if 'text' in col.lower()), None)

            if text_col is None:
                print(f"  Warning: No text column found in {filename}, skipping...")
                continue

            # Clean the text; missing cells become empty strings
            df[text_col] = df[text_col].apply(lambda x: clean_speech_text(str(x)) if pd.notna(x) else '')

            # Save cleaned version
            output_file = os.path.join(cleaned_output_dir, filename)
            df.to_csv(output_file, index=False)
            cleaned_count += 1

        except Exception as e:
            # Best effort: report the failure and carry on with the next file
            print(f"  Error processing {filename}: {e}")
            continue

    print(f"\nCleaned {cleaned_count} speech files")
    print(f"Cleaned files saved to: {cleaned_output_dir}")


Mounted at /content/drive

Cleaning speech files...
Reading from: /content/drive/MyDrive/FedComs/Speeches/fed_speeches
Found 25 speech files
Processing susan_bies_speeches.csv...
Processing frederic_mishkin_speeches.csv...
Processing ben_bernanke_speeches.csv...
Processing donald_kohn_speeches.csv...
Processing mark_olson_speeches.csv...
Processing roger_jr_speeches.csv...
Processing kevin_warsh_speeches.csv...
Processing randall_kroszner_speeches.csv...
Processing sarah_raskin_speeches.csv...
Processing janet_yellen_speeches.csv...
Processing elizabeth_duke_speeches.csv...
Processing daniel_tarullo_speeches.csv...
Processing jeremy_stein_speeches.csv...
Processing jerome_powell_speeches.csv...
Processing stanley_fischer_speeches.csv...
Processing lael_brainard_speeches.csv...
Processing randal_quarles_speeches.csv...
Processing richard_clarida_speeches.csv...
Processing michelle_bowman_speeches.csv...
Processing christopher_waller_speeches.csv...
Processing michael_barr_speeches.csv..

In [12]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import os
import pandas as pd
import numpy as np
import re
import json
import random
import time
from glob import glob

# Seed Python's and NumPy's global RNGs from the current wall-clock time.
# NOTE: the seed is different on every run, so a run can only be reproduced
# if this `seed` value is recorded and reused.
seed = int(time.time())
random.seed(seed)
np.random.seed(seed)

# Directory paths (under the Google Drive mount at /content/drive)
dict_dir = '/content/drive/MyDrive/FedComs/Dictionaries'
input_dir = '/content/drive/MyDrive/FedComs/Speeches/fed_speeches'
cleaned_output_dir = '/content/drive/MyDrive/FedComs/Speeches/fed_speeches_clean'
validation_output_dir = '/content/drive/MyDrive/FedComs/Validation_Sets'
summary_output_dir = '/content/drive/MyDrive/FedComs/Speeches'

# Create output directories if they don't exist. exist_ok=True avoids the
# separate existence check (and the race between checking and creating).
for directory in [cleaned_output_dir, validation_output_dir, summary_output_dir]:
    os.makedirs(directory, exist_ok=True)

# Relative paths used later resolve against the summary folder
os.chdir(summary_output_dir)
print(f"Current working directory: {os.getcwd()}")

# ============================================================================
# STEP 1: CLEAN SPEECHES
# ============================================================================

def remove_video_controls_text(text):
    """Strip the video-player keyboard-shortcut boilerplate from *text*.

    Two case-insensitive passes are made. The first deletes the exact known
    wording of the instructions. The second deletes any remaining span that
    starts at "[Space Bar]" and runs to the nearest following
    "caption on/off." (plus an optional ';' and trailing whitespace), which
    catches variants with different spacing or line breaks.
    """
    exact_fragments = [
        r'\[Space Bar\] toggles play/pause;',
        r'\[Right/Left Arrows\] seeks the video forwards and back \(5 sec \);',
        r'\[Up/Down Arrows\] increase/decrease volume;',
        r'\[F\] toggles fullscreen on/off \(Except IE 11\);',
        r'The \[Tab\] key may be used in combination with the \[Enter/Return\] key '
        r'to navigate and activate control buttons, such as caption on/off\.;',
    ]
    exact_notice = re.compile(''.join(exact_fragments), re.IGNORECASE)
    loose_notice = re.compile(
        r'\[Space Bar\].*?caption on/off\.;?\s*', re.IGNORECASE | re.DOTALL
    )

    without_exact = exact_notice.sub('', text)
    return loose_notice.sub('', without_exact)

def remove_references_section(text):
    """
    Cut a trailing bibliography off *text*.

    A line consisting solely of "Reference" or "References" (any case,
    surrounding whitespace ignored) is treated as the section header only if
    the next three lines contain something that looks like a citation (a
    year, "Name, X.", a journal/working-paper phrase, a URL or a DOI). The
    first header that passes this check wins: it and everything after it is
    dropped. If no header qualifies, *text* is returned unchanged.
    """
    citation_hints = (
        r'\d{4}[a-z]?\.?\s',  # Year followed by period/space
        r'\([12]\d{3}[a-z]?\)',  # Year in parentheses
        r'[A-Z][a-z]+,\s+[A-Z]\.',  # Last name, First initial
        r'Journal of',
        r'Review of',
        r'Federal Reserve',
        r'Working Paper',
        r'https?://',
        r'doi:',
    )
    all_lines = text.split('\n')

    for position, raw_line in enumerate(all_lines):
        if not re.match(r'^references?$', raw_line.strip(), re.IGNORECASE):
            continue

        # Slicing clamps at the end of the list, so a header near the end
        # simply gets a shorter look-ahead window.
        lookahead = '\n'.join(all_lines[position + 1:position + 4])
        for hint in citation_hints:
            if re.search(hint, lookahead, re.IGNORECASE):
                return '\n'.join(all_lines[:position])

    return text

def fix_text_encoding(text):
    """Repair common mojibake and normalise typographic punctuation.

    Steps, in order:
      1. Replace the three-character sequences produced when UTF-8
         punctuation is decoded as Windows-1252 (they all start with
         a-circumflex followed by the euro sign) with the character they
         stood for. Specific sequences are handled before the bare
         two-character prefix; otherwise the prefix rule would turn a
         broken apostrophe into a double quote followed by a stray
         trademark sign.
      2. Replace curly single/double quotes with straight ones and the
         ellipsis character with three periods.
      3. Delete C0/C1 control characters, keeping tab, newline and
         carriage return.

    Genuine en and em dashes are left untouched.

    Args:
        text: The string to repair.

    Returns:
        The repaired string.
    """
    prefix = '\u00e2\u20ac'  # a-circumflex + euro sign
    mojibake_fixes = (
        (prefix + '\u201d', '\u2014'),  # em dash
        (prefix + '\u201c', '\u2013'),  # en dash
        (prefix + '"', '\u2014'),       # dash whose last char was already flattened to an ASCII quote
        (prefix + '\u2122', "'"),       # right single quote / apostrophe
        (prefix + '\u02dc', "'"),       # left single quote
        (prefix + '\u00a6', '...'),     # ellipsis
        (prefix + '\u0153', '"'),       # left double quote
        (prefix, '"'),                  # right double quote (third byte has no cp1252 glyph); must stay last
    )
    for broken, fixed in mojibake_fixes:
        text = text.replace(broken, fixed)

    straight_punctuation = str.maketrans({
        '\u2018': "'",
        '\u2019': "'",
        '\u201c': '"',
        '\u201d': '"',
        '\u2026': '...',
    })
    text = text.translate(straight_punctuation)

    return re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)

def clean_speech_text(text):
    """Return *text* with encoding repaired, boilerplate removed and whitespace tidied.

    The pipeline is: encoding fixes, then video-control notice removal, then
    references-section removal. Afterwards runs of three or more line breaks
    (possibly with whitespace in between) collapse to one blank line, runs of
    spaces collapse to a single space, and the result is stripped.
    """
    content = remove_references_section(
        remove_video_controls_text(fix_text_encoding(text))
    )

    # Clean up excessive whitespace
    blank_lines_collapsed = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
    spaces_collapsed = re.sub(r' +', ' ', blank_lines_collapsed)

    return spaces_collapsed.strip()

# Clean every speech CSV found in input_dir and write the result, under the
# same file name, to cleaned_output_dir.
print("\nCleaning speech files...")
print(f"Reading from: {input_dir}")

# Collect the CSV files sitting directly in the input directory
csv_files = glob(os.path.join(input_dir, '*.csv'))
print(f"Found {len(csv_files)} speech files")

if not csv_files:
    print("ERROR: No CSV files found in input directory!")
    print(f"Please check that files exist in: {input_dir}")
else:
    cleaned_count = 0
    for csv_file in csv_files:
        filename = os.path.basename(csv_file)
        print(f"Processing {filename}...")

        try:
            # Undecodable bytes are replaced rather than raising
            df = pd.read_csv(csv_file, encoding='utf-8', encoding_errors='replace')

            # The text column is the first one whose name contains "text"
            text_col = next(
                (candidate for candidate in df.columns if 'text' in candidate.lower()),
                None,
            )
            if text_col is None:
                print(f"  Warning: No text column found in {filename}, skipping...")
                continue

            # Missing cells become empty strings; everything else is cleaned
            df[text_col] = df[text_col].apply(
                lambda cell: '' if not pd.notna(cell) else clean_speech_text(str(cell))
            )

            # Write the cleaned copy under the same file name
            output_file = os.path.join(cleaned_output_dir, filename)
            df.to_csv(output_file, index=False)
            cleaned_count += 1

        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f"  Error processing {filename}: {e}")
            continue

    print(f"\nCleaned {cleaned_count} speech files")
    print(f"Cleaned files saved to: {cleaned_output_dir}")

# ============================================================================
# STEP 2: LOAD DICTIONARIES
# ============================================================================

print("\n" + "="*70)
print("LOADING DICTIONARIES")
print("="*70)

# The JSON files are opened explicitly as UTF-8 so that loading does not
# depend on the platform's default text encoding.

# Labor indicator name -> list of keywords (includes a "General Labor" entry)
with open(os.path.join(dict_dir, 'labor_indicators.json'), 'r', encoding='utf-8') as f:
    LABOR_INDICATORS = json.load(f)

# Inflation category -> {pattern name -> list of regex patterns}
with open(os.path.join(dict_dir, 'inflation_indicators.json'), 'r', encoding='utf-8') as f:
    INFLATION_INDICATORS = json.load(f)

# Pattern name -> reported inflation indicator name
with open(os.path.join(dict_dir, 'inflation_pattern_mapping.json'), 'r', encoding='utf-8') as f:
    INFLATION_PATTERN_TO_INDICATOR = json.load(f)

print("Dictionaries loaded successfully!")
print(f"Labor indicators: {list(LABOR_INDICATORS.keys())}")
print(f"Inflation categories: {list(INFLATION_INDICATORS.keys())}")

# ============================================================================
# STEP 3: SENTENCE SPLITTING AND CLASSIFICATION FUNCTIONS
# ============================================================================

def split_into_sentences(text):
    """Split text into sentences, preserving initials and abbreviations.

    Periods that do not end a sentence (inside known abbreviations, after
    name initials, inside decimal numbers) are temporarily swapped for
    sentinel strings, the text is split after '.', '!' or '?' followed by
    whitespace and a capital letter (or end of text), and the sentinels are
    then turned back into periods.

    Behaviour this version guarantees:
      * Abbreviations keep the case they had in the input ("a.m." stays
        "a.m."), because only their periods are masked.
      * A sentence that begins with a capitalised abbreviation ("U.S.",
        "Mr.") is split from the previous sentence, because the masked form
        still starts with its capital letter.
      * A "Voting for ..." / "Voting against ..." clause keeps its closing
        punctuation and is emitted as its own sentence.
      * Masked initials/abbreviations inside a voting clause are restored,
        because the clause is re-inserted before sentinels are resolved.

    Known limitation: a protected abbreviation at the very end of a sentence
    ("... in the U.S. The economy ...") still suppresses the split there.

    Args:
        text: Raw speech text.

    Returns:
        List of sentence strings, stripped, with empty pieces removed.
    """
    text = fix_text_encoding(text)

    abbreviations = [
        r'\bU\.S\.A\.', r'\bU\.S\.', r'\bU\.K\.', r'\bE\.U\.',
        r'\bSt\.', r'\bMr\.', r'\bMrs\.', r'\bMs\.', r'\bDr\.',
        r'\bProf\.', r'\bSr\.', r'\bJr\.', r'\bvs\.', r'\betc\.',
        r'\bi\.e\.', r'\be\.g\.', r'\bVol\.', r'\bNo\.', r'\bpp\.',
        r'\bCo\.', r'\bInc\.', r'\bLtd\.', r'\bCorp\.',
        r'\bPh\.D\.', r'\bM\.A\.', r'\bM\.S\.', r'\bB\.A\.',
        r'\bD\.C\.', r'\bA\.M\.', r'\bP\.M\.'
    ]

    def mask_periods(sentinel):
        """Build a re.sub callback that swaps every '.' in the match for *sentinel*."""
        return lambda match: match.group(0).replace('.', sentinel)

    # Longer abbreviations come first in the list, so "U.S.A." is masked
    # before the "U.S." pattern gets a chance to match its prefix.
    for abbr in abbreviations:
        text = re.sub(abbr, mask_periods('<ABBR>'), text, flags=re.IGNORECASE)

    # Name initials followed by a capitalised word, e.g. "Ben S. Bernanke"
    text = re.sub(r'\b([A-Z])\.(\s+[A-Z]\.)*(?=\s+[A-Z][a-z]+)', mask_periods('<NAME>'), text)
    # Decimal numbers, e.g. "2.5"
    text = re.sub(r'\b\d+\.\d+\b', mask_periods('<DEC>'), text)

    voting_pattern = r'((?:Voting for|Voting against)\s+[^.!?]+?)([.!?]+\s+|$)'
    voting_matches = []

    def store_voting_match(match):
        voting_matches.append(match.group(1))
        # Re-emit the terminator (group 2) so the clause stays a sentence
        # boundary instead of fusing with the text that follows it.
        return f'<VOTE_{len(voting_matches) - 1}>{match.group(2)}'

    text = re.sub(voting_pattern, store_voting_match, text)

    # A vote placeholder stands for text beginning with a capital "V", so it
    # is accepted as a sentence start alongside a literal capital letter.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z]|<VOTE_\d+>|$)', text)
    sentences = [s.strip() for s in sentences if s.strip()]

    restored_sentences = []
    for sentence in sentences:
        # Voting clauses first: their stored text may itself contain
        # sentinels that the next loop has to resolve.
        for i, voting_list in enumerate(voting_matches):
            sentence = sentence.replace(f'<VOTE_{i}>', voting_list)
        for sentinel in ('<ABBR>', '<NAME>', '<DEC>'):
            sentence = sentence.replace(sentinel, '.')
        restored_sentences.append(sentence)

    return restored_sentences

def check_keywords_in_sentence(sentence, keywords):
    """Return True if any keyword occurs in the sentence as a whole word or phrase.

    Matching is case-insensitive (both sides are lowercased) and each keyword
    is treated literally, bounded by regex word boundaries.
    """
    haystack = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack)
        for term in keywords
    )

def check_employment_indicator(sentence, keywords):
    """Return True if an Employment keyword occurs and no mandate phrase does.

    A sentence mentioning "maximum employment", "full employment" or
    "employment goal" is rejected outright, whatever else it contains.
    Otherwise the result is a case-insensitive whole-word search for any of
    *keywords*.
    """
    lowered = sentence.lower()

    mandate_phrases = (
        r'\b(?:maximum|full)\s+employment\b',
        r'\bemployment\s+goal\b',
    )
    if any(re.search(phrase, lowered) for phrase in mandate_phrases):
        return False

    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in keywords
    )

def check_general_labor_term(sentence):
    """Return True if the sentence contains any "General Labor" keyword.

    The keywords come from LABOR_INDICATORS["General Labor"] (an absent entry
    counts as no keywords) and are matched case-insensitively as whole words.
    """
    lowered = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in LABOR_INDICATORS.get("General Labor", [])
    )

def check_general_inflation_terms(sentence):
    """Return True if the sentence matches any general inflation pattern.

    The regex patterns are read from
    INFLATION_INDICATORS["General Inflation"]["general_patterns"]; a missing
    category or list counts as no patterns. Patterns are applied
    case-insensitively to the lowercased sentence.
    """
    lowered = sentence.lower()
    general_category = INFLATION_INDICATORS.get("General Inflation", {})
    return any(
        re.search(candidate, lowered, re.IGNORECASE)
        for candidate in general_category.get("general_patterns", [])
    )

def check_inflation_sentence(sentence):
    """Return the set of inflation indicator names the sentence mentions.

    Every pattern list in INFLATION_INDICATORS is tried; when any pattern in
    a list matches, the list's name is translated through
    INFLATION_PATTERN_TO_INDICATOR (falling back to "Other") and recorded.
    The broad labels "Core" and "Headline" are dropped whenever a more
    specific CPI/PCE variant of the same label was also found.
    """
    lowered = sentence.lower()
    found = set()

    for subcategories in INFLATION_INDICATORS.values():
        for pattern_name, pattern_list in subcategories.items():
            if any(re.search(candidate, lowered, re.IGNORECASE) for candidate in pattern_list):
                found.add(INFLATION_PATTERN_TO_INDICATOR.get(pattern_name, "Other"))

    refinements = (
        ("Core", ("Core_CPI", "Core_PCE")),
        ("Headline", ("Headline_CPI", "Headline_PCE")),
    )
    for broad_label, specific_labels in refinements:
        if not found.isdisjoint(specific_labels):
            found.discard(broad_label)

    return found

def classify_sentence(sentence):
    """Label one sentence as "Both", "Labor", "Inflation" or "Neither".

    Returns a dict with the label under 'classification' plus the specific
    labor and inflation indicators found ('labor_indicators',
    'inflation_indicators', both lists). A sentence counts as labor or
    inflation related if it names a specific indicator or matches the
    corresponding general terms.
    """
    labor_hits = set()
    for name, terms in LABOR_INDICATORS.items():
        if name == "General Labor":
            continue
        # "Employment" has its own matcher that excludes mandate phrases
        matcher = check_employment_indicator if name == "Employment" else check_keywords_in_sentence
        if matcher(sentence, terms):
            labor_hits.add(name)

    inflation_hits = check_inflation_sentence(sentence)

    has_labor = bool(labor_hits) or bool(check_general_labor_term(sentence))
    has_inflation = bool(inflation_hits) or bool(check_general_inflation_terms(sentence))

    labels = {
        (True, True): "Both",
        (True, False): "Labor",
        (False, True): "Inflation",
        (False, False): "Neither",
    }

    return {
        'classification': labels[(has_labor, has_inflation)],
        'labor_indicators': list(labor_hits),
        'inflation_indicators': list(inflation_hits)
    }

def analyze_speech(text):
    """Analyze a single speech for labor and inflation content.

    The speech is split into sentences and each sentence is classified once
    with classify_sentence. Sentence tallies are derived from the returned
    label ("Both" counts towards labor, inflation and both), so the
    general-term checks are not run a second time.

    Args:
        text: Full speech text.

    Returns:
        A tuple (summary_results, sentence_data_list).

        summary_results is a dict holding, in this order: the sentence
        tallies and the labor share of labor-or-inflation sentences;
        per-indicator counts ('labor_<name>_count',
        'inflation_<name>_count'); emphasis shares (an indicator's count
        divided by all indicator mentions of its kind); and per-indicator
        shares of all sentences. Every ratio is 0 when its denominator is 0.
        "General Labor", "General_Inflation" and "Other" get no
        per-indicator entries.

        sentence_data_list has one dict per sentence with its 1-based
        number, text, classification and comma-separated, sorted indicator
        names.
    """
    def ratio(count, total):
        """Return count / total, or 0 when total is not positive."""
        return count / total if total > 0 else 0

    sentences = split_into_sentences(text)
    total_sentences = len(sentences)

    unreported_inflation = ("General_Inflation", "Other")

    labor_indicator_counts = {
        indicator: 0 for indicator in LABOR_INDICATORS if indicator != "General Labor"
    }
    inflation_indicator_counts = {
        indicator: 0
        for indicator in sorted(set(INFLATION_PATTERN_TO_INDICATOR.values()))
        if indicator not in unreported_inflation
    }

    labor_sentences = 0
    inflation_sentences = 0
    both_sentences = 0
    sentence_data_list = []

    for sentence_number, sentence in enumerate(sentences, start=1):
        result = classify_sentence(sentence)
        label = result['classification']

        labor_named = sorted(
            ind for ind in result['labor_indicators'] if ind != "General Labor"
        )
        inflation_named = sorted(
            ind for ind in result['inflation_indicators'] if ind not in unreported_inflation
        )

        sentence_data_list.append({
            'sentence_number': sentence_number,
            'sentence_text': sentence,
            'classification': label,
            'labor_indicators': ', '.join(labor_named),
            'inflation_indicators': ', '.join(inflation_named)
        })

        # A "Both" sentence is counted in all three tallies
        if label == "Both":
            both_sentences += 1
        if label in ("Labor", "Both"):
            labor_sentences += 1
        if label in ("Inflation", "Both"):
            inflation_sentences += 1

        for indicator in result['labor_indicators']:
            if indicator in labor_indicator_counts:
                labor_indicator_counts[indicator] += 1

        for indicator in result['inflation_indicators']:
            if indicator in inflation_indicator_counts:
                inflation_indicator_counts[indicator] += 1

    total_labor_mentions = sum(labor_indicator_counts.values())
    total_inflation_mentions = sum(inflation_indicator_counts.values())

    # Sentences about labor or inflation; "Both" sentences counted once
    labor_inflation_total = labor_sentences + inflation_sentences - both_sentences

    summary_results = {
        'sentences_on_labor': labor_sentences,
        'sentences_on_inflation': inflation_sentences,
        'sentences_on_both': both_sentences,
        'total_sentences': total_sentences,
        'labor_share_of_labor_inflation_sentences': ratio(labor_sentences, labor_inflation_total)
    }

    # Key insertion order below fixes the column order of the summary table
    summary_results.update(
        {f'labor_{ind}_count': count for ind, count in labor_indicator_counts.items()}
    )
    summary_results.update(
        {f'inflation_{ind}_count': count for ind, count in inflation_indicator_counts.items()}
    )
    summary_results.update(
        {f"labor_emphasis_{ind}": ratio(count, total_labor_mentions)
         for ind, count in labor_indicator_counts.items()}
    )
    summary_results.update(
        {f"inflation_emphasis_{ind}": ratio(count, total_inflation_mentions)
         for ind, count in inflation_indicator_counts.items()}
    )
    summary_results.update(
        {f"labor_share_total_sentences_{ind}": ratio(count, total_sentences)
         for ind, count in labor_indicator_counts.items()}
    )
    summary_results.update(
        {f"inflation_share_total_sentences_{ind}": ratio(count, total_sentences)
         for ind, count in inflation_indicator_counts.items()}
    )

    return summary_results, sentence_data_list

# ============================================================================
# STEP 4: CLASSIFY SPEECH CONTENT
# ============================================================================

print("\n" + "="*70)
print("CLASSIFYING SPEECH CONTENT")
print("="*70)

cleaned_files = glob(os.path.join(cleaned_output_dir, '*.csv'))
print(f"Found {len(cleaned_files)} cleaned speech files to classify")

results_list = []   # one summary dict per speech
all_sentences = []  # one dict per sentence, across all speeches

for idx, csv_file in enumerate(cleaned_files):
    filename = os.path.basename(csv_file)

    # Progress line for every fifth file (every file when there are few)
    if idx % 5 == 0 or len(cleaned_files) <= 10:
        print(f"Processing file {idx+1}/{len(cleaned_files)}: {filename}")

    try:
        df = pd.read_csv(csv_file, encoding='utf-8', encoding_errors='replace')

        # The text column is the first one whose name contains "text"
        text_col = None
        for col in df.columns:
            if 'text' in col.lower():
                text_col = col
                break

        if text_col is None:
            print(f"  Warning: No text column in {filename}, skipping...")
            continue

        for row_idx, row in df.iterrows():
            text = '' if pd.isna(row[text_col]) else str(row[text_col])

            if not text.strip():
                continue

            summary_results, sentence_data_list = analyze_speech(text)

            # Copy the row's metadata into the summary. The detected text
            # column is skipped whatever its exact name is; otherwise a
            # column such as "full_text" would put the whole speech into
            # the summary table.
            for col in df.columns:
                if col == text_col or col.lower() in ('text', 'speech_text'):
                    continue
                summary_results[col] = str(row[col]) if pd.notna(row[col]) else ''

            results_list.append(summary_results)

            # Speaker and date are the same for every sentence of the
            # speech, so they are looked up once per row.
            official_name = ''
            for name_col in ['official_name', 'name', 'Name', 'speaker', 'Speaker']:
                if name_col in row and pd.notna(row[name_col]):
                    official_name = str(row[name_col])
                    break

            date_val = ''
            for date_col in ['date', 'Date']:
                if date_col in row and pd.notna(row[date_col]):
                    date_val = str(row[date_col])
                    break

            for sentence_data in sentence_data_list:
                sentence_data['official_name'] = official_name
                sentence_data['date'] = date_val
                all_sentences.append(sentence_data)

    except Exception as e:
        # Best effort: report the failure with its traceback, then continue
        print(f"  Error processing {filename}: {e}")
        import traceback
        print(f"  Full traceback: {traceback.format_exc()}")
        continue

# Create summary dataframe (one row per analyzed speech)
results_df = pd.DataFrame(results_list)

if len(results_df) > 0:
    # Move identifying columns to the front, keeping the rest in place
    priority_cols = ['date', 'Date', 'official_name', 'name', 'Name', 'title', 'Title']
    first_cols = [col for col in priority_cols if col in results_df.columns]
    other_cols = [col for col in results_df.columns if col not in first_cols]
    results_df = results_df[first_cols + other_cols]

    date_col = None
    for col in ['date', 'Date']:
        if col in results_df.columns:
            date_col = col
            break

    if date_col:
        # NOTE(review): metadata was stored as strings upstream, so this
        # looks like a lexicographic sort (chronological only for ISO-style
        # dates) -- confirm the date format.
        try:
            results_df = results_df.sort_values(date_col)
        except TypeError as e:
            # Values that cannot be compared: keep the unsorted order, but
            # say so instead of failing silently.
            print(f"Warning: could not sort by {date_col}: {e}")

    summary_output_file = os.path.join(summary_output_dir, 'speeches_content.csv')
    results_df.to_csv(summary_output_file, index=False)
    print(f"\nSummary dataset saved to: {summary_output_file}")
    print(f"Shape: {results_df.shape}")

    print("\n" + "="*70)
    print("SUMMARY STATISTICS")
    print("="*70)
    print(f"\nNumber of speeches analyzed: {len(results_df)}")
    print(f"\nAverage sentences per speech: {results_df['total_sentences'].mean():.1f}")
    print(f"Average labor sentences: {results_df['sentences_on_labor'].mean():.1f}")
    print(f"Average inflation sentences: {results_df['sentences_on_inflation'].mean():.1f}")
    print(f"Average sentences on both: {results_df['sentences_on_both'].mean():.1f}")
    print(f"Average labor share of labor/inflation: {results_df['labor_share_of_labor_inflation_sentences'].mean():.2%}")

    labor_emphasis_cols = [col for col in results_df.columns if col.startswith('labor_emphasis_')]
    print("\n" + "-"*70)
    print("AVERAGE LABOR EMPHASIS VECTORS")
    print("-"*70)
    for col in sorted(labor_emphasis_cols):
        indicator_name = col.replace('labor_emphasis_', '')
        avg_emphasis = results_df[col].mean()
        print(f"{indicator_name:20s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")

    total_labor_emphasis = results_df[labor_emphasis_cols].mean().sum()
    print(f"\n{'Total':20s}: {total_labor_emphasis:.4f}")

    inflation_emphasis_cols = [col for col in results_df.columns if col.startswith('inflation_emphasis_')]
    print("\n" + "-"*70)
    print("AVERAGE INFLATION EMPHASIS VECTORS")
    print("-"*70)
    for col in sorted(inflation_emphasis_cols):
        indicator_name = col.replace('inflation_emphasis_', '')
        avg_emphasis = results_df[col].mean()
        print(f"{indicator_name:20s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")

    total_inflation_emphasis = results_df[inflation_emphasis_cols].mean().sum()
    print(f"\n{'Total':20s}: {total_inflation_emphasis:.4f}")

    print("\n" + "-"*70)
    print("OFFICIALS BY LABOR SHARE OF LABOR/INFLATION")
    print("-"*70)

    official_col = None
    for col in ['official_name', 'name', 'Name', 'speaker', 'Speaker']:
        if col in results_df.columns:
            official_col = col
            break

    if official_col:
        # Mean labor share per official, restricted to officials with at
        # least two speeches, highest share first
        official_labor_share = results_df.groupby(official_col)['labor_share_of_labor_inflation_sentences'].agg(['mean', 'count'])
        official_labor_share = official_labor_share[official_labor_share['count'] >= 2]
        official_labor_share = official_labor_share.sort_values('mean', ascending=False)

        if len(official_labor_share) >= 4:
            top_4 = official_labor_share.head(4)
            bottom_4 = official_labor_share.tail(4)

            # (ranking heading, emphasis-section tag, rows) for each group;
            # both groups are reported with the same code
            official_groups = [
                ("Top 4 Officials (Highest Labor Share)", "TOP 4", top_4),
                ("Bottom 4 Officials (Lowest Labor Share)", "BOTTOM 4", bottom_4),
            ]
            # (heading, columns, column prefix) for each emphasis family
            emphasis_families = [
                ("Labor Emphasis:", labor_emphasis_cols, 'labor_emphasis_'),
                ("\nInflation Emphasis:", inflation_emphasis_cols, 'inflation_emphasis_'),
            ]

            for heading, _, group in official_groups:
                print(f"\n{heading}:")
                for idx, (official, row) in enumerate(group.iterrows(), 1):
                    print(f"{idx}. {official:30s}: {row['mean']:.2%} (n={int(row['count'])})")

            for _, tag, group in official_groups:
                print("\n" + "="*70)
                print(f"EMPHASIS VECTORS FOR {tag} OFFICIALS")
                print("="*70)

                for official_name in group.index:
                    official_speeches = results_df[results_df[official_col] == official_name]

                    print(f"\n{official_name} (Labor Share: {group.loc[official_name, 'mean']:.2%})")
                    print("-" * 70)

                    for family_heading, family_cols, prefix in emphasis_families:
                        print(family_heading)
                        for col in sorted(family_cols):
                            indicator_name = col.replace(prefix, '')
                            avg_emphasis = official_speeches[col].mean()
                            # Indicators the official never emphasised are omitted
                            if avg_emphasis > 0:
                                print(f"  {indicator_name:25s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")
        else:
            print(f"\nNot enough officials with multiple speeches (found {len(official_labor_share)})")
    else:
        print("\nCould not find official name column")

else:
    print("\nWarning: No results to save!")

# Build the sentence-level dataframe, then draw a class-stratified validation
# sample from it and write that sample to CSV.
sentences_df = pd.DataFrame(all_sentences)

if len(sentences_df) > 0:
    print(f"\nTotal sentences extracted: {len(sentences_df)}")
    print("\nClassification distribution:")
    print(sentences_df['classification'].value_counts())

    # Target number of validation sentences per classification.
    n_labor = 15
    n_inflation = 15
    n_both = 5
    n_neither = 10

    print("\nSampling sentences for validation...")
    validation_samples = []

    # One subset per classification label.
    labor_sentences, inflation_sentences, both_sentences, neither_sentences = (
        sentences_df[sentences_df['classification'] == label]
        for label in ('Labor', 'Inflation', 'Both', 'Neither')
    )

    # (name used in the warning, candidate rows, requested sample size)
    strata = (
        ('labor', labor_sentences, n_labor),
        ('inflation', inflation_sentences, n_inflation),
        ('both', both_sentences, n_both),
        ('neither', neither_sentences, n_neither),
    )
    for stratum_name, pool, quota in strata:
        if len(pool) >= quota:
            chosen = pool.sample(n=quota, random_state=seed)
        else:
            # Too few rows to sample from: warn, then keep whatever exists.
            print(f"Warning: Only {len(pool)} {stratum_name} sentences available")
            if len(pool) == 0:
                continue
            chosen = pool
        validation_samples.append(chosen)

    if not validation_samples:
        print("\nNo validation samples available")
    else:
        # Combine the strata, then shuffle so classes are interleaved.
        combined = pd.concat(validation_samples, ignore_index=True)
        validation_df = combined.sample(frac=1, random_state=seed).reset_index(drop=True)

        validation_output_file = os.path.join(validation_output_dir, 'speeches_validate.csv')
        validation_df.to_csv(validation_output_file, index=False)

        print(f"\nValidation set created: {validation_output_file}")
        print(f"Total sentences in validation set: {len(validation_df)}")
        print("\nValidation set distribution:")
        print(validation_df['classification'].value_counts())

        rule = "=" * 70
        print("\n" + rule)
        print("SAMPLE VALIDATION SENTENCES (10 examples)")
        print(rule)

        # Optional fields are printed only when their value is truthy.
        optional_fields = (
            ('official_name', 'Official'),
            ('labor_indicators', 'Labor Indicators'),
            ('inflation_indicators', 'Inflation Indicators'),
        )
        sample_display = validation_df.head(10)
        for idx, row in sample_display.iterrows():
            print(f"\n[{idx+1}] Classification: {row['classification']}")
            for column, caption in optional_fields:
                if row[column]:
                    print(f"    {caption}: {row[column]}")
            # Long sentences are cut to 200 characters and marked with an ellipsis.
            sentence = row['sentence_text']
            preview = sentence[:200]
            ellipsis = '...' if len(sentence) > 200 else ''
            print(f"    Sentence: {preview}{ellipsis}")

print("\n" + "=" * 70)
print("PROCESSING COMPLETE!")
print("=" * 70)

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/FedComs/Speeches

Cleaning speech files...
Reading from: /content/drive/MyDrive/FedComs/Speeches/fed_speeches
Found 25 speech files
Processing susan_bies_speeches.csv...
Processing frederic_mishkin_speeches.csv...
Processing ben_bernanke_speeches.csv...
Processing donald_kohn_speeches.csv...
Processing mark_olson_speeches.csv...
Processing roger_jr_speeches.csv...
Processing kevin_warsh_speeches.csv...
Processing randall_kroszner_speeches.csv...
Processing sarah_raskin_speeches.csv...
Processing janet_yellen_speeches.csv...
Processing elizabeth_duke_speeches.csv...
Processing daniel_tarullo_speeches.csv...
Processing jeremy_stein_speeches.csv...
Processing jerome_powell_speeches.csv...
Processing stanley_fischer_speeches.csv...
Processing lael_brainard_speeches.csv...
Processing randal_quarles_speeches.csv...
Processing richard_clarida_speeches.csv...
Processing michelle_bowman_speeches.csv...
Processing christ