In [None]:
import os

from google.colab import drive

# Mount Google Drive; force_remount=True re-mounts even if a mount already exists.
drive.mount("/content/drive", force_remount=True)

# Project locations on the mounted Drive.
dict_dir = '/content/drive/MyDrive/FedComs/Dictionaries'
input_file = '/content/drive/MyDrive/FedComs/Statements/fomc_statements.csv'
# NOTE(review): this path is under FedComs/Statements, whereas the cleaning step
# below saves its CSV under output_dir -- confirm which location is intended.
cleaned_file = '/content/drive/MyDrive/FedComs/Statements/fomc_statements_cleaned.csv'
output_dir = '/content/drive/MyDrive/Statements'
validation_dir = '/content/drive/MyDrive/FedComs/Validation_Sets'

# Create the output directory if it doesn't exist. exist_ok=True makes this a
# single call with no check-then-create gap.
os.makedirs(output_dir, exist_ok=True)

# Work from the output directory so relative paths resolve there.
os.chdir(output_dir)

# Verify the current working directory
print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Statements


In [None]:
#@title Step 1: Clean
import pandas as pd
import re
import os

# Where the raw statements live and where cleaned output is written
input_file = '/content/drive/MyDrive/FedComs/Statements/fomc_statements.csv'
output_dir = '/content/drive/MyDrive/Statements'

# Load the raw statements table
print("Reading FOMC statements...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} statements")

# Exclude the row whose id is statement_20081216; .copy() detaches the result
# from the original frame so later column assignment is unambiguous.
print("\nDeleting statement_20081216...")
df = df.loc[df['id'].ne('statement_20081216')].copy()
print(f"Statements remaining: {len(df)}")

# Parse the date column into datetimes so it can be compared against cut-off dates
df['date'] = pd.to_datetime(df['date'])

def clean_text_old(text, date):
    """Strip page boilerplate from a statement dated 2005-12-13 or earlier.

    Args:
        text: Raw statement text.
        date: Object exposing a ``year`` attribute (the statement date).

    Returns:
        The text with (a) everything up to and including the first
        case-insensitive "For immediate release" removed, and (b) everything
        from the first case-insensitive "<year> Monetary policy" onward
        removed, with surrounding whitespace stripped. A marker that is not
        found leaves that side of the text untouched.
    """
    # Header: keep only what follows the release banner, if present.
    header_hit = re.search(r'For immediate release', text, re.IGNORECASE)
    body = text if header_hit is None else text[header_hit.end():]

    # Footer: the navigation trailer starts with the statement's own year.
    footer_hit = re.search(rf'{date.year}\s+Monetary policy', body, re.IGNORECASE)
    if footer_hit is not None:
        body = body[:footer_hit.start()]

    return body.strip()

def clean_text_middle(text):
    """Strip page boilerplate from a statement dated 2006-01-31 to 2020-03-05.

    Drops everything up to and including the first case-insensitive
    occurrence of "share", then drops everything from the first
    case-insensitive "Last Update" onward. Missing markers leave the
    corresponding side untouched. The result is whitespace-stripped.
    """
    share_hit = re.search(r'share', text, re.IGNORECASE)
    body = text if share_hit is None else text[share_hit.end():]

    footer_hit = re.search(r'Last Update', body, re.IGNORECASE)
    if footer_hit is not None:
        body = body[:footer_hit.start()]

    return body.strip()

def clean_text_recent(text):
    """Strip page boilerplate from a statement dated after 2020-03-05.

    Leading boilerplate: if "For release" occurs, the cut point is the end of
    the first "share" found after it (no cut when no such "share" exists);
    otherwise the cut point is the end of the first "share" anywhere.
    Trailing boilerplate: the text is truncated at "Last Update" and then at
    "For media inquiries". All searches are case-insensitive and the result
    is whitespace-stripped.
    """
    # Start looking for "share" right after the release banner when there is
    # one, or from the very beginning when there is not.
    release_hit = re.search(r'For release', text, re.IGNORECASE)
    search_from = release_hit.end() if release_hit else 0

    share_hit = re.search(r'share', text[search_from:], re.IGNORECASE)
    if share_hit:
        # share_hit offsets are relative to the slice, so shift them back.
        text = text[search_from + share_hit.end():]

    # Truncate at each trailer marker in turn (order matters).
    for trailer in (r'Last Update', r'For media inquiries'):
        trailer_hit = re.search(trailer, text, re.IGNORECASE)
        if trailer_hit:
            text = text[:trailer_hit.start()]

    return text.strip()

# Apply cleaning based on date
print("\nCleaning statements...")
cleaned_texts = []

# Progress is counted by position (enumerate) rather than by the DataFrame
# index label: a row was filtered out earlier in this cell without resetting
# the index, so the labels no longer run 0..len(df)-1 and would misreport
# how far through the frame the loop is.
for position, (date, text) in enumerate(zip(df['date'], df['text'])):
    if position % 20 == 0:
        print(f"Processing statement {position+1}/{len(df)}...")

    # Apply appropriate cleaning function based on date
    if date <= pd.Timestamp('2005-12-13'):
        cleaned_text = clean_text_old(text, date)
    elif date <= pd.Timestamp('2020-03-05'):
        cleaned_text = clean_text_middle(text)
    else:
        cleaned_text = clean_text_recent(text)

    cleaned_texts.append(cleaned_text)

# Create new dataframe with cleaned text (the list is in row order, so plain
# assignment lines up with the copied frame).
df_cleaned = df.copy()
df_cleaned['text'] = cleaned_texts

# Convert date back to string format to match original
df_cleaned['date'] = df_cleaned['date'].dt.strftime('%Y-%m-%d')

# Save cleaned statements
output_file = os.path.join(output_dir, 'fomc_statements_cleaned.csv')
df_cleaned.to_csv(output_file, index=False)

print(f"\nCleaned statements saved to: {output_file}")
print(f"Total statements: {len(df_cleaned)}")

# Display some statistics about the cleaning
print("\n" + "="*70)
print("CLEANING STATISTICS")
print("="*70)

# Calculate average text length before and after
original_lengths = df['text'].str.len()
cleaned_lengths = df_cleaned['text'].str.len()

print(f"\nAverage original text length: {original_lengths.mean():.0f} characters")
print(f"Average cleaned text length: {cleaned_lengths.mean():.0f} characters")
print(f"Average reduction: {(original_lengths.mean() - cleaned_lengths.mean()):.0f} characters ({((1 - cleaned_lengths.mean()/original_lengths.mean())*100):.1f}%)")

# Show examples from each period. Dates are now 'YYYY-MM-DD' strings, so the
# string comparisons below order the same way the dates do.
print("\n" + "="*70)
print("EXAMPLES OF CLEANED TEXT")
print("="*70)

# Example from old period (2005 and prior): last matching row in frame order
old_example = df_cleaned[df_cleaned['date'] <= '2005-12-13'].iloc[-1]
print(f"\nOLD PERIOD (2005 and prior)")
print(f"Date: {old_example['date']}")
print(f"First 200 characters: {old_example['text'][:200]}...")

# Example from middle period (2006-2020-03-05): first matching row in frame order
middle_example = df_cleaned[(df_cleaned['date'] > '2005-12-13') & (df_cleaned['date'] <= '2020-03-05')].iloc[0]
print(f"\nMIDDLE PERIOD (2006 to March 2020)")
print(f"Date: {middle_example['date']}")
print(f"First 200 characters: {middle_example['text'][:200]}...")

# Example from recent period (after 2020-03-05): first matching row in frame order
recent_example = df_cleaned[df_cleaned['date'] > '2020-03-05'].iloc[0]
print(f"\nRECENT PERIOD (after March 2020)")
print(f"Date: {recent_example['date']}")
print(f"First 200 characters: {recent_example['text'][:200]}...")

print("\n" + "="*70)
print("Cleaning complete!")

Reading FOMC statements...
Loaded 199 statements

Deleting statement_20081216...
Statements remaining: 198

Cleaning statements...
Processing statement 1/198...
Processing statement 21/198...
Processing statement 41/198...
Processing statement 61/198...
Processing statement 81/198...
Processing statement 101/198...
Processing statement 121/198...
Processing statement 141/198...
Processing statement 161/198...
Processing statement 181/198...

Cleaned statements saved to: /content/drive/MyDrive/Statements/fomc_statements_cleaned.csv
Total statements: 198

CLEANING STATISTICS

Average original text length: 4247 characters
Average cleaned text length: 2621 characters
Average reduction: 1627 characters (38.3%)

EXAMPLES OF CLEANED TEXT

OLD PERIOD (2005 and prior)
Date: 2005-12-13
First 200 characters: The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/4 percent.
Despite elevated energy prices and hurricane-related disrup

In [None]:
import pandas as pd
import numpy as np
import re
import json
import random
import os
from collections import defaultdict

import time
# Seed the RNGs from the clock so each run draws a different validation
# sample, and report the value so a given sample can be reproduced later by
# hard-coding it here.
seed = int(time.time())  # Use current timestamp as seed
random.seed(seed)
np.random.seed(seed)
print(f"Random seed: {seed}")

# Define paths
dict_dir = '/content/drive/MyDrive/FedComs/Dictionaries'
input_file = '/content/drive/MyDrive/Statements/fomc_statements_cleaned.csv'
output_dir = '/content/drive/MyDrive/Statements'
validation_dir = '/content/drive/MyDrive/FedComs/Validation_Sets'

# Ensure both destination directories exist: the summary CSV is written to
# output_dir and the validation CSV to validation_dir later in this cell.
for directory in (output_dir, validation_dir):
    os.makedirs(directory, exist_ok=True)

# Load dictionaries
print("\nLoading dictionaries...")
# Maps each labor indicator name to its list of keywords.
with open(os.path.join(dict_dir, 'labor_indicators.json'), 'r') as f:
    LABOR_INDICATORS = json.load(f)

# Maps each inflation category to {pattern name: list of regex patterns}.
with open(os.path.join(dict_dir, 'inflation_indicators.json'), 'r') as f:
    INFLATION_INDICATORS = json.load(f)

# Maps a pattern name to the indicator label it is reported under.
with open(os.path.join(dict_dir, 'inflation_pattern_mapping.json'), 'r') as f:
    INFLATION_PATTERN_TO_INDICATOR = json.load(f)

print("Dictionaries loaded successfully!")
print(f"Labor indicators: {list(LABOR_INDICATORS.keys())}")
print(f"Inflation categories: {list(INFLATION_INDICATORS.keys())}")

# Analysis functions
def split_into_sentences(text):
    """Split text into sentences, preserving initials and abbreviations.

    Sentences are split on runs of '.', '!' or '?' followed by whitespace;
    that terminator is dropped from each sentence (a terminator at the very
    end of the text, with nothing after it, is kept). Periods inside known
    abbreviations and name initials do not split, and each
    "Voting for ..." / "Voting against ..." clause is kept as one sentence.

    Args:
        text: Statement text. A non-string value (e.g. the NaN pandas yields
            for an empty CSV cell) is treated as having no sentences.

    Returns:
        List of non-empty sentence strings in document order.
    """
    # Empty cells come back from read_csv as float NaN; re would raise on them.
    if not isinstance(text, str):
        return []

    # Step 1: Protect known abbreviations
    abbreviations = [
        r'\bSt\.', r'\bU\.S\.', r'\bMr\.', r'\bMrs\.', r'\bMs\.', r'\bDr\.',
        r'\bProf\.', r'\bSr\.', r'\bJr\.', r'\bvs\.', r'\betc\.', r'\bi\.e\.', r'\be\.g\.'
    ]
    for abbr in abbreviations:
        text = re.sub(abbr, lambda m: m.group(0).replace('.', '<ABBR_PERIOD>'), text)

    # Step 2: Protect initials in names (e.g., "Jerome H. Powell")
    text = re.sub(
        r'\b([A-Z]\.)(\s+[A-Z]\.)?(?:\s+[A-Z][a-z]+)+',
        lambda m: m.group(0).replace('.', '<NAME_PERIOD>'),
        text
    )

    # Step 3: Protect lists of officials (e.g., "Voting for ... Jerome H. Powell, ...")
    voting_pattern = r'((?:Voting for|Voting against)\s+[^.!?]+?)([.!?]+\s+|$)'
    voting_matches = []
    def store_voting_match(match):
        voting_matches.append(match.group(1))
        # Re-emit the terminator (group 2) after the placeholder. Dropping it
        # would glue this clause to the following sentence, because Step 4
        # would no longer see a boundary between them.
        return '<VOTING_LIST_{}>'.format(len(voting_matches) - 1) + match.group(2)
    text = re.sub(voting_pattern, store_voting_match, text)

    # Step 4: Split into sentences
    sentences = re.split(r'[.!?]+\s+', text)
    sentences = [s.strip() for s in sentences if s.strip()]

    # Step 5: Restore voting lists first, then protected periods. The stored
    # voting clauses were captured after Steps 1-2, so they still contain
    # period placeholders that must be expanded once they are back in place.
    restored_sentences = []
    for sentence in sentences:
        for i, voting_list in enumerate(voting_matches):
            placeholder = f'<VOTING_LIST_{i}>'
            if placeholder in sentence:
                sentence = sentence.replace(placeholder, voting_list)
        sentence = sentence.replace('<ABBR_PERIOD>', '.')
        sentence = sentence.replace('<NAME_PERIOD>', '.')
        restored_sentences.append(sentence)

    return restored_sentences

def check_keywords_in_sentence(sentence, keywords):
    """Report whether at least one keyword occurs in the sentence.

    Matching is case-insensitive (both sides are lower-cased) and each keyword
    is escaped and wrapped in word boundaries, so it must appear as a whole
    word or phrase rather than inside a longer word.

    Args:
        sentence: Sentence text to search.
        keywords: Iterable of literal keyword strings.

    Returns:
        True if any keyword matches, otherwise False.
    """
    haystack = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack) is not None
        for term in keywords
    )

def check_employment_indicator(sentence, keywords):
    """Keyword check for the Employment indicator with mandate phrases excluded.

    A sentence containing "maximum employment", "full employment" or
    "employment goal" (case-insensitive, whole words) never counts, whatever
    else it contains. Otherwise the result is True when any keyword appears
    as a whole word or phrase, case-insensitively.

    Args:
        sentence: Sentence text to search.
        keywords: Iterable of literal keyword strings.

    Returns:
        True if the sentence counts as an Employment mention, else False.
    """
    lowered = sentence.lower()

    # Phrases that veto the indicator outright.
    vetoes = (
        r'\b(?:maximum|full)\s+employment\b',
        r'\bemployment\s+goal\b',
    )
    if any(re.search(veto, lowered) for veto in vetoes):
        return False

    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in keywords
    )

def check_general_labor_term(sentence):
    """Report whether the sentence contains a general labor term.

    The terms are the keywords listed under "General Labor" in
    LABOR_INDICATORS (none if that entry is absent). Each is matched
    case-insensitively as a whole word or phrase.

    Returns:
        True if any general labor keyword matches, otherwise False.
    """
    lowered = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in LABOR_INDICATORS.get("General Labor", [])
    )

def check_general_inflation_terms(sentence):
    """Report whether the sentence matches a general inflation pattern.

    The patterns are the regexes stored at
    INFLATION_INDICATORS["General Inflation"]["general_patterns"] (an empty
    list if either level is absent). They are searched case-insensitively
    against the lower-cased sentence.

    Returns:
        True if any pattern matches, otherwise False.
    """
    lowered = sentence.lower()
    general_group = INFLATION_INDICATORS.get("General Inflation", {})
    return any(
        re.search(regex, lowered, re.IGNORECASE)
        for regex in general_group.get("general_patterns", [])
    )

def check_inflation_sentence(sentence):
    """Collect the inflation indicators a sentence mentions.

    Every pattern group in INFLATION_INDICATORS is tried against the
    lower-cased sentence (case-insensitively). A group with at least one
    matching pattern contributes the label that INFLATION_PATTERN_TO_INDICATOR
    assigns to the group's name, or "Other" when the name is unmapped.

    A generic "Core" label is dropped when "Core_CPI" or "Core_PCE" is also
    present, and likewise "Headline" when "Headline_CPI" or "Headline_PCE"
    is present.

    Returns:
        Set of indicator labels (possibly empty).
    """
    lowered = sentence.lower()
    found = set()

    for subcategories in INFLATION_INDICATORS.values():
        for pattern_name, pattern_list in subcategories.items():
            # One hit is enough for the group; any() stops at the first match.
            if any(re.search(regex, lowered, re.IGNORECASE) for regex in pattern_list):
                found.add(INFLATION_PATTERN_TO_INDICATOR.get(pattern_name, "Other"))

    # Prefer the specific measure over its generic family label.
    refinements = (
        ("Core", ("Core_CPI", "Core_PCE")),
        ("Headline", ("Headline_CPI", "Headline_PCE")),
    )
    for generic, specifics in refinements:
        if generic in found and any(specific in found for specific in specifics):
            found.discard(generic)

    return found

def classify_sentence(sentence):
    """Label one sentence as Labor, Inflation, Both or Neither.

    Labor is detected when any specific labor indicator matches (every key of
    LABOR_INDICATORS except "General Labor"; "Employment" goes through
    check_employment_indicator, the rest through check_keywords_in_sentence)
    or when check_general_labor_term matches. Inflation is detected when
    check_inflation_sentence returns any label or
    check_general_inflation_terms matches.

    Returns:
        Dict with 'classification' (one of "Both", "Labor", "Inflation",
        "Neither"), 'labor_indicators' (list of matched specific labor
        indicator names) and 'inflation_indicators' (list of labels returned
        by check_inflation_sentence).
    """
    # Specific labor indicators; the general bucket is handled separately so
    # it never appears in the per-indicator list.
    labor_hits = set()
    for indicator, keywords in LABOR_INDICATORS.items():
        if indicator == "General Labor":
            continue
        matcher = (
            check_employment_indicator
            if indicator == "Employment"
            else check_keywords_in_sentence
        )
        if matcher(sentence, keywords):
            labor_hits.add(indicator)

    general_labor_hit = check_general_labor_term(sentence)
    has_labor = bool(labor_hits) or general_labor_hit

    inflation_hits = check_inflation_sentence(sentence)
    general_inflation_hit = check_general_inflation_terms(sentence)
    has_inflation = bool(inflation_hits) or general_inflation_hit

    label_by_flags = {
        (True, True): "Both",
        (True, False): "Labor",
        (False, True): "Inflation",
        (False, False): "Neither",
    }

    return {
        'classification': label_by_flags[(has_labor, has_inflation)],
        'labor_indicators': list(labor_hits),
        'inflation_indicators': list(inflation_hits)
    }

def analyze_statement(text):
    """Analyze a single statement for labor and inflation content.

    The text is split with split_into_sentences and each sentence is labelled
    with classify_sentence. Sentence totals are derived from that label, so
    the counters and the per-sentence 'classification' column cannot disagree
    and the general-term regexes are not evaluated a second time.

    Args:
        text: Statement text.

    Returns:
        Tuple ``(summary_results, sentence_data_list)``.

        summary_results is a dict holding, in this key order:
          - 'sentences_on_labor', 'sentences_on_inflation',
            'sentences_on_both', 'total_sentences' ("Both" sentences are
            included in the labor and inflation totals as well);
          - 'labor_share_of_labor_inflation_sentences': labor sentences over
            sentences that are labor and/or inflation (0 when there are none);
          - 'labor_<indicator>_count' / 'inflation_<indicator>_count':
            number of sentences mentioning each specific indicator;
          - 'labor_emphasis_<indicator>' / 'inflation_emphasis_<indicator>':
            that count over the sum of all such counts on the same side
            (0 when the sum is 0);
          - 'labor_share_total_sentences_<indicator>' /
            'inflation_share_total_sentences_<indicator>': that count over
            the total number of sentences (0 when there are none).
        "General Labor", "General_Inflation" and "Other" are excluded from
        the per-indicator entries.

        sentence_data_list has one dict per sentence with 'sentence_number'
        (1-based), 'sentence_text', 'classification', 'labor_indicators' and
        'inflation_indicators' (the last two are sorted, comma-separated
        strings, empty when nothing specific matched).
    """
    sentences = split_into_sentences(text)
    total_sentences = len(sentences)

    labor_sentences = 0
    inflation_sentences = 0
    both_sentences = 0

    # Only create indicator counts for non-general categories
    labor_indicator_counts = {
        indicator: 0 for indicator in LABOR_INDICATORS if indicator != "General Labor"
    }
    excluded_inflation = {"General_Inflation", "Other"}
    inflation_indicator_counts = {
        indicator: 0
        for indicator in sorted(set(INFLATION_PATTERN_TO_INDICATOR.values()) - excluded_inflation)
    }

    sentence_data_list = []

    for sent_number, sentence in enumerate(sentences, start=1):
        result = classify_sentence(sentence)
        classification = result['classification']

        # Filter out general categories from the indicator lists
        labor_named = sorted(
            ind for ind in result['labor_indicators'] if ind != "General Labor"
        )
        inflation_named = sorted(
            ind for ind in result['inflation_indicators'] if ind not in excluded_inflation
        )

        sentence_data_list.append({
            'sentence_number': sent_number,
            'sentence_text': sentence,
            'classification': classification,
            'labor_indicators': ', '.join(labor_named),
            'inflation_indicators': ', '.join(inflation_named)
        })

        # The label already encodes whether labor and/or inflation content was
        # found (specific or general), so tally straight from it.
        if classification in ("Labor", "Both"):
            labor_sentences += 1
        if classification in ("Inflation", "Both"):
            inflation_sentences += 1
        if classification == "Both":
            both_sentences += 1

        # Only count specific indicators (not general terms or "Other") for emphasis vectors
        for indicator in result['labor_indicators']:
            if indicator in labor_indicator_counts:
                labor_indicator_counts[indicator] += 1

        for indicator in result['inflation_indicators']:
            if indicator in inflation_indicator_counts:
                inflation_indicator_counts[indicator] += 1

    total_labor_mentions = sum(labor_indicator_counts.values())
    total_inflation_mentions = sum(inflation_indicator_counts.values())

    def _ratio(count, denominator):
        # Guarded division: an empty denominator yields 0 rather than an error.
        return count / denominator if denominator > 0 else 0

    labor_emphasis = {
        f"labor_emphasis_{indicator}": _ratio(count, total_labor_mentions)
        for indicator, count in labor_indicator_counts.items()
    }
    inflation_emphasis = {
        f"inflation_emphasis_{indicator}": _ratio(count, total_inflation_mentions)
        for indicator, count in inflation_indicator_counts.items()
    }
    labor_sentence_share = {
        f"labor_share_total_sentences_{indicator}": _ratio(count, total_sentences)
        for indicator, count in labor_indicator_counts.items()
    }
    inflation_sentence_share = {
        f"inflation_share_total_sentences_{indicator}": _ratio(count, total_sentences)
        for indicator, count in inflation_indicator_counts.items()
    }

    # "Both" sentences were added to each side, so subtract them once to get
    # the number of distinct labor-and/or-inflation sentences.
    labor_inflation_total = labor_sentences + inflation_sentences - both_sentences
    labor_share_of_labor_inflation = _ratio(labor_sentences, labor_inflation_total)

    # Key order here becomes the column order of the summary CSV.
    summary_results = {
        'sentences_on_labor': labor_sentences,
        'sentences_on_inflation': inflation_sentences,
        'sentences_on_both': both_sentences,
        'total_sentences': total_sentences,
        'labor_share_of_labor_inflation_sentences': labor_share_of_labor_inflation
    }

    for indicator, count in labor_indicator_counts.items():
        summary_results[f'labor_{indicator}_count'] = count

    for indicator, count in inflation_indicator_counts.items():
        summary_results[f'inflation_{indicator}_count'] = count

    summary_results.update(labor_emphasis)
    summary_results.update(inflation_emphasis)
    summary_results.update(labor_sentence_share)
    summary_results.update(inflation_sentence_share)

    return summary_results, sentence_data_list

# Load the cleaned statements produced by the cleaning step
print("\nReading cleaned FOMC statements...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} statements")

# Run the analysis statement by statement, collecting one summary record per
# statement and one record per sentence.
print("\nAnalyzing statements...")
results_list = []
all_sentences = []

for idx, row in df.iterrows():
    if not idx % 10:
        print(f"Processing statement {idx+1}/{len(df)}...")

    summary_results, sentence_data_list = analyze_statement(row['text'])

    # Tag the summary and every sentence record with the statement they came from.
    results_list.append({**summary_results, 'date': row['date'], 'id': row['id']})
    all_sentences.extend(
        {**sentence_data, 'statement_date': row['date'], 'statement_id': row['id']}
        for sentence_data in sentence_data_list
    )

# Statement-level summary frame: id and date first, remaining columns in
# their original order, rows sorted by date.
results_df = pd.DataFrame(results_list)
leading = ['id', 'date']
cols = leading + [col for col in results_df.columns if col not in leading]
results_df = results_df[cols].sort_values('date')

# Write the summary dataset
summary_output_file = os.path.join(output_dir, 'statement_content.csv')
results_df.to_csv(summary_output_file, index=False)
print(f"\nSummary dataset saved to: {summary_output_file}")
print(f"Shape: {results_df.shape}")

# Sentence-level frame covering every sentence in the corpus
sentences_df = pd.DataFrame(all_sentences)
print(f"\nTotal sentences extracted: {len(sentences_df)}")

# Show how many sentences fell into each class
print("\nClassification distribution:")
print(sentences_df['classification'].value_counts())

# Target number of validation sentences per class
n_labor = 15
n_inflation = 15
n_both = 5
n_neither = 10

print(f"\nSampling sentences for validation...")
print(f"  - {n_labor} Labor sentences")
print(f"  - {n_inflation} Inflation sentences")
print(f"  - {n_both} Both sentences")
print(f"  - {n_neither} Neither sentences")


def _draw_for_validation(pool, quota, label):
    """Return `quota` rows sampled from `pool` using the run's seed.

    When the pool holds fewer than `quota` rows, a warning naming `label` is
    printed and the whole pool is returned instead.
    """
    if len(pool) < quota:
        print(f"Warning: Only {len(pool)} {label} sentences available")
        return pool
    return pool.sample(n=quota, random_state=seed)


# Per-class pools of sentences
labor_sentences = sentences_df[sentences_df['classification'] == 'Labor']
inflation_sentences = sentences_df[sentences_df['classification'] == 'Inflation']
both_sentences = sentences_df[sentences_df['classification'] == 'Both']
neither_sentences = sentences_df[sentences_df['classification'] == 'Neither']

# Draw from each pool in a fixed order: Labor, Inflation, Both, Neither
validation_samples = [
    _draw_for_validation(labor_sentences, n_labor, 'labor'),
    _draw_for_validation(inflation_sentences, n_inflation, 'inflation'),
    _draw_for_validation(both_sentences, n_both, 'both'),
    _draw_for_validation(neither_sentences, n_neither, 'neither'),
]

# Stack the draws, then shuffle the rows so classes are interleaved
validation_df = pd.concat(validation_samples, ignore_index=True)
validation_df = validation_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Write the validation dataset
validation_output_file = os.path.join(validation_dir, 'statement_validate.csv')
validation_df.to_csv(validation_output_file, index=False)

print(f"\nValidation set created: {validation_output_file}")
print(f"Total sentences in validation set: {len(validation_df)}")
print(f"\nValidation set distribution:")
print(validation_df['classification'].value_counts())

# Restrict the statement-level summary to 2010 onward for reporting.
# The date column is parsed in place so it can be compared and formatted.
results_df['date'] = pd.to_datetime(results_df['date'])
df_2010_plus = results_df[results_df['date'] >= '2010-01-01'].copy()

print("\n" + "="*70)
print("SUMMARY STATISTICS (2010-CURRENT)")
print("="*70)
print(f"\nNumber of statements: {len(df_2010_plus)}")
print(f"Date range: {df_2010_plus['date'].min().strftime('%Y-%m-%d')} to {df_2010_plus['date'].max().strftime('%Y-%m-%d')}")
print(f"\nAverage sentences per statement: {df_2010_plus['total_sentences'].mean():.1f}")
print(f"Average labor sentences: {df_2010_plus['sentences_on_labor'].mean():.1f}")
print(f"Average inflation sentences: {df_2010_plus['sentences_on_inflation'].mean():.1f}")
print(f"Average sentences on both: {df_2010_plus['sentences_on_both'].mean():.1f}")
print(f"Average labor share of labor/inflation: {df_2010_plus['labor_share_of_labor_inflation_sentences'].mean():.2%}")


def _print_emphasis_table(prefix, heading, caption):
    """Print the mean of every `prefix` column over df_2010_plus, plus their sum.

    Returns the matching column names and the sum of their means.
    """
    emphasis_cols = [col for col in results_df.columns if col.startswith(prefix)]
    print("\n" + "-"*70)
    print(heading)
    print("-"*70)
    print(caption)
    for col in sorted(emphasis_cols):
        indicator_name = col.replace(prefix, '')
        avg_emphasis = df_2010_plus[col].mean()
        print(f"{indicator_name:20s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")

    total_emphasis = df_2010_plus[emphasis_cols].mean().sum()
    print(f"\n{'Total':20s}: {total_emphasis:.4f} (should be ~1.00)")
    return emphasis_cols, total_emphasis


labor_emphasis_cols, total_labor_emphasis = _print_emphasis_table(
    'labor_emphasis_',
    "AVERAGE LABOR EMPHASIS VECTORS (2010-CURRENT)",
    "(Share of specific labor indicator mentions for each indicator)",
)

inflation_emphasis_cols, total_inflation_emphasis = _print_emphasis_table(
    'inflation_emphasis_',
    "AVERAGE INFLATION EMPHASIS VECTORS (2010-CURRENT)",
    "(Share of specific inflation indicator mentions for each indicator)",
)

print("\n" + "="*70)
print("VALIDATION SET SUMMARY")
print("="*70)


def _validation_count(label):
    """Number of validation rows whose classification equals `label`."""
    return len(validation_df[validation_df['classification'] == label])


summary_stats = {
    'total_sentences_in_corpus': len(sentences_df),
    'validation_set_size': len(validation_df),
    'labor_only_count': _validation_count('Labor'),
    'inflation_only_count': _validation_count('Inflation'),
    'both_count': _validation_count('Both'),
    'neither_count': _validation_count('Neither')
}
for key, value in summary_stats.items():
    print(f"{key}: {value}")

print("\n" + "="*70)
print("SAMPLE OF VALIDATION SET (First 10 rows)")
print("="*70)
print(validation_df.head(10).to_string(index=True, max_colwidth=100))


Loading dictionaries...
Dictionaries loaded successfully!
Labor indicators: ['General Labor', 'Employment', 'Unemployment', 'Participation', 'Wages', 'Vacancies', 'Quits', 'Layoffs', 'Hiring']
Inflation categories: ['General Inflation', 'Core Measures', 'Headline Measures', 'Sectoral Measures', 'Producer Price Index', 'Wage Inflation', 'Inflation Expectations', 'Commodity Prices']

Reading cleaned FOMC statements...
Loaded 198 statements

Analyzing statements...
Processing statement 1/198...
Processing statement 11/198...
Processing statement 21/198...
Processing statement 31/198...
Processing statement 41/198...
Processing statement 51/198...
Processing statement 61/198...
Processing statement 71/198...
Processing statement 81/198...
Processing statement 91/198...
Processing statement 101/198...
Processing statement 111/198...
Processing statement 121/198...
Processing statement 131/198...
Processing statement 141/198...
Processing statement 151/198...
Processing statement 161/198...