In [None]:
import os

from google.colab import drive

# Mount Google Drive so the project folders below are reachable from the runtime.
drive.mount("/content/drive", force_remount=True)

# Project locations on Drive. All of them live under the FedComs folder.
dict_dir = '/content/drive/MyDrive/FedComs/Dictionaries'
input_file = '/content/drive/MyDrive/FedComs/Minutes/fomc_minutes.csv'
cleaned_file = '/content/drive/MyDrive/FedComs/Minutes/fomc_minutes_cleaned.csv'
# Previously '/content/drive/MyDrive/Minutes', which is outside the FedComs tree used
# by every other path here and created a stray folder at the Drive root.
output_dir = '/content/drive/MyDrive/FedComs/Minutes'
validation_dir = '/content/drive/MyDrive/FedComs/Validation_Sets'

# Create the directory if it doesn't exist (exist_ok makes this safe to re-run).
os.makedirs(output_dir, exist_ok=True)

os.chdir(output_dir)

# Verify the current working directory
print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Minutes


In [None]:
#@title Additional Clean
import pandas as pd
import re

# Destination folder for the cleaned file and location of the raw minutes CSV.
output_dir = '/content/drive/MyDrive/FedComs/Minutes'
input_file = '/content/drive/MyDrive/FedComs/Minutes/fomc_minutes.csv'

print("Reading FOMC minutes...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} minutes")

# Parse the date column so rows can be compared against pd.Timestamp cut-offs.
df = df.assign(date=pd.to_datetime(df['date']))

# Mis-decoded UTF-8 punctuation ("mojibake") paired with the text it should become.
# Every sequence is spelled with escapes so that invisible bytes cannot be silently
# dropped by an editor or export, which would leave a bare 'â' that matches real words.
# NOTE(review): the sequences are reconstructed from the standard UTF-8-read-as-
# Windows-1252 / Latin-1 patterns; confirm against the raw scraped file.
_MOJIBAKE_REPLACEMENTS = (
    # UTF-8 bytes decoded as Windows-1252
    ('\u00e2\u20ac\u201d', '\u2014'),  # em dash
    ('\u00e2\u20ac\u201c', '\u2013'),  # en dash
    ('\u00e2\u20ac\u0153', '"'),       # left double quote
    ('\u00e2\u20ac\x9d', '"'),         # right double quote (0x9D has no cp1252 glyph)
    ('\u00e2\u20ac\u02dc', "'"),       # left single quote
    ('\u00e2\u20ac\u2122', "'"),       # right single quote / apostrophe
    ('\u00e2\u20ac\u00a6', '...'),     # ellipsis
    # UTF-8 bytes decoded as Latin-1 (trailing bytes become C1 control characters)
    ('\u00e2\x80\x94', '\u2014'),      # em dash
    ('\u00e2\x80\x93', '\u2013'),      # en dash
    ('\u00e2\x80\x9c', '"'),           # left double quote
    ('\u00e2\x80\x9d', '"'),           # right double quote
    ('\u00e2\x80\x98', "'"),           # left single quote
    ('\u00e2\x80\x99', "'"),           # right single quote / apostrophe
    ('\u00e2\x80\xa6', '...'),         # ellipsis
)


def fix_text_encoding(text):
    """Fix common text encoding issues from web scraping.

    Steps, in order:
      1. Replace known mojibake sequences with the intended punctuation.
      2. Normalise curly quotes to straight quotes and the ellipsis character to '...'.
         En and em dashes are left as they are.
      3. Strip ASCII/C1 control characters, keeping tab, newline and carriage return.

    Args:
        text: The text to repair. ``None`` and NaN (which pandas uses for empty CSV
            fields) are treated as empty text; other non-string values are converted
            with ``str()``.

    Returns:
        The repaired string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Mojibake must be repaired first: the sequences contain curly quotes and control
    # characters that the later steps would otherwise rewrite or delete.
    for broken, intended in _MOJIBAKE_REPLACEMENTS:
        text = text.replace(broken, intended)

    text = text.replace('\u2018', "'")
    text = text.replace('\u2019', "'")
    text = text.replace('\u201c', '"')
    text = text.replace('\u201d', '"')
    text = text.replace('\u2026', '...')
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    return text

def is_in_footnote(text, match_position):
    """Heuristically decide whether ``match_position`` lies inside a footnote.

    Looks at up to 500 characters before the position and 200 characters from the
    position onward. Returns True when any of these holds:
      * "Return to text" occurs in the preceding window but not in the following one;
      * a bracketed number such as "[3]" occurs in the preceding window;
      * the last 100 preceding characters contain digits, a period, whitespace and
        an uppercase letter (e.g. "3. The").
    Otherwise returns False.
    """
    window_start = max(0, match_position - 500)
    preceding = text[window_start:match_position]
    following = text[match_position:match_position + 200]

    return_link_before = re.search(r'Return to text', preceding, re.IGNORECASE) is not None
    return_link_after = re.search(r'Return to text', following, re.IGNORECASE) is not None
    if return_link_before and not return_link_after:
        return True

    bracketed_reference = re.search(r'\[\d+\]', preceding) is not None
    numbered_item_nearby = re.search(r'\d+\.\s+[A-Z]', preceding[-100:]) is not None
    return bracketed_reference or numbered_item_nearby

# Title-style tokens whose trailing period does not end a sentence.
_TITLE_ABBREVIATIONS = frozenset({'Mr', 'Mrs', 'Ms', 'Messrs', 'Dr', 'St'})


def _find_sentence_end(text, start):
    """Return the index of the first sentence-ending period at or after ``start``.

    A period is skipped when it is followed directly by a non-space character (as
    inside "U.S" or "3.5"), when the token before it is a single letter (the closing
    period of "U.S." or "a.m.", or a name initial), or when that token is a title such
    as "Mr". If no period qualifies, the first period found is returned; -1 means the
    text has no period after ``start`` at all.
    """
    first_period = text.find('.', start)
    pos = first_period
    while pos != -1:
        next_char = text[pos + 1:pos + 2]
        at_break = next_char == '' or next_char.isspace()
        token_match = re.search(r'[A-Za-z]+$', text[max(0, pos - 8):pos])
        token = token_match.group(0) if token_match else ''
        if at_break and len(token) != 1 and token not in _TITLE_ABBREVIATIONS:
            return pos
        pos = text.find('.', pos + 1)
    return first_period


def _drop_through_sentence_end(text, start):
    """Remove everything up to and including the sentence end found after ``start``."""
    sentence_end = _find_sentence_end(text, start)
    if sentence_end == -1:
        return text
    return text[sentence_end + 1:].strip()


def clean_text_minutes(text, date):
    """Clean text for FOMC minutes based on date.

    Front matter is removed with a date-dependent rule, then everything from the
    sentence containing "meeting adjourned" onward is removed.

    * 2021-11-03: keep the text starting at the sentence that contains
      "The manager turned first to a discussion".
    * Before 2019-01-01: drop everything through the end of the sentence that
      contains the first "Manager of the System Open".
    * Otherwise: drop everything through the end of the first sentence after the
      heading "Developments in Financial Markets and Open Market Operations",
      preferring an occurrence that ``is_in_footnote`` does not flag.

    If a marker is not found the corresponding cut is skipped.

    Args:
        text: Raw minutes text. Non-string values (e.g. NaN for an empty CSV field)
            yield an empty string.
        date: ``pd.Timestamp`` of the minutes.

    Returns:
        The cleaned, stripped text.
    """
    if not isinstance(text, str):
        return ''

    text = fix_text_encoding(text)

    # Marker phrases use \s+ between words so that a line break or non-breaking space
    # inside the phrase does not prevent a match.
    # NOTE(review): assumed cause of documents that kept their header - confirm on data.
    if date == pd.Timestamp('2021-11-03'):
        start_pattern = r'The\s+manager\s+turned\s+first\s+to\s+a\s+discussion'
        match = re.search(start_pattern, text, re.IGNORECASE)
        if match:
            # rfind returns -1 when there is no earlier period, so +1 yields 0.
            sentence_start = text.rfind('.', 0, match.start()) + 1
            text = text[sentence_start:].strip()
    elif date < pd.Timestamp('2019-01-01'):
        match = re.search(r'Manager\s+of\s+the\s+System\s+Open', text, re.IGNORECASE)
        if match:
            text = _drop_through_sentence_end(text, match.end())
    else:
        pattern = r'Developments\s+in\s+Financial\s+Markets\s+and\s+Open\s+Market\s+Operations'
        matches = list(re.finditer(pattern, text, re.IGNORECASE))
        if matches:
            # Prefer the first occurrence outside a footnote; fall back to the first.
            selected_match = next(
                (m for m in matches if not is_in_footnote(text, m.start())),
                matches[0],
            )
            # Cutting at the first '.' used to stop inside abbreviations such as
            # "U.S.", leaving text that began with "S. and global ...".
            text = _drop_through_sentence_end(text, selected_match.end())

    match = re.search(r'meeting\s+adjourned', text, re.IGNORECASE)
    if match:
        # Keep the period that closes the last retained sentence (+1). With no
        # earlier period rfind gives -1 and nothing is kept, as before.
        previous_sentence_end = text.rfind('.', 0, match.start())
        text = text[:previous_sentence_end + 1].strip()

    return text.strip()

print("\nCleaning minutes...")
cleaned_texts = []

# Walk the date and text columns in step; report progress on every 20th document.
for idx, (date, text) in enumerate(zip(df['date'], df['text'])):
    if idx % 20 == 0:
        print(f"Processing minutes {idx+1}/{len(df)}...")
    cleaned_texts.append(clean_text_minutes(text, date))

# Copy of the input frame with cleaned text and dates rendered as YYYY-MM-DD strings.
df_cleaned = df.assign(
    text=cleaned_texts,
    date=df['date'].dt.strftime('%Y-%m-%d'),
)

output_file = os.path.join(output_dir, 'fomc_minutes_cleaned.csv')
df_cleaned.to_csv(output_file, index=False)

print(f"\nCleaned minutes saved to: {output_file}")
print(f"Total minutes: {len(df_cleaned)}")

section_rule = "=" * 70

# --- Length statistics before and after cleaning ---
print("\n" + section_rule)
print("CLEANING STATISTICS")
print(section_rule)

original_lengths = df['text'].str.len()
cleaned_lengths = df_cleaned['text'].str.len()
mean_original = original_lengths.mean()
mean_cleaned = cleaned_lengths.mean()
mean_removed = mean_original - mean_cleaned
percent_removed = (1 - mean_cleaned / mean_original) * 100

print(f"\nAverage original text length: {mean_original:.0f} characters")
print(f"Average cleaned text length: {mean_cleaned:.0f} characters")
print(f"Average reduction: {mean_removed:.0f} characters ({percent_removed:.1f}%)")

# --- One example from each side of the 2019 cut-off ---
print("\n" + section_rule)
print("EXAMPLES OF CLEANED TEXT")
print(section_rule)

# Dates are ISO-formatted strings here, so string comparison orders them by date.
pre_2019_rows = df_cleaned[df_cleaned['date'] < '2019-01-01']
pre_2019_example = pre_2019_rows.iloc[-1] if len(pre_2019_rows) > 0 else None
post_2019_rows = df_cleaned[df_cleaned['date'] >= '2019-01-01']
post_2019_example = post_2019_rows.iloc[0] if len(post_2019_rows) > 0 else None

for heading, example in (
    ("\nPRE-2019 PERIOD", pre_2019_example),
    ("\n2019 ONWARD PERIOD", post_2019_example),
):
    if example is None:
        continue
    print(heading)
    print(f"Date: {example['date']}")
    print(f"First 200 characters: {example['text'][:200]}...")

# --- The five shortest cleaned documents, for manual inspection ---
print("\n" + section_rule)
print("SHORTEST 5 ENTRIES (FOR INSPECTION)")
print(section_rule)

df_cleaned['text_length'] = df_cleaned['text'].str.len()
shortest_5 = df_cleaned.nsmallest(5, 'text_length')[['id', 'date', 'text_length', 'text']]

for entry in shortest_5.itertuples(index=False):
    print("\n" + "-" * 70)
    print(f"ID: {entry.id}")
    print(f"Date: {entry.date}")
    print(f"Length: {entry.text_length} characters")
    print("\nFull text:")
    # Only the first 1000 characters are shown; the remainder is summarised below.
    print(entry.text[:1000])
    if entry.text_length > 1000:
        print(f"\n... (truncated, {entry.text_length - 1000} more characters)")

print("\n" + section_rule)
print("Cleaning complete!")

Reading FOMC minutes...
Loaded 199 minutes

Cleaning minutes...
Processing minutes 1/199...
Processing minutes 21/199...
Processing minutes 41/199...
Processing minutes 61/199...
Processing minutes 81/199...
Processing minutes 101/199...
Processing minutes 121/199...
Processing minutes 141/199...
Processing minutes 161/199...
Processing minutes 181/199...

Cleaned minutes saved to: /content/drive/MyDrive/FedComs/Minutes/fomc_minutes_cleaned.csv
Total minutes: 199

CLEANING STATISTICS

Average original text length: 47193 characters
Average cleaned text length: 39751 characters
Average reduction: 7442 characters (15.8%)

EXAMPLES OF CLEANED TEXT

PRE-2019 PERIOD
Date: 2018-12-19
First 200 characters: Minutes of the Federal Open Market Committee
December 18-19, 2018
A joint meeting of the Federal Open Market Committee and the Board of Governors was held in the offices of the Board of Governors of t...

2019 ONWARD PERIOD
Date: 2019-01-30
First 200 characters: S. and global financial marke

In [None]:
#@title Classify content
import pandas as pd
import numpy as np
import re
import json
import random
import time
import os

# Time-based seed: each run draws a different validation sample. The value is printed
# so that a given run can be reproduced by hard-coding the reported seed.
seed = int(time.time())
random.seed(seed)
np.random.seed(seed)
print(f"Random seed for this run: {seed}")

dict_dir = '/content/drive/MyDrive/FedComs/Dictionaries'
input_file = '/content/drive/MyDrive/FedComs/Minutes/fomc_minutes_cleaned.csv'
output_dir = '/content/drive/MyDrive/FedComs/Minutes'
validation_dir = '/content/drive/MyDrive/FedComs/Validation_Sets'

# Both folders are written to further down. Creating them up front means a missing
# validation folder cannot abort the run after all minutes have been analysed.
for directory in (output_dir, validation_dir):
    os.makedirs(directory, exist_ok=True)

def _read_dictionary(filename):
    """Parse one JSON file located in ``dict_dir`` and return its contents."""
    with open(os.path.join(dict_dir, filename), 'r') as f:
        return json.load(f)


print("\nLoading dictionaries...")
# Labor keywords, inflation regex patterns, and the pattern-name -> indicator mapping.
LABOR_INDICATORS = _read_dictionary('labor_indicators.json')
INFLATION_INDICATORS = _read_dictionary('inflation_indicators.json')
INFLATION_PATTERN_TO_INDICATOR = _read_dictionary('inflation_pattern_mapping.json')

print("Dictionaries loaded successfully!")
print(f"Labor indicators: {list(LABOR_INDICATORS.keys())}")
print(f"Inflation categories: {list(INFLATION_INDICATORS.keys())}")

def fix_text_encoding(text):
    """Fix text encoding issues (if any).

    Round-trips the text through UTF-8, dropping anything that cannot be encoded
    (for example lone surrogate code points).

    Note: once this cell runs, this definition replaces any earlier function of the
    same name in the kernel.

    Args:
        text: Text to sanitise. ``None`` and NaN are treated as empty text - pandas
            reads an empty CSV field back as NaN, so a document whose cleaned text is
            empty arrives here as a float. Other non-string values are converted
            with ``str()``.

    Returns:
        The sanitised string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return text.encode('utf-8', errors='ignore').decode('utf-8')

def split_into_sentences(text):
    """Split text into sentences, preserving initials and abbreviations.

    Periods that do not end a sentence (known abbreviations, name initials, decimal
    numbers) and whole "Voting for/against ..." lists are swapped for placeholders,
    the text is split after ., ! or ? followed by whitespace, and the placeholders
    are then restored.

    Fixes relative to the previous version:
      * the punctuation and whitespace that end a voting list are kept, so the list
        is no longer glued to the following sentence;
      * placeholders inside a restored voting list are restored too (they used to
        survive as literal "<NAME>" text);
      * a sentence that starts with a protected abbreviation or voting list
        (e.g. "U.S. equity prices ...") is split from the previous sentence;
      * abbreviations are restored exactly as written instead of in a fixed case
        ("a.m." no longer becomes "A.M.");
      * non-string input (e.g. NaN for an empty CSV field) yields an empty list.

    Args:
        text: Document text.

    Returns:
        List of sentence strings, stripped of surrounding whitespace.
    """
    if not isinstance(text, str):
        return []

    # Same UTF-8 round-trip that fix_text_encoding in this cell performs, inlined so
    # the splitter has no dependency on which definition of that name is active.
    text = text.encode('utf-8', errors='ignore').decode('utf-8')

    abbreviations = [
        r'\bU\.S\.A\.', r'\bU\.S\.', r'\bU\.K\.', r'\bE\.U\.',
        r'\bSt\.', r'\bMr\.', r'\bMrs\.', r'\bMs\.', r'\bDr\.',
        r'\bProf\.', r'\bSr\.', r'\bJr\.', r'\bvs\.', r'\betc\.',
        r'\bi\.e\.', r'\be\.g\.', r'\bVol\.', r'\bNo\.', r'\bpp\.',
        r'\bCo\.', r'\bInc\.', r'\bLtd\.', r'\bCorp\.',
        r'\bPh\.D\.', r'\bM\.A\.', r'\bM\.S\.', r'\bB\.A\.',
        r'\bD\.C\.', r'\bA\.M\.', r'\bP\.M\.'
    ]

    # Each protected abbreviation is stored verbatim so its casing can be restored.
    protected_abbreviations = []

    def store_abbreviation(match):
        protected_abbreviations.append(match.group(0))
        return f'<ABBR_{len(protected_abbreviations) - 1}>'

    for abbr in abbreviations:
        text = re.sub(abbr, store_abbreviation, text, flags=re.IGNORECASE)

    # Initials before a capitalised name ("Jerome H. Powell") and decimal points.
    text = re.sub(r'\b([A-Z])\.(\s+[A-Z]\.)*(?=\s+[A-Z][a-z]+)', lambda m: m.group(0).replace('.', '<NAME>'), text)
    text = re.sub(r'\b\d+\.\d+\b', lambda m: m.group(0).replace('.', '<DEC>'), text)

    voting_pattern = r'((?:Voting for|Voting against)\s+[^.!?]+?)([.!?]+\s+|$)'
    voting_matches = []

    def store_voting_match(match):
        voting_matches.append(match.group(1))
        # Re-emit the terminator (group 2); dropping it removed the sentence boundary.
        return f'<VOTE_{len(voting_matches) - 1}>' + match.group(2)

    text = re.sub(voting_pattern, store_voting_match, text)

    # A sentence may start with an uppercase letter or with one of our placeholders.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z]|<(?:ABBR|VOTE)_\d+>|$)', text)
    sentences = [s.strip() for s in sentences if s.strip()]

    def make_restorer(stored):
        """Build a re.sub callback that maps a placeholder index back to its text."""
        def restore(match):
            index = int(match.group(1))
            # Leave unknown indices alone (text that merely looks like a placeholder).
            return stored[index] if index < len(stored) else match.group(0)
        return restore

    restore_vote = make_restorer(voting_matches)
    restore_abbreviation = make_restorer(protected_abbreviations)

    restored_sentences = []
    for sentence in sentences:
        # Votes first: their stored text can itself contain the other placeholders.
        sentence = re.sub(r'<VOTE_(\d+)>', restore_vote, sentence)
        sentence = re.sub(r'<ABBR_(\d+)>', restore_abbreviation, sentence)
        sentence = sentence.replace('<NAME>', '.')
        sentence = sentence.replace('<DEC>', '.')
        restored_sentences.append(sentence)

    return restored_sentences

def check_keywords_in_sentence(sentence, keywords):
    """Return True if at least one keyword occurs in the sentence as a whole word.

    Matching is case-insensitive (both sides are lowercased) and each keyword is
    escaped and wrapped in word boundaries before searching.
    """
    haystack = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack)
        for term in keywords
    )

def check_employment_indicator(sentence, keywords):
    """Keyword check for the Employment indicator with mandate phrases excluded.

    Returns False for any sentence containing "maximum employment",
    "full employment" or "employment goal", regardless of what else it contains.
    Otherwise returns True if any keyword occurs as a whole word
    (case-insensitive).
    """
    lowered = sentence.lower()

    # Sentences mentioning these phrases are never counted for this indicator.
    excluded_phrases = (
        r'\b(?:maximum|full)\s+employment\b',
        r'\bemployment\s+goal\b',
    )
    if any(re.search(phrase, lowered) for phrase in excluded_phrases):
        return False

    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in keywords
    )

def check_general_labor_term(sentence):
    """Return True if the sentence contains any "General Labor" keyword.

    Keywords come from ``LABOR_INDICATORS["General Labor"]`` (an empty list when the
    key is absent) and are matched as whole words, case-insensitively.
    """
    lowered = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in LABOR_INDICATORS.get("General Labor", [])
    )

def check_general_inflation_terms(sentence):
    """Return True if the sentence matches any general inflation pattern.

    Patterns are the regular expressions stored under
    ``INFLATION_INDICATORS["General Inflation"]["general_patterns"]``; missing keys
    yield no patterns. The search runs on the lowercased sentence with IGNORECASE.
    """
    lowered = sentence.lower()
    general_section = INFLATION_INDICATORS.get("General Inflation", {})
    return any(
        re.search(regex, lowered, re.IGNORECASE)
        for regex in general_section.get("general_patterns", [])
    )

def check_inflation_sentence(sentence):
    """Return the set of inflation indicator names mentioned in the sentence.

    Every pattern group in ``INFLATION_INDICATORS`` is tested; a group counts once as
    soon as one of its regular expressions matches. The group's name is translated
    through ``INFLATION_PATTERN_TO_INDICATOR`` ("Other" when unmapped). A generic
    "Core" or "Headline" hit is dropped when the matching CPI or PCE variant was
    also found.
    """
    lowered = sentence.lower()
    found = set()

    for pattern_groups in INFLATION_INDICATORS.values():
        for group_name, regexes in pattern_groups.items():
            # any() stops at the first matching pattern, like a loop with break.
            if any(re.search(regex, lowered, re.IGNORECASE) for regex in regexes):
                found.add(INFLATION_PATTERN_TO_INDICATOR.get(group_name, "Other"))

    # Prefer the specific measure over its generic label.
    for generic in ("Core", "Headline"):
        specific_variants = (f"{generic}_CPI", f"{generic}_PCE")
        if generic in found and any(variant in found for variant in specific_variants):
            found.discard(generic)

    return found

def classify_sentence(sentence):
    """Classify one sentence as "Labor", "Inflation", "Both" or "Neither".

    A sentence is labor-related if it matches a specific labor indicator or a
    general labor term, and inflation-related if ``check_inflation_sentence``
    returns anything or a general inflation pattern matches.

    Returns:
        dict with keys 'classification' (the label), 'labor_indicators' (specific
        labor indicators found; "General Labor" is never included) and
        'inflation_indicators' (names returned by ``check_inflation_sentence``).
    """
    labor_indicators_in_sentence = set()

    for indicator, keywords in LABOR_INDICATORS.items():
        if indicator == "General Labor":
            continue  # general terms affect the label only, not the indicator list
        # "Employment" has its own matcher that excludes mandate phrases.
        matcher = check_employment_indicator if indicator == "Employment" else check_keywords_in_sentence
        if matcher(sentence, keywords):
            labor_indicators_in_sentence.add(indicator)

    labor_general_found = check_general_labor_term(sentence)
    labor_found = bool(labor_indicators_in_sentence) or labor_general_found

    inflation_indicators_in_sentence = check_inflation_sentence(sentence)
    inflation_general_found = check_general_inflation_terms(sentence)
    inflation_found = bool(inflation_indicators_in_sentence) or inflation_general_found

    label_by_topic_flags = {
        (True, True): "Both",
        (True, False): "Labor",
        (False, True): "Inflation",
        (False, False): "Neither",
    }

    return {
        'classification': label_by_topic_flags[(bool(labor_found), bool(inflation_found))],
        'labor_indicators': list(labor_indicators_in_sentence),
        'inflation_indicators': list(inflation_indicators_in_sentence)
    }

def analyze_minutes(text):
    """Analyze a single minutes document for labor and inflation content.

    The text is split into sentences and each sentence is classified with
    ``classify_sentence``. Sentence totals are taken from the returned label
    rather than re-running the general-term checks, so the counts cannot disagree
    with the per-sentence labels and each sentence is scanned only once.

    Args:
        text: Text of one minutes document.

    Returns:
        Tuple ``(summary_results, sentence_data_list)``:
          * ``summary_results``: dict with sentence counts ('sentences_on_labor',
            'sentences_on_inflation', 'sentences_on_both', 'total_sentences'),
            'labor_share_of_labor_inflation_sentences', per-indicator counts,
            emphasis shares (indicator count / all indicator mentions of that
            topic) and per-indicator shares of total sentences. A share is 0 when
            its denominator is 0.
          * ``sentence_data_list``: one dict per sentence with its 1-based number,
            text, label and comma-separated indicator names.
    """
    def share(count, total):
        """count / total, or 0 when total is not positive."""
        return count / total if total > 0 else 0

    sentences = split_into_sentences(text)
    total_sentences = len(sentences)

    labor_sentences = 0
    inflation_sentences = 0
    both_sentences = 0

    # General categories and "Other" never get their own indicator column.
    excluded_inflation = ("General_Inflation", "Other")
    labor_indicator_counts = {
        indicator: 0 for indicator in LABOR_INDICATORS if indicator != "General Labor"
    }
    inflation_indicator_counts = {
        indicator: 0
        for indicator in sorted(set(INFLATION_PATTERN_TO_INDICATOR.values()))
        if indicator not in excluded_inflation
    }

    sentence_data_list = []

    for sentence_number, sentence in enumerate(sentences, start=1):
        result = classify_sentence(sentence)
        classification = result['classification']

        labor_specific = [ind for ind in result['labor_indicators'] if ind != "General Labor"]
        inflation_specific = [ind for ind in result['inflation_indicators'] if ind not in excluded_inflation]

        sentence_data_list.append({
            'sentence_number': sentence_number,
            'sentence_text': sentence,
            'classification': classification,
            # Joining an empty list already gives ''.
            'labor_indicators': ', '.join(sorted(labor_specific)),
            'inflation_indicators': ', '.join(sorted(inflation_specific))
        })

        # A "Both" sentence counts toward each topic as well as the overlap.
        if classification == "Both":
            both_sentences += 1
            labor_sentences += 1
            inflation_sentences += 1
        elif classification == "Labor":
            labor_sentences += 1
        elif classification == "Inflation":
            inflation_sentences += 1

        # Only indicators that have a counter are tallied.
        for indicator in result['labor_indicators']:
            if indicator in labor_indicator_counts:
                labor_indicator_counts[indicator] += 1

        for indicator in result['inflation_indicators']:
            if indicator in inflation_indicator_counts:
                inflation_indicator_counts[indicator] += 1

    total_labor_mentions = sum(labor_indicator_counts.values())
    total_inflation_mentions = sum(inflation_indicator_counts.values())

    # Sentences touching either topic; the overlap is subtracted once.
    labor_inflation_total = labor_sentences + inflation_sentences - both_sentences

    # Insertion order below determines the column order of the summary dataset.
    summary_results = {
        'sentences_on_labor': labor_sentences,
        'sentences_on_inflation': inflation_sentences,
        'sentences_on_both': both_sentences,
        'total_sentences': total_sentences,
        'labor_share_of_labor_inflation_sentences': share(labor_sentences, labor_inflation_total)
    }

    for indicator, count in labor_indicator_counts.items():
        summary_results[f'labor_{indicator}_count'] = count
    for indicator, count in inflation_indicator_counts.items():
        summary_results[f'inflation_{indicator}_count'] = count

    for indicator, count in labor_indicator_counts.items():
        summary_results[f"labor_emphasis_{indicator}"] = share(count, total_labor_mentions)
    for indicator, count in inflation_indicator_counts.items():
        summary_results[f"inflation_emphasis_{indicator}"] = share(count, total_inflation_mentions)

    for indicator, count in labor_indicator_counts.items():
        summary_results[f"labor_share_total_sentences_{indicator}"] = share(count, total_sentences)
    for indicator, count in inflation_indicator_counts.items():
        summary_results[f"inflation_share_total_sentences_{indicator}"] = share(count, total_sentences)

    return summary_results, sentence_data_list

print("\nReading cleaned FOMC minutes...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} minutes")

print("\nAnalyzing minutes...")
results_list = []
all_sentences = []

# One summary row per document plus one record per sentence, tagged with its source.
for idx, (minutes_id, minutes_date, minutes_text) in enumerate(zip(df['id'], df['date'], df['text'])):
    if idx % 10 == 0:
        print(f"Processing minutes {idx+1}/{len(df)}...")

    summary_results, sentence_data_list = analyze_minutes(minutes_text)
    summary_results['date'] = minutes_date
    summary_results['id'] = minutes_id
    results_list.append(summary_results)

    all_sentences.extend(
        {**sentence_data, 'minutes_date': minutes_date, 'minutes_id': minutes_id}
        for sentence_data in sentence_data_list
    )

# Put the identifying columns first, then order the rows by date.
results_df = pd.DataFrame(results_list)
leading_cols = ['id', 'date']
cols = leading_cols + [col for col in results_df.columns if col not in leading_cols]
results_df = results_df[cols].sort_values('date')

summary_output_file = os.path.join(output_dir, 'minutes_content.csv')
results_df.to_csv(summary_output_file, index=False)
print(f"\nSummary dataset saved to: {summary_output_file}")
print(f"Shape: {results_df.shape}")

sentences_df = pd.DataFrame(all_sentences)
print(f"\nTotal sentences extracted: {len(sentences_df)}")

print("\nClassification distribution:")
print(sentences_df['classification'].value_counts())

# Number of sentences to draw from each class.
n_labor = 15
n_inflation = 15
n_both = 5
n_neither = 10

print("\nSampling sentences for validation...")
validation_samples = []
class_pools = {}

# Draw the quota from each class, or take the whole class (with a warning) if it is
# smaller than the quota.
for label, quota in (('Labor', n_labor), ('Inflation', n_inflation), ('Both', n_both), ('Neither', n_neither)):
    pool = sentences_df[sentences_df['classification'] == label]
    class_pools[label] = pool
    if len(pool) >= quota:
        validation_samples.append(pool.sample(n=quota, random_state=seed))
    else:
        print(f"Warning: Only {len(pool)} {label.lower()} sentences available")
        validation_samples.append(pool)

# Per-class frames under their original names.
labor_sentences = class_pools['Labor']
inflation_sentences = class_pools['Inflation']
both_sentences = class_pools['Both']
neither_sentences = class_pools['Neither']

# Combine the per-class samples and shuffle so classes are interleaved.
validation_df = (
    pd.concat(validation_samples, ignore_index=True)
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

validation_output_file = os.path.join(validation_dir, 'minutes_validate.csv')
validation_df.to_csv(validation_output_file, index=False)

print(f"\nValidation set created: {validation_output_file}")
print(f"Total sentences in validation set: {len(validation_df)}")
print("\nValidation set distribution:")
print(validation_df['classification'].value_counts())

results_df['date'] = pd.to_datetime(results_df['date'])
df_2010_plus = results_df[results_df['date'] >= '2010-01-01'].copy()

heavy_rule = "=" * 70
light_rule = "-" * 70

# --- Headline averages for documents dated 2010 or later ---
print("\n" + heavy_rule)
print("SUMMARY STATISTICS (2010-CURRENT)")
print(heavy_rule)
first_date = df_2010_plus['date'].min().strftime('%Y-%m-%d')
last_date = df_2010_plus['date'].max().strftime('%Y-%m-%d')
print(f"\nNumber of minutes: {len(df_2010_plus)}")
print(f"Date range: {first_date} to {last_date}")
for label, column in (
    ("\nAverage sentences per minutes", 'total_sentences'),
    ("Average labor sentences", 'sentences_on_labor'),
    ("Average inflation sentences", 'sentences_on_inflation'),
    ("Average sentences on both", 'sentences_on_both'),
):
    print(f"{label}: {df_2010_plus[column].mean():.1f}")
print(f"Average labor share of labor/inflation: {df_2010_plus['labor_share_of_labor_inflation_sentences'].mean():.2%}")

# --- Average emphasis vectors, one table per topic ---
emphasis_cols_by_prefix = {}
emphasis_total_by_prefix = {}
for prefix, heading in (
    ('labor_emphasis_', "AVERAGE LABOR EMPHASIS VECTORS (2010-CURRENT)"),
    ('inflation_emphasis_', "AVERAGE INFLATION EMPHASIS VECTORS (2010-CURRENT)"),
):
    emphasis_cols = [col for col in results_df.columns if col.startswith(prefix)]
    print("\n" + light_rule)
    print(heading)
    print(light_rule)
    for col in sorted(emphasis_cols):
        indicator_name = col.replace(prefix, '')
        avg_emphasis = df_2010_plus[col].mean()
        print(f"{indicator_name:20s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")

    # Sum of the column means across this topic's indicators.
    emphasis_total = df_2010_plus[emphasis_cols].mean().sum()
    print(f"\n{'Total':20s}: {emphasis_total:.4f}")
    emphasis_cols_by_prefix[prefix] = emphasis_cols
    emphasis_total_by_prefix[prefix] = emphasis_total

# Per-topic results under their original names.
labor_emphasis_cols = emphasis_cols_by_prefix['labor_emphasis_']
total_labor_emphasis = emphasis_total_by_prefix['labor_emphasis_']
inflation_emphasis_cols = emphasis_cols_by_prefix['inflation_emphasis_']
total_inflation_emphasis = emphasis_total_by_prefix['inflation_emphasis_']

# --- Composition of the validation sample ---
print("\n" + heavy_rule)
print("VALIDATION SET SUMMARY")
print(heavy_rule)
summary_stats = {
    'total_sentences_in_corpus': len(sentences_df),
    'validation_set_size': len(validation_df),
}
for key, label in (
    ('labor_only_count', 'Labor'),
    ('inflation_only_count', 'Inflation'),
    ('both_count', 'Both'),
    ('neither_count', 'Neither'),
):
    summary_stats[key] = len(validation_df[validation_df['classification'] == label])
for key, value in summary_stats.items():
    print(f"{key}: {value}")

print("\n" + heavy_rule)
print("SAMPLE OF VALIDATION SET (First 10 rows)")
print(heavy_rule)
print(validation_df.head(10).to_string(index=True, max_colwidth=100))


Loading dictionaries...
Dictionaries loaded successfully!
Labor indicators: ['General Labor', 'Employment', 'Unemployment', 'Participation', 'Wages', 'Vacancies', 'Quits', 'Layoffs', 'Hiring']
Inflation categories: ['General Inflation', 'Core Measures', 'Headline Measures', 'Sectoral Measures', 'Producer Price Index', 'Wage Inflation', 'Inflation Expectations', 'Commodity Prices']

Reading cleaned FOMC minutes...
Loaded 199 minutes

Analyzing minutes...
Processing minutes 1/199...
Processing minutes 11/199...
Processing minutes 21/199...
Processing minutes 31/199...
Processing minutes 41/199...
Processing minutes 51/199...
Processing minutes 61/199...
Processing minutes 71/199...
Processing minutes 81/199...
Processing minutes 91/199...
Processing minutes 101/199...
Processing minutes 111/199...
Processing minutes 121/199...
Processing minutes 131/199...
Processing minutes 141/199...
Processing minutes 151/199...
Processing minutes 161/199...
Processing minutes 171/199...
Processing m