In [20]:
# Mount Google Drive inside the Colab runtime; every path used below lives under
# /content/drive. force_remount=True is passed so the mount is requested again
# even when this cell is re-run in the same session.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import os
import pandas as pd
import numpy as np
import re
import json
import random
import time
from glob import glob

# Seed the random number generators. The seed is taken from the wall clock, so it
# differs on every run; it is printed so that a specific run (for example the
# validation sample drawn further down) can be reproduced by reusing that value.
seed = int(time.time())
random.seed(seed)
np.random.seed(seed)
print(f"Random seed: {seed}")

# Directory paths
dict_dir = '/content/drive/MyDrive/FedComs/Dictionaries'
input_dir = '/content/drive/MyDrive/FedComs/Transcripts/cleaned_transcripts'
cleaned_output_dir = '/content/drive/MyDrive/FedComs/Transcripts/final_transcripts'
validation_output_dir = '/content/drive/MyDrive/FedComs/Validation_Sets'
summary_output_dir = '/content/drive/MyDrive/FedComs/Transcripts'

# Create output directories if they don't exist. exist_ok=True avoids the
# check-then-create gap of a separate os.path.exists() test.
for directory in [cleaned_output_dir, validation_output_dir, summary_output_dir]:
    os.makedirs(directory, exist_ok=True)

os.chdir(summary_output_dir)
print(f"Current working directory: {os.getcwd()}")

# ============================================================================
# STEP 1: CLEAN TRANSCRIPTS
# ============================================================================

def clean_page_numbers(text):
    """Strip page-header residue (meeting dates and "N of M" counters) from text.

    Four substitutions run in a fixed order. Three delete their match outright;
    the standalone-counter rule replaces its match with a single newline.
    """
    month = (
        r'(?:January|February|March|April|May|June|July|August|'
        r'September|October|November|December)'
    )
    # Hyphen, en dash or em dash.
    dash = r'[-–—]'

    substitutions = (
        # Full header, e.g. "April 26–27, 2011 52 of 244".
        (
            month + r'\s+' + r'\d{1,2}(?:' + dash + r'\d{1,2})?,\s+\d{4}\s+\d+\s+of\s+\d+',
            '',
        ),
        # Date range that may be cut off, e.g. "April 30–May 1" or "April 30–".
        (
            month + r'\s+' + r'\d{1,2}' + dash + r'+' + r'(?:' + month + r'\s+\d{1,2})?',
            '',
        ),
        # A counter such as "52 of 244" sitting alone on its line.
        (r'\n\s*\d+\s+of\s+\d+\s*\n', '\n'),
        # Slash-style date followed by a page number, e.g. "6/26-27/01 87".
        (r'\d{1,2}/\d{1,2}(?:' + dash + r'\d{1,2})?/\d{2,4}\s+\d+', ''),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

# Ordered (garbled, repaired) pairs for UTF-8 punctuation that was decoded as
# Windows-1252. Every garbled form begins with the same two characters
# (U+00E2 U+20AC), so the three-character sequences must be handled before the
# bare two-character prefix; otherwise the prefix rule would consume them and
# leave a stray third character behind (e.g. an apostrophe turning into '"' + TM).
_MOJIBAKE_REPAIRS = (
    ('\u00e2\u20ac\u201d', '\u2014'),  # em dash
    ('\u00e2\u20ac\u201c', '\u2013'),  # en dash
    ('\u00e2\u20ac"', '\u2014'),       # dash whose last character was flattened to an ASCII quote
    ('\u00e2\u20ac\u2122', "'"),       # right single quote / apostrophe
    ('\u00e2\u20ac\u02dc', "'"),       # left single quote
    ('\u00e2\u20ac\u00a6', '...'),     # ellipsis
    ('\u00e2\u20ac\u0153', '"'),       # left double quote
    ('\u00e2\u20ac', '"'),             # right double quote (any trailing control char is stripped below)
)

# Typographic punctuation flattened to plain ASCII. En and em dashes are left as-is.
_TYPOGRAPHIC_TO_ASCII = (
    ('\u2018', "'"),
    ('\u2019', "'"),
    ('\u201c', '"'),
    ('\u201d', '"'),
    ('\u2026', '...'),
)


def fix_text_encoding(text):
    """Repair common mis-decoded punctuation and normalise quotes in *text*.

    Steps, in order:
      1. Replace known mojibake sequences with the intended character.
      2. Convert curly quotes and the ellipsis character to ASCII equivalents.
      3. Delete C0/C1 control characters, keeping tab, newline and carriage return.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    for garbled, repaired in _MOJIBAKE_REPAIRS:
        text = text.replace(garbled, repaired)

    for fancy, plain in _TYPOGRAPHIC_TO_ASCII:
        text = text.replace(fancy, plain)

    # \x09 (tab), \x0a (LF) and \x0d (CR) are deliberately outside the class.
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    return text

def clean_transcript_text(text):
    """Return *text* with encoding repaired, page markers removed and whitespace tidied."""
    cleaned = clean_page_numbers(fix_text_encoding(text))

    # Three or more line breaks (possibly separated by whitespace) become one blank line.
    cleaned = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned)
    # Runs of spaces become a single space.
    cleaned = re.sub(r' {2,}', ' ', cleaned)

    return cleaned.strip()

print("\nCleaning transcript files...")
print(f"Reading from: {input_dir}")

# Collect every CSV transcript found directly inside the input directory
csv_files = glob(os.path.join(input_dir, '*.csv'))
print(f"Found {len(csv_files)} transcript files")

if csv_files:
    cleaned_count = 0
    for csv_file in csv_files:
        filename = os.path.basename(csv_file)
        print(f"Processing {filename}...")

        try:
            df = pd.read_csv(csv_file)

            if 'Text' in df.columns:
                # Missing cells become empty strings; everything else is cleaned as text
                df['Text'] = df['Text'].apply(
                    lambda x: clean_transcript_text(str(x)) if pd.notna(x) else ''
                )

                # Write the cleaned copy, dropping any 'cleaned_' from the file name
                output_file = os.path.join(cleaned_output_dir, filename.replace('cleaned_', ''))
                df.to_csv(output_file, index=False)
                cleaned_count += 1
            else:
                print(f"  Warning: No 'Text' column found in {filename}, skipping...")

        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f"  Error processing {filename}: {e}")

    print(f"\nCleaned {cleaned_count} transcript files")
    print(f"Cleaned files saved to: {cleaned_output_dir}")
else:
    print("ERROR: No CSV files found in input directory!")
    print(f"Please check that files exist in: {input_dir}")

# ============================================================================
# STEP 2: CLASSIFY CONTENT
# ============================================================================

print("\n" + "="*70)
print("LOADING DICTIONARIES")
print("="*70)

# The dictionaries are JSON files; read them explicitly as UTF-8 rather than
# relying on the platform's default text encoding.
with open(os.path.join(dict_dir, 'labor_indicators.json'), 'r', encoding='utf-8') as f:
    LABOR_INDICATORS = json.load(f)

with open(os.path.join(dict_dir, 'inflation_indicators.json'), 'r', encoding='utf-8') as f:
    INFLATION_INDICATORS = json.load(f)

with open(os.path.join(dict_dir, 'inflation_pattern_mapping.json'), 'r', encoding='utf-8') as f:
    INFLATION_PATTERN_TO_INDICATOR = json.load(f)

print("Dictionaries loaded successfully!")
print(f"Labor indicators: {list(LABOR_INDICATORS.keys())}")
print(f"Inflation categories: {list(INFLATION_INDICATORS.keys())}")

def split_into_sentences(text):
    """Split text into sentences, preserving initials and abbreviations.

    Periods that do not end a sentence (known abbreviations, name initials,
    decimal numbers) are temporarily swapped for placeholders, the text is split
    on sentence-ending punctuation followed by whitespace and a capital letter
    (or end of text), and the placeholders are then swapped back.

    Args:
        text: Raw transcript text.

    Returns:
        A list of non-empty, stripped sentence strings.
    """
    text = fix_text_encoding(text)

    abbreviations = [
        r'\bU\.S\.A\.', r'\bU\.S\.', r'\bU\.K\.', r'\bE\.U\.',
        r'\bSt\.', r'\bMr\.', r'\bMrs\.', r'\bMs\.', r'\bDr\.',
        r'\bProf\.', r'\bSr\.', r'\bJr\.', r'\bvs\.', r'\betc\.',
        r'\bi\.e\.', r'\be\.g\.', r'\bVol\.', r'\bNo\.', r'\bpp\.',
        r'\bCo\.', r'\bInc\.', r'\bLtd\.', r'\bCorp\.',
        r'\bPh\.D\.', r'\bM\.A\.', r'\bM\.S\.', r'\bB\.A\.',
        r'\bD\.C\.', r'\bA\.M\.', r'\bP\.M\.'
    ]

    def _restorer(store):
        """Build a re.sub callback mapping '<TAG_n>' back to store[n]."""
        def _swap(match):
            idx = int(match.group(1))
            # Leave look-alike text that was never a placeholder untouched.
            return store[idx] if idx < len(store) else match.group(0)
        return _swap

    # Matching is case-insensitive, so remember exactly what was matched: the
    # original casing (e.g. "a.m.") is put back instead of a canonical spelling.
    shielded_abbrs = []

    def _shield_abbr(match):
        shielded_abbrs.append(match.group(0))
        return f'<ABBR_{len(shielded_abbrs) - 1}>'

    for abbr in abbreviations:
        text = re.sub(abbr, _shield_abbr, text, flags=re.IGNORECASE)

    # Initials directly before a capitalised word ("J. M. Keynes") and decimals ("2.5").
    text = re.sub(r'\b([A-Z])\.(\s+[A-Z]\.)*(?=\s+[A-Z][a-z]+)', lambda m: m.group(0).replace('.', '<NAME>'), text)
    text = re.sub(r'\b\d+\.\d+\b', lambda m: m.group(0).replace('.', '<DEC>'), text)

    voting_pattern = r'((?:Voting for|Voting against)\s+[^.!?]+?)([.!?]+\s+|$)'
    voting_matches = []

    def store_voting_match(match):
        voting_matches.append(match.group(1))
        # Re-emit the terminator and whitespace (group 2). Dropping them would
        # glue the vote list onto whatever follows and remove its final period.
        return f'<VOTE_{len(voting_matches) - 1}>{match.group(2)}'

    text = re.sub(voting_pattern, store_voting_match, text)

    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z]|$)', text)

    restored_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # Vote lists were captured after the other placeholders were inserted, so
        # they can contain them: expand votes first, then everything else.
        sentence = re.sub(r'<VOTE_(\d+)>', _restorer(voting_matches), sentence)
        sentence = re.sub(r'<ABBR_(\d+)>', _restorer(shielded_abbrs), sentence)
        sentence = sentence.replace('<NAME>', '.').replace('<DEC>', '.')
        restored_sentences.append(sentence)

    return restored_sentences

def check_keywords_in_sentence(sentence, keywords):
    """Return True when at least one keyword occurs in the sentence as a whole word/phrase.

    Matching is case-insensitive and keywords are treated as literal text.
    """
    haystack = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack)
        for term in keywords
    )

def check_employment_indicator(sentence, keywords):
    """Keyword check for the Employment indicator with mandate phrases excluded.

    A sentence that mentions "maximum employment", "full employment" or
    "employment goal" never counts, whatever else it contains. Otherwise the
    result is True when any keyword appears as a whole word/phrase
    (case-insensitive, literal match).
    """
    lowered = sentence.lower()

    # Mandate-style wording disqualifies the whole sentence.
    mandate_phrase = r'\b(?:maximum|full)\s+employment\b|\bemployment\s+goal\b'
    if re.search(mandate_phrase, lowered):
        return False

    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in keywords
    )

def check_general_labor_term(sentence):
    """Return True when the sentence contains a keyword from the "General Labor" list.

    Keywords come from LABOR_INDICATORS["General Labor"] (an empty list when the
    key is absent) and are matched case-insensitively as literal whole words.
    """
    lowered = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', lowered)
        for term in LABOR_INDICATORS.get("General Labor", [])
    )

def check_general_inflation_terms(sentence):
    """Return True when the sentence matches a general inflation pattern.

    Patterns are read from INFLATION_INDICATORS["General Inflation"]["general_patterns"]
    (missing keys yield no patterns) and are applied as regular expressions,
    case-insensitively, to the lower-cased sentence.
    """
    lowered = sentence.lower()
    general_patterns = INFLATION_INDICATORS.get("General Inflation", {}).get("general_patterns", [])
    return any(re.search(regex, lowered, re.IGNORECASE) for regex in general_patterns)

def check_inflation_sentence(sentence):
    """Return the set of inflation indicator names mentioned in the sentence.

    Each pattern group in INFLATION_INDICATORS is tested against the lower-cased
    sentence; a group with at least one matching regex contributes the indicator
    name mapped to it in INFLATION_PATTERN_TO_INDICATOR ("Other" if unmapped).
    A generic "Core" / "Headline" hit is dropped when a CPI- or PCE-specific
    variant of the same kind was also found.
    """
    lowered = sentence.lower()
    found = set()

    for subcategories in INFLATION_INDICATORS.values():
        for pattern_name, pattern_list in subcategories.items():
            # One matching pattern is enough for the whole group.
            if any(re.search(regex, lowered, re.IGNORECASE) for regex in pattern_list):
                found.add(INFLATION_PATTERN_TO_INDICATOR.get(pattern_name, "Other"))

    # Prefer the specific measure over its generic counterpart.
    generic_to_specific = (
        ("Core", ("Core_CPI", "Core_PCE")),
        ("Headline", ("Headline_CPI", "Headline_PCE")),
    )
    for generic, specifics in generic_to_specific:
        if generic in found and not found.isdisjoint(specifics):
            found.discard(generic)

    return found

def classify_sentence(sentence):
    """Label one sentence as "Labor", "Inflation", "Both" or "Neither".

    Returns a dict with:
      'classification'       - the label,
      'labor_indicators'     - list of specific labor indicators matched
                               ("General Labor" is never listed),
      'inflation_indicators' - list of inflation indicator names matched.
    General labor / inflation terms influence the label only.
    """
    labor_hits = set()
    for indicator, keywords in LABOR_INDICATORS.items():
        if indicator == "General Labor":
            # General terms affect the label but are not reported as an indicator.
            continue
        # "Employment" uses the matcher that ignores mandate wording.
        matcher = check_employment_indicator if indicator == "Employment" else check_keywords_in_sentence
        if matcher(sentence, keywords):
            labor_hits.add(indicator)

    general_labor = check_general_labor_term(sentence)
    has_labor = bool(labor_hits) or general_labor

    inflation_hits = check_inflation_sentence(sentence)
    general_inflation = check_general_inflation_terms(sentence)
    has_inflation = bool(inflation_hits) or general_inflation

    labels = {
        (True, True): "Both",
        (True, False): "Labor",
        (False, True): "Inflation",
        (False, False): "Neither",
    }

    return {
        'classification': labels[(has_labor, has_inflation)],
        'labor_indicators': list(labor_hits),
        'inflation_indicators': list(inflation_hits)
    }

def analyze_transcript(text):
    """Analyze a single transcript for labor and inflation content.

    The text is split into sentences, each sentence is classified once with
    classify_sentence(), and the results are aggregated.

    Args:
        text: Full text of one transcript entry.

    Returns:
        A tuple (summary_results, sentence_data_list):
          summary_results    - dict of sentence tallies, per-indicator counts,
                               emphasis shares (count / all specific mentions of
                               that topic) and sentence shares (count / total
                               sentences). Every ratio is 0 when its denominator is 0.
          sentence_data_list - one dict per sentence with its 1-based number,
                               text, classification and comma-joined indicators.
    """
    def _share(count, total):
        """Return count / total, or 0 when there is nothing to divide by."""
        return count / total if total > 0 else 0

    sentences = split_into_sentences(text)
    total_sentences = len(sentences)

    # Only specific indicators get a counter; general categories are excluded.
    labor_indicator_counts = {
        indicator: 0 for indicator in LABOR_INDICATORS if indicator != "General Labor"
    }
    inflation_indicator_counts = {
        indicator: 0
        for indicator in sorted(
            set(INFLATION_PATTERN_TO_INDICATOR.values()) - {"General_Inflation", "Other"}
        )
    }

    labor_sentences = 0
    inflation_sentences = 0
    both_sentences = 0
    sentence_data_list = []

    for sent_number, sentence in enumerate(sentences, start=1):
        result = classify_sentence(sentence)
        classification = result['classification']

        # Hide general / catch-all names from the per-sentence indicator columns.
        labor_shown = sorted(
            ind for ind in result['labor_indicators'] if ind != "General Labor"
        )
        inflation_shown = sorted(
            ind for ind in result['inflation_indicators']
            if ind not in ("General_Inflation", "Other")
        )

        sentence_data_list.append({
            'sentence_number': sent_number,
            'sentence_text': sentence,
            'classification': classification,
            'labor_indicators': ', '.join(labor_shown),
            'inflation_indicators': ', '.join(inflation_shown)
        })

        # classify_sentence() has already combined specific and general matches
        # into the label, so the tallies are read from it instead of scanning the
        # sentence for general terms a second time. "Both" counts toward all three.
        if classification in ("Labor", "Both"):
            labor_sentences += 1
        if classification in ("Inflation", "Both"):
            inflation_sentences += 1
        if classification == "Both":
            both_sentences += 1

        # Only indicators that have a counter feed the emphasis vectors.
        for indicator in result['labor_indicators']:
            if indicator in labor_indicator_counts:
                labor_indicator_counts[indicator] += 1

        for indicator in result['inflation_indicators']:
            if indicator in inflation_indicator_counts:
                inflation_indicator_counts[indicator] += 1

    total_labor_mentions = sum(labor_indicator_counts.values())
    total_inflation_mentions = sum(inflation_indicator_counts.values())

    # Sentences touching labor or inflation, with "Both" sentences counted once.
    labor_inflation_total = labor_sentences + inflation_sentences - both_sentences

    summary_results = {
        'sentences_on_labor': labor_sentences,
        'sentences_on_inflation': inflation_sentences,
        'sentences_on_both': both_sentences,
        'total_sentences': total_sentences,
        'labor_share_of_labor_inflation_sentences': _share(labor_sentences, labor_inflation_total)
    }

    # Key insertion order below fixes the column order of the summary table.
    for indicator, count in labor_indicator_counts.items():
        summary_results[f'labor_{indicator}_count'] = count

    for indicator, count in inflation_indicator_counts.items():
        summary_results[f'inflation_{indicator}_count'] = count

    for indicator, count in labor_indicator_counts.items():
        summary_results[f"labor_emphasis_{indicator}"] = _share(count, total_labor_mentions)

    for indicator, count in inflation_indicator_counts.items():
        summary_results[f"inflation_emphasis_{indicator}"] = _share(count, total_inflation_mentions)

    for indicator, count in labor_indicator_counts.items():
        summary_results[f"labor_share_total_sentences_{indicator}"] = _share(count, total_sentences)

    for indicator, count in inflation_indicator_counts.items():
        summary_results[f"inflation_share_total_sentences_{indicator}"] = _share(count, total_sentences)

    return summary_results, sentence_data_list

print("\n" + "="*70)
print("CLASSIFYING TRANSCRIPT CONTENT")
print("="*70)

# Every cleaned transcript written by the previous step
cleaned_files = glob(os.path.join(cleaned_output_dir, '*.csv'))
print(f"Found {len(cleaned_files)} cleaned transcript files to classify")

results_list = []
all_sentences = []

for idx, csv_file in enumerate(cleaned_files):
    filename = os.path.basename(csv_file)
    # The file name doubles as the official's name when no 'Name' column exists
    official_name = filename.replace('.csv', '').replace('_', ' ')

    # Progress line for every fifth file
    if idx % 5 == 0:
        print(f"Processing file {idx+1}/{len(cleaned_files)}: {filename}")

    try:
        df = pd.read_csv(csv_file, encoding='utf-8', encoding_errors='replace')

        if 'Text' not in df.columns:
            print(f"  Warning: No 'Text' column in {filename}, skipping...")
            continue

        # One row per speech/statement
        for row_idx, row in df.iterrows():
            text = '' if pd.isna(row['Text']) else str(row['Text'])
            if not text.strip():
                continue

            summary_results, sentence_data_list = analyze_transcript(text)

            # Row metadata, with fallbacks for columns the file does not have
            fallback_id = f"{official_name}_{row_idx}"
            summary_results['transcript_id'] = str(row['id']) if 'id' in row else fallback_id
            summary_results['date'] = str(row['Date']) if 'Date' in row else ''
            summary_results['official_name'] = str(row['Name']) if 'Name' in row else official_name
            summary_results['role'] = str(row['Role']) if 'Role' in row else ''

            results_list.append(summary_results)

            # Tag each sentence record with its transcript's metadata
            for sentence_data in sentence_data_list:
                sentence_data.update(
                    transcript_id=summary_results['transcript_id'],
                    date=summary_results['date'],
                    official_name=summary_results['official_name'],
                )
                all_sentences.append(sentence_data)

    except Exception as e:
        # Best effort: report with traceback, then carry on with the next file
        print(f"  Error processing {filename}: {e}")
        import traceback
        print(f"  Full traceback: {traceback.format_exc()}")

# Build the per-transcript summary table
results_df = pd.DataFrame(results_list)

if len(results_df) == 0:
    print("\nWarning: No results to save!")
else:
    # Metadata columns first, then every computed column in its existing order
    leading_cols = ['transcript_id', 'date', 'official_name', 'role']
    cols = leading_cols + [col for col in results_df.columns if col not in leading_cols]
    results_df = results_df[[col for col in cols if col in results_df.columns]]
    results_df = results_df.sort_values('date')

    summary_output_file = os.path.join(summary_output_dir, 'transcripts_content.csv')
    results_df.to_csv(summary_output_file, index=False)
    print(f"\nSummary dataset saved to: {summary_output_file}")
    print(f"Shape: {results_df.shape}")

# Build the sentence-level table
sentences_df = pd.DataFrame(all_sentences)

if len(sentences_df) > 0:
    print(f"\nTotal sentences extracted: {len(sentences_df)}")
    print("\nClassification distribution:")
    print(sentences_df['classification'].value_counts())

    # Target sample size per class
    n_labor = 15
    n_inflation = 15
    n_both = 5
    n_neither = 10

    print(f"\nSampling sentences for validation...")
    validation_samples = []

    labor_sentences = sentences_df[sentences_df['classification'] == 'Labor']
    inflation_sentences = sentences_df[sentences_df['classification'] == 'Inflation']
    both_sentences = sentences_df[sentences_df['classification'] == 'Both']
    neither_sentences = sentences_df[sentences_df['classification'] == 'Neither']

    sampling_plan = (
        ('labor', labor_sentences, n_labor),
        ('inflation', inflation_sentences, n_inflation),
        ('both', both_sentences, n_both),
        ('neither', neither_sentences, n_neither),
    )
    for label, pool, quota in sampling_plan:
        if len(pool) >= quota:
            validation_samples.append(pool.sample(n=quota, random_state=seed))
        else:
            # Too few sentences of this class: take all of them
            print(f"Warning: Only {len(pool)} {label} sentences available")
            validation_samples.append(pool)

    # Combine the per-class samples and shuffle the rows
    validation_df = pd.concat(validation_samples, ignore_index=True)
    validation_df = validation_df.sample(frac=1, random_state=seed).reset_index(drop=True)

    validation_output_file = os.path.join(validation_output_dir, 'transcripts_validate.csv')
    validation_df.to_csv(validation_output_file, index=False)

    print(f"\nValidation set created: {validation_output_file}")
    print(f"Total sentences in validation set: {len(validation_df)}")
    print(f"\nValidation set distribution:")
    print(validation_df['classification'].value_counts())

# Report summary statistics for the per-transcript table
if len(results_df) > 0:
    print("\n" + "="*70)
    print("SUMMARY STATISTICS")
    print("="*70)
    print(f"\nNumber of transcripts analyzed: {len(results_df)}")

    average_lines = (
        ("\nAverage sentences per transcript", 'total_sentences'),
        ("Average labor sentences", 'sentences_on_labor'),
        ("Average inflation sentences", 'sentences_on_inflation'),
        ("Average sentences on both", 'sentences_on_both'),
    )
    for caption, column in average_lines:
        print(f"{caption}: {results_df[column].mean():.1f}")
    print(f"Average labor share of labor/inflation: {results_df['labor_share_of_labor_inflation_sentences'].mean():.2%}")

    # Emphasis breakdowns: the labor section first, then inflation
    labor_emphasis_cols = [col for col in results_df.columns if col.startswith('labor_emphasis_')]
    inflation_emphasis_cols = [col for col in results_df.columns if col.startswith('inflation_emphasis_')]

    emphasis_sections = (
        ("AVERAGE LABOR EMPHASIS VECTORS", 'labor_emphasis_', labor_emphasis_cols),
        ("AVERAGE INFLATION EMPHASIS VECTORS", 'inflation_emphasis_', inflation_emphasis_cols),
    )
    for heading, prefix, emphasis_cols in emphasis_sections:
        print("\n" + "-"*70)
        print(heading)
        print("-"*70)
        for col in sorted(emphasis_cols):
            indicator_name = col.replace(prefix, '')
            avg_emphasis = results_df[col].mean()
            print(f"{indicator_name:20s}: {avg_emphasis:.4f} ({avg_emphasis*100:.2f}%)")

        section_total = results_df[emphasis_cols].mean().sum()
        print(f"\n{'Total':20s}: {section_total:.4f}")

    # Officials ranked by their mean labor share
    print("\n" + "-"*70)
    print("OFFICIALS BY LABOR SHARE OF LABOR/INFLATION")
    print("-"*70)

    official_labor_share = (
        results_df
        .groupby('official_name')['labor_share_of_labor_inflation_sentences']
        .agg(['mean', 'count'])
    )
    # Keep officials with at least 3 transcripts, highest mean first
    official_labor_share = official_labor_share[official_labor_share['count'] >= 3]
    official_labor_share = official_labor_share.sort_values('mean', ascending=False)

    rankings = (
        ("\nTop 10 Officials (Highest Labor Share):", official_labor_share.head(10)),
        ("\nBottom 10 Officials (Lowest Labor Share):", official_labor_share.tail(10)),
    )
    for title, subset in rankings:
        print(title)
        for rank, (official, row) in enumerate(subset.iterrows(), 1):
            print(f"{rank:2d}. {official:30s}: {row['mean']:.2%} (n={int(row['count'])})")

print("\n" + "="*70)
print("PROCESSING COMPLETE!")
print("="*70)

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/FedComs/Transcripts

Cleaning transcript files...
Reading from: /content/drive/MyDrive/FedComs/Transcripts/cleaned_transcripts
Found 53 transcript files
Processing Alan_Greenspan.csv...
Processing Roger_W_Ferguson.csv...
Processing William_McDonough.csv...
Processing Alfred_Broaddus.csv...
Processing Edward_G_Boehne.csv...
Processing Jerry_Jordan.csv...
Processing Robert_T_Parry.csv...
Processing Gary_H_Stern.csv...
Processing William_Poole.csv...
Processing Thomas_M_Hoenig.csv...
Processing Robert_D_McTeer.csv...
Processing Michael_Moskow.csv...
Processing Jack_Guynn.csv...
Processing Edward_M_Gramlich.csv...
Processing Anthony_M_Santomero.csv...
Processing Susan_S_Bies.csv...
Processing Mark_W_Olson.csv...
Processing Donald_Kohn.csv...
Processing Sandra_Pianalto.csv...
Processing Timothy_Geithner.csv...
Processing Jeffrey_M_Lacker.csv...
Processing Richard_W_Fisher.csv...
Processing Ben_Bernanke.csv...
Proces