In [1]:
import os

from google.colab import drive

# Configuration: Drive locations read by the analysis cell further down.
# Defined before the mount so the names exist in the kernel even if mounting fails.
SPEECHES_DIR = '/content/drive/MyDrive/FedComs/Speeches'
OUTPUT_DIR = '/content/drive/MyDrive/FedComs/Speeches/Analysis'

# Working directory for this session.
output_dir = '/content/drive/MyDrive/Statements'

# Mount Google Drive; force_remount=True re-mounts even if a mount already exists.
drive.mount("/content/drive", force_remount=True)

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

os.chdir(output_dir)

# Verify the current working directory
print(f"Current working directory: {os.getcwd()}")

MessageError: Error: credential propagation was unsuccessful

In [5]:
# CREATE INFLATION INDICATOR EMPHASIS VECTORS
import os
import pandas as pd
import re
import numpy as np
import glob
from collections import Counter

# Inflation indicators dictionary.
#
# Layout: {category name: {pattern-group name: [regex, ...]}}.
#
# Patterns are written in lower case because classify_inflation_sentence_robust
# lower-cases every sentence before calling re.search on it.  Within one sentence a
# pattern group contributes at most one hit to its category, however many of its
# patterns match.
#
# Headline CPI/PCE catch-all patterns carry two guards so that a bare "cpi"/"pce"
# mention is only treated as headline when it is not really a core/ex-item mention:
#   * fixed-width negative lookbehinds reject "core cpi" / "underlying pce"
#     (only a single whitespace character between the words is recognised,
#     because Python lookbehinds must be fixed width);
#   * the negative lookahead ends in \b so that short alternatives such as "ex"
#     only reject whole words ("cpi ex food") and not prefixes ("cpi exceeded").
#
# NOTE(review): energy / gas / gasoline price patterns appear under both
# "Sectoral Measures" and "Commodity Prices", so one sentence can add a hit to
# both categories.
robust_inflation_indicators = {
    "Core Measures": {
        # Core CPI patterns
        "core_cpi_patterns": [
            r"\b(core|underlying)\s+(cpi|consumer\s+price\s+index)\b",
            r"\b(cpi|consumer\s+price\s+index)\s+(core|underlying)\b",
            r"\bcore\s+consumer\s+price\s+inflation\b",
            r"\bcore\s+inflation\b"
        ],

        # Core PCE patterns
        "core_pce_patterns": [
            r"\b(core|underlying)\s+(pce|personal\s+consumption\s+expenditures?)\b",
            r"\b(pce|personal\s+consumption\s+expenditures?)\s+(core|underlying)\b",
            r"\bcore\s+personal\s+consumption\b"
        ],

        # Excluding patterns (food/energy)
        "excluding_patterns": [
            r"\b(cpi|pce|consumer\s+price|personal\s+consumption|inflation|prices?)\s+(excluding|ex|minus|without|less)\s+(food|energy|volatile)\b",
            r"\b(excluding|ex|minus|without|less)\s+(food|energy|volatile)\s+(cpi|pce|consumer\s+price|inflation|prices?)\b",
            r"\b(cpi|pce|inflation|prices?)\s+(excluding|ex|minus|without|less)\s+(food\s+and\s+energy|volatile\s+items)\b",
            r"\b(stripping\s+out|stripped\s+of)\s+(food|energy|volatile)\b",
            r"\bex[\-\s]food\s+and\s+energy\b",
            r"\bexcluding\s+volatile\s+(food\s+and\s+)?energy\b"
        ],

        # Trimmed mean patterns
        "trimmed_patterns": [
            r"\b(trimmed|weighted)\s+(mean|median|average)\s+(cpi|pce|inflation|prices?)\b",
            r"\b(cpi|pce|inflation|prices?)\s+(trimmed|weighted)\s+(mean|median|average)\b",
            r"\b(trimmed-mean|weighted-median)\b",
            r"\btrimmed\s+mean\b"
        ],

        # Excluding housing patterns
        "excluding_housing_patterns": [
            r"\b(services?|cpi|pce|inflation|prices?)\s+(excluding|ex|minus|without|less)\s+(housing|shelter|rent)\b",
            r"\b(excluding|ex|minus|without|less)\s+(housing|shelter|rent)\s+(services?|inflation|prices?)\b",
            r"\bex[\-\s]housing\b",
            r"\bexcluding\s+housing\b"
        ],

        # Excluding energy patterns
        "excluding_energy_patterns": [
            r"\b(cpi|pce|inflation|prices?)\s+(excluding|ex|minus|without|less)\s+energy\b",
            r"\b(excluding|ex|minus|without|less)\s+energy\s+(cpi|pce|inflation|prices?)\b",
            r"\bex[\-\s]energy\b",
            r"\bexcluding\s+energy\b"
        ],

        # Supercore patterns
        "supercore_patterns": [
            r"\bsupercore\b",
            r"\bsuper\s+core\b",
            r"\bservices?\s+(excluding|ex|minus|without)\s+(housing|shelter)\b",
            r"\bcore\s+services?\s+(excluding|ex|minus|without)\s+(housing|shelter)\b",
            r"\bcore\s+services\s+ex[\-\s]housing\b"
        ]
    },

    "Headline Measures": {
        "headline_cpi_patterns": [
            r"\b(headline|overall|total|all-items?)\s+(cpi|consumer\s+price\s+index|consumer\s+prices?)\b",
            r"\b(cpi|consumer\s+price\s+index|consumer\s+prices?)\s+(headline|overall|total)\b",
            # Bare "cpi": not preceded by core/underlying, not followed by a core/ex qualifier word
            r"(?<!core\s)(?<!underlying\s)\bcpi\b(?!\s+(core|excluding|ex|minus|without|trimmed)\b)",
            r"(?<!core\s)(?<!underlying\s)\bconsumer\s+price\s+index\b(?!\s+(core|excluding|ex|minus|without)\b)",
            r"\ball\s+items\s+cpi\b",
            r"\btotal\s+cpi\b"
        ],

        "headline_pce_patterns": [
            r"\b(headline|overall|total|all-items?)\s+(pce|personal\s+consumption\s+expenditures?)\b",
            r"\b(pce|personal\s+consumption\s+expenditures?)\s+(headline|overall|total)\b",
            # Bare "pce": same guards as the bare "cpi" pattern above
            r"(?<!core\s)(?<!underlying\s)\bpce\b(?!\s+(core|excluding|ex|minus|without)\b)",
            r"(?<!core\s)(?<!underlying\s)\bpersonal\s+consumption\s+expenditures?\b(?!\s+(core|excluding|ex|minus|without)\b)",
            r"\ball\s+items\s+pce\b",
            r"\btotal\s+pce\b"
        ],

        "general_headline_patterns": [
            r"\bheadline\s+inflation\b",
            r"\boverall\s+inflation\b",
            r"\btotal\s+inflation\b",
            r"\ball[\-\s]items?\s+inflation\b"
        ]
    },

    "Sectoral Measures": {
        # Services patterns
        "services_patterns": [
            r"\bservices?\s+(inflation|prices?|costs?|pricing)\b",
            r"\b(inflation|prices?|costs?|pricing)\s+(in|of|for|on)\s+services?\b",
            r"\b(price|cost|pricing)\s+of\s+services?\b",
            r"\bservice\s+(sector\s+)?inflation\b",
            r"\bservices\s+component\b",
            r"\b(recreation|transportation|medical|professional|financial|business|personal)\s+services?\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(recreation|transportation|medical|professional|financial|business|personal)\s+services?\b"
        ],

        # Goods patterns
        "goods_patterns": [
            r"\b(consumer\s+)?goods\s+(inflation|prices?|costs?|pricing)\b",
            r"\b(inflation|prices?|costs?|pricing)\s+(in|of|for|on)\s+(consumer\s+)?goods\b",
            r"\b(price|cost|pricing)\s+of\s+(consumer\s+)?goods\b",
            r"\bgoods\s+(sector\s+)?inflation\b",
            r"\bgoods\s+component\b",
            r"\b(durable|nondurable|manufactured|retail)\s+goods\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(durable|nondurable|manufactured|retail)\s+goods\b",
            r"\b(motor\s+vehicles?|appliances?|furniture|clothing|apparel)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(motor\s+vehicles?|appliances?|furniture|clothing|apparel)\b"
        ],

        # Housing patterns
        "housing_patterns": [
            r"\bhousing\s+(inflation|prices?|costs?|pricing)\b",
            r"\b(inflation|prices?|costs?|pricing)\s+(in|of|for|on)\s+housing\b",
            r"\b(price|cost|pricing)\s+of\s+housing\b",
            r"\bhousing\s+(sector\s+)?inflation\b",
            r"\bhousing\s+component\b",
            r"\b(shelter|rent|rental|residential)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(shelter|rent|rental|residential)\b",
            r"\bowners?\s+equivalent\s+rent\b",
            r"\boer\b",
            r"\bprimary\s+residence\s+(prices?|costs?)\b",
            r"\brent\s+inflation\b"
        ],

        # Energy patterns
        "energy_patterns": [
            r"\benergy\s+(inflation|prices?|costs?|pricing)\b",
            r"\b(inflation|prices?|costs?|pricing)\s+(in|of|for|on)\s+energy\b",
            r"\b(price|cost|pricing)\s+of\s+energy\b",
            r"\benergy\s+(sector\s+)?inflation\b",
            r"\benergy\s+component\b",
            r"\b(gasoline|fuel|electricity|natural\s+gas|heating\s+oil|motor\s+fuel)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(gasoline|fuel|electricity|natural\s+gas|heating\s+oil|motor\s+fuel)\b",
            r"\b(gas|petroleum\s+products?|utilities)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(gas|petroleum\s+products?|utilities)\b"
        ],

        # Food patterns
        "food_patterns": [
            r"\bfood\s+(inflation|prices?|costs?|pricing)\b",
            r"\b(inflation|prices?|costs?|pricing)\s+(in|of|for|on)\s+food\b",
            r"\b(price|cost|pricing)\s+of\s+food\b",
            r"\bfood\s+(sector\s+)?inflation\b",
            r"\bfood\s+component\b",
            r"\b(grocery|restaurant|dining|eating\s+out)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(grocery|restaurant|dining|eating\s+out)\b",
            r"\bfood\s+(at\s+home|away\s+from\s+home)\s+(inflation|prices?|costs?)\b",
            r"\b(meat|dairy|produce|beverage|agricultural)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(meat|dairy|produce|beverage|agricultural)\b"
        ]
    },

    "Producer Price Index": {
        "ppi_patterns": [
            r"\bproducer\s+price\s+index\b",
            r"\bppi\b",
            r"\bproducer\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(producer|wholesale|factory|manufacturing|industrial)\b",
            r"\b(wholesale|factory|manufacturing|industrial)\s+(inflation|prices?|costs?)\b",
            r"\b(upstream|finished\s+goods|intermediate\s+demand|final\s+demand)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(upstream|finished\s+goods|intermediate\s+demand|final\s+demand)\b",
            r"\bwholesale\s+price\s+index\b",
            r"\binput\s+(prices?|costs?)\b",
            r"\bfactory\s+gate\s+prices?\b"
        ]
    },

    "Wage Inflation": {
        "wage_patterns": [
            r"\b(wage|wages|salary|salaries|pay|earnings|compensation)\s+(inflation|growth|gains?|increases?|rises?)\b",
            r"\b(inflation|growth|gains?|increases?|rises?)\s+(in|of)\s+(wage|wages|salary|salaries|pay|earnings|compensation)\b",
            r"\bwage\s+(inflation|growth|gains?|increases?|rises?|pressures?)\b",
            r"\baverage\s+hourly\s+earnings\b",
            r"\bahe\b",
            r"\bemployment\s+cost\s+index\b",
            r"\beci\b",
            r"\bunit\s+labor\s+costs?\b",
            r"\bulc\b",
            r"\blabor\s+(costs?|cost\s+growth|market\s+pressures?)\b",
            r"\b(costs?|pricing)\s+(in|of|for)\s+labor\b",
            r"\b(tight|tightness)\s+labor\s+market\b",
            r"\bwage\s+(pressures?|spiral)\b",
            r"\bemployee\s+compensation\s+(per\s+hour|growth|increases?)\b",
            r"\bcompensation\s+per\s+hour\b",
            r"\blabor\s+compensation\b",
            r"\bwage\s+growth\b"
        ]
    },

    "Inflation Expectations": {
        "expectations_patterns": [
            r"\binflation\s+expectations?\b",
            r"\bexpected\s+inflation\b",
            r"\binflation\s+(forecasts?|outlook|projections?)\b",
            r"\b(long-term|short-term)\s+(inflation\s+)?expectations?\b",
            r"\bbreakeven\s+(inflation|rates?)\b",
            r"\b(5-year|10-year|5y5y)\s+breakeven\b",
            r"\btips\s+(breakeven|spreads?)\b",
            r"\binflation\s+(swaps?|compensation)\b",
            r"\b(market-based|survey-based)\s+expectations?\b",
            r"\buniversity\s+of\s+michigan\b",
            r"\bmichigan\s+survey\b",
            r"\bsurvey\s+of\s+professional\s+forecasters\b",
            r"\binflation\s+swaps?\b",
            r"\bbreakeven\s+inflation\s+rates?\b",
            r"\b5[\-\s]year\s+breakeven\b",
            r"\b10[\-\s]year\s+breakeven\b",
            r"\binflation\s+indexed\s+securities\b"
        ]
    },

    "Commodity Prices": {
        "commodity_patterns": [
            r"\bcommodity\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+commodities\b",
            r"\benergy\s+(prices?|costs?)\b",
            r"\boil\s+(prices?|costs?)\b",
            r"\b(crude\s+oil|petroleum)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(crude\s+oil|oil|petroleum)\b",
            r"\bgas\s+(prices?|costs?)\b",
            r"\bgasoline\s+(prices?|costs?)\b",
            r"\bnatural\s+gas\s+(prices?|costs?)\b",
            r"\braw\s+materials?\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+raw\s+materials?\b",
            r"\bbase\s+metals?\s+(inflation|prices?|costs?)\b",
            r"\bindustrial\s+commodities\s+(inflation|prices?|costs?)\b",
            r"\b(copper|aluminum|steel|iron\s+ore|lumber)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(copper|aluminum|steel|iron\s+ore|lumber)\b",
            r"\b(agricultural\s+commodities|crop|grain|wheat|corn|soybean)\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+(agricultural\s+commodities|crop|grain|wheat|corn|soybean)\b",
            r"\bmaterials?\s+(inflation|prices?|costs?)\b",
            r"\b(inflation|prices?|costs?)\s+(in|of|for)\s+materials?\b"
        ]
    }
}



def create_output_directory():
    """Ensure ``OUTPUT_DIR`` exists, creating it (and any parents) when missing.

    Prints a confirmation message only when the path did not exist beforehand.

    Raises:
        OSError: if the directory cannot be created (for example, when the path
            already exists but is not a directory).
    """
    existed_before = os.path.exists(OUTPUT_DIR)
    # exist_ok=True removes the check-then-create race: a directory created by
    # another process between the check and this call no longer raises.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if not existed_before:
        print(f"Created output directory: {OUTPUT_DIR}")

def get_inflation_keywords():
    """Return the inflation-related keywords used to pre-filter sentences.

    The list combines general inflation vocabulary with terms drawn from the
    indicator categories. Keywords are lower-cased, de-duplicated and returned in
    sorted order so the result is identical from run to run (a plain
    ``list(set(...))`` has an arbitrary, hash-dependent order).

    Returns:
        list[str]: sorted, unique, lower-case keywords and phrases.
    """
    general_terms = [
        "inflation", "inflating", "inflationary", "price", "prices", "pricing",
        "cost", "costs", "cpi", "pce", "consumer price", "personal consumption",
        "price level", "price stability", "deflationary", "deflation", "disinflation",
        "price pressures", "price pressure",
    ]

    # Specific terms taken from the indicator categories
    category_terms = [
        "core", "excluding", "trimmed mean", "supercore", "headline", "services", "goods",
        "housing", "energy", "food", "producer price", "ppi", "wages", "wage growth",
        "breakeven", "tips", "commodities", "oil prices", "raw materials",
    ]

    return sorted({term.lower() for term in general_terms + category_terms})

def split_into_sentences(text):
    """Split text into sentences using regex.

    A period that follows a single capital letter (the last letter of "U.S.") or a
    capital plus one lower-case letter ("Dr.", "Mr.") and is followed by a space is
    temporarily replaced by a placeholder so it is not treated as a sentence end.

    Args:
        text: The text to split.

    Returns:
        list[str]: the non-empty sentences in order. Empty or whitespace-only input
        yields an empty list, and surrounding whitespace never produces an empty
        "sentence" (which would otherwise inflate sentence counts).
    """
    text = text.strip()
    if not text:
        return []

    # Protect abbreviation periods before splitting
    text = re.sub(r'(\b[A-Z])\. ', r'\1@@ ', text)  # Handle abbreviations like U.S.
    text = re.sub(r'(\b[A-Z][a-z])\. ', r'\1@@ ', text)  # Handle Dr., Mr., etc.

    # Split on whitespace that follows sentence-ending punctuation
    pieces = re.split(r'(?<=[.!?])\s+', text)

    # Restore the protected periods and drop any empty fragments
    return [piece.replace('@@', '.') for piece in pieces if piece.strip()]

def classify_inflation_sentence_robust(sentence):
    """
    Count, per indicator category, how many pattern groups match a sentence.

    The sentence is lower-cased and searched with every regex in
    ``robust_inflation_indicators``. A pattern group adds one to its category as
    soon as any of its patterns matches, so each group counts at most once.

    Returns a dict mapping every category name to its count for the sentence.
    """
    lowered = sentence.lower()
    category_counts = {}

    for category, pattern_groups in robust_inflation_indicators.items():
        # One hit per pattern group that has at least one matching pattern
        category_counts[category] = sum(
            1
            for patterns in pattern_groups.values()
            if any(re.search(pattern, lowered) for pattern in patterns)
        )

    return category_counts

def get_indicator_counts(text):
    """Sum the per-sentence category counts over every sentence of the text.

    The text is split with ``split_into_sentences`` and each sentence is scored
    with ``classify_inflation_sentence_robust``.

    Returns a dict mapping every category of ``robust_inflation_indicators`` to
    its total count across the text.
    """
    # Start every category at zero so categories without matches are still reported
    totals = dict.fromkeys(robust_inflation_indicators, 0)

    for sentence in split_into_sentences(text):
        for indicator, hits in classify_inflation_sentence_robust(sentence).items():
            totals[indicator] += hits

    return totals

def contains_inflation_keyword(sentence, inflation_keywords):
    """Return True when the sentence contains any keyword as a whole word or phrase.

    Matching is case-insensitive (both sides are lower-cased) and each keyword is
    escaped and wrapped in word boundaries, so "price" does not match "reprice".
    """
    lowered = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(keyword.lower()) + r'\b', lowered)
        for keyword in inflation_keywords
    )

def analyze_fed_speeches():
    """Analyze the Fed speeches for inflation content.

    Reads every ``*_speeches.csv`` file in ``SPEECHES_DIR``; each file is expected
    to provide the columns ``id``, ``date``, ``title``, ``text``, ``role`` and
    ``years_served``. For every usable speech it records:

    * the percentage of sentences containing an inflation keyword, and
    * for each category of ``robust_inflation_indicators``, that category's share
      of all indicator hits found in the inflation sentences (shares sum to 100
      when at least one hit exists, otherwise all are 0).

    The combined table is written to ``fed_inflation_analysis.csv`` in
    ``OUTPUT_DIR``.

    Returns:
        pandas.DataFrame sorted by official name and date, or ``None`` when no
        speech files are found or no speech produced a result.
    """
    # Speeches shorter than this many characters are treated as not real speeches
    min_speech_chars = 300

    # Get the list of all CSV files in the speeches folder
    speech_files = glob.glob(os.path.join(SPEECHES_DIR, "*_speeches.csv"))

    if not speech_files:
        print(f"No speech files found in '{SPEECHES_DIR}' folder.")
        return None

    inflation_keywords = get_inflation_keywords()
    all_results = []

    # Process each official's speech file
    for speech_file in speech_files:
        # Extract official name from filename
        filename = os.path.basename(speech_file)
        official_name = filename.replace('_speeches.csv', '').replace('_', ' ')
        print(f"Processing speeches for {official_name}")

        try:
            official_df = pd.read_csv(speech_file)

            # Skip if no speeches in file
            if official_df.empty:
                print(f"  No speeches found for {official_name}")
                continue

            for _, row in official_df.iterrows():
                # Named speech_id rather than id to avoid shadowing the builtin
                speech_id = row['id']
                date = row['date']
                title = row['title']
                text = row['text']
                role = row['role']
                years_served = row['years_served']

                # Skip missing text. Non-string cells (NaN, numbers) are skipped here
                # too; otherwise len() below would raise and the except clause would
                # abandon every remaining row of this file.
                if not isinstance(text, str) or text == "" or text == "No text available":
                    continue

                # Skip very short texts (likely not real speeches)
                if len(text) < min_speech_chars:
                    continue

                sentences = split_into_sentences(text)

                # Identify inflation related sentences
                inflation_sentences = [
                    sentence for sentence in sentences
                    if contains_inflation_keyword(sentence, inflation_keywords)
                ]

                # Calculate percentage of speech focused on inflation
                total_sentences = len(sentences)
                inflation_sentences_count = len(inflation_sentences)

                if total_sentences > 0:
                    inflation_percentage = (inflation_sentences_count / total_sentences) * 100
                else:
                    inflation_percentage = 0

                # Get indicator counts for the inflation portion only
                inflation_text = " ".join(inflation_sentences)
                indicator_counts = get_indicator_counts(inflation_text)

                # Reweight the counts so the category percentages sum to 100%
                total_indicator_mentions = sum(indicator_counts.values())
                if total_indicator_mentions > 0:
                    indicator_percentages = {
                        indicator: (count / total_indicator_mentions) * 100
                        for indicator, count in indicator_counts.items()
                    }
                else:
                    indicator_percentages = {indicator: 0 for indicator in robust_inflation_indicators.keys()}

                result = {
                    'id': speech_id,
                    'Official_Name': official_name,
                    'Date': date,
                    'Title': title,
                    'Role': role,
                    'Years_Served': years_served,
                    'Inflation_Percentage': inflation_percentage,
                    'Inflation_Sentences': inflation_sentences_count,
                    'Total_Sentences': total_sentences,
                    'Speech_Length_Chars': len(text)
                }

                for indicator, percentage in indicator_percentages.items():
                    result[f'{indicator}_Percentage'] = percentage

                all_results.append(result)

        except Exception as e:
            # Best effort: report the problem and carry on with the next file
            print(f"Error processing {speech_file}: {e}")

    if not all_results:
        print("No results generated.")
        return None

    results_df = pd.DataFrame(all_results)

    # Convert date to datetime. errors='coerce' turns unparsable values into NaT
    # instead of raising, so the NaT values are counted to make the warning real.
    try:
        results_df['Date'] = pd.to_datetime(results_df['Date'], errors='coerce')
    except Exception as exc:
        print(f"Warning: Some dates could not be parsed ({exc})")
    else:
        unparsed_count = int(results_df['Date'].isna().sum())
        if unparsed_count > 0:
            print(f"Warning: Some dates could not be parsed ({unparsed_count} of {len(results_df)} entries)")

    # Sort by official name and date
    results_df = results_df.sort_values(['Official_Name', 'Date'])

    # Add verification column that percentages sum to 100%
    indicator_columns = [f'{indicator}_Percentage' for indicator in robust_inflation_indicators.keys()]
    results_df['Sum_Indicator_Percentages'] = results_df[indicator_columns].sum(axis=1)

    create_output_directory()

    # Save to CSV
    output_file = os.path.join(OUTPUT_DIR, 'fed_inflation_analysis.csv')
    results_df.to_csv(output_file, index=False)
    print(f"Analysis completed. {len(results_df)} speech entries analyzed and saved to {output_file}")

    return results_df

def generate_fed_summaries(results_df):
    """Build one summary row per Fed official and save the table as CSV.

    For each ``Official_Name`` group the summary holds the role and years served
    (taken from the group's first row), the date range of the speeches, speech
    counts, the average inflation percentage, the average speech length and, for
    every indicator category, the mean percentage over the speeches whose
    inflation percentage is above zero.

    The table is sorted by average inflation percentage (descending) and written
    to ``fed_inflation_summaries.csv`` in ``OUTPUT_DIR``.

    Returns the summary DataFrame, or ``None`` when there is nothing to summarize.
    """
    if results_df is None or results_df.empty:
        print("No results to summarize.")
        return

    summaries = []

    for official_name, group in results_df.groupby('Official_Name'):
        total_speeches = len(group)

        # Speeches with any inflation content; reused for every indicator average
        discusses_inflation = group['Inflation_Percentage'] > 0
        inflation_focused_speeches = sum(discusses_inflation)
        inflation_speeches = group[discusses_inflation]

        # Role and years served (should be consistent for each official)
        role_values = group['Role']
        role = "Unknown" if role_values.empty else role_values.iloc[0]
        years_values = group['Years_Served']
        years_served = "Unknown" if years_values.empty else years_values.iloc[0]

        # Date range of the speeches, ignoring unparsed dates
        valid_dates = group['Date'].dropna()
        if valid_dates.empty:
            date_range = "Unknown"
        else:
            earliest = valid_dates.min().strftime('%Y-%m-%d')
            latest = valid_dates.max().strftime('%Y-%m-%d')
            date_range = f"{earliest} to {latest}"

        # Average indicator percentages over speeches that discuss inflation
        indicator_averages = {}
        for indicator in robust_inflation_indicators.keys():
            if inflation_speeches.empty:
                indicator_averages[indicator] = 0
            else:
                indicator_averages[indicator] = inflation_speeches[f'{indicator}_Percentage'].mean()

        summary = {
            'Official_Name': official_name,
            'Role': role,
            'Years_Served': years_served,
            'Date_Range': date_range,
            'Total_Speeches': total_speeches,
            'Inflation_Focused_Speeches': inflation_focused_speeches,
            'Inflation_Focus_Rate': inflation_focused_speeches / total_speeches * 100 if total_speeches > 0 else 0,
            'Avg_Inflation_Percentage': group['Inflation_Percentage'].mean(),
            'Avg_Speech_Length_Chars': group['Speech_Length_Chars'].mean()
        }
        summary.update(
            (f'Avg_{indicator}_Percentage', avg) for indicator, avg in indicator_averages.items()
        )

        summaries.append(summary)

    # Highest average inflation share first
    summaries_df = pd.DataFrame(summaries).sort_values('Avg_Inflation_Percentage', ascending=False)

    output_file = os.path.join(OUTPUT_DIR, 'fed_inflation_summaries.csv')
    summaries_df.to_csv(output_file, index=False)
    print(f"Generated summaries for {len(summaries_df)} officials saved to {output_file}")

    return summaries_df

def filter_inflation_speeches(results_df, min_percentage=0):
    """Keep only the speech entries whose inflation percentage exceeds a threshold.

    Rows with ``Inflation_Percentage`` strictly greater than ``min_percentage``
    are kept, re-indexed from zero and written to ``inflation_content_speeches.csv``
    in ``OUTPUT_DIR``. Before/after counts are printed.

    Returns the filtered DataFrame, or ``None`` when there are no results to filter.
    """
    if results_df is None or results_df.empty:
        print("No results to filter.")
        return None

    print(f"\nFiltering speeches with inflation percentage > {min_percentage}%...")

    above_threshold = results_df['Inflation_Percentage'] > min_percentage
    filtered_df = results_df[above_threshold].copy().reset_index(drop=True)

    original_count = len(results_df)
    filtered_count = len(filtered_df)
    print(f"Before filtering: {original_count} speeches")
    print(f"After filtering: {filtered_count} speeches ({filtered_count/original_count*100:.1f}%)")

    # Save the filtered results
    output_file = os.path.join(OUTPUT_DIR, 'inflation_content_speeches.csv')
    filtered_df.to_csv(output_file, index=False)
    print(f"Filtered results saved to {output_file}")

    return filtered_df

def main():
    """Run the full Fed speech analysis and print overall statistics.

    Runs ``analyze_fed_speeches``, then builds the per-official summaries and the
    inflation-only subset, and finally prints aggregate figures. Prints a failure
    message instead when the analysis returns no results.
    """
    print("Analyzing Fed officials' speeches for inflation content...")
    print(f"Looking for speech files in: {SPEECHES_DIR}")

    results_df = analyze_fed_speeches()

    # Nothing to report when the analysis produced no results
    if results_df is None:
        print("Analysis failed. Please check that speech CSV files exist in the specified directory.")
        return

    # Per-official summaries and the inflation-only subset (both are saved to disk)
    summaries_df = generate_fed_summaries(results_df)
    filter_inflation_speeches(results_df, min_percentage=0)

    banner = "="*60
    print("\n" + banner)
    print("OVERALL STATISTICS")
    print(banner)
    print(f"Total speech entries analyzed: {len(results_df):,}")

    inflation_speeches = results_df[results_df['Inflation_Percentage'] > 0]
    print(f"Speeches with inflation content: {len(inflation_speeches):,} ({len(inflation_speeches)/len(results_df)*100:.1f}%)")

    print(f"Average percentage of speech about inflation: {results_df['Inflation_Percentage'].mean():.2f}%")

    # Top officials by inflation focus (summaries are already sorted descending)
    if summaries_df is not None:
        print(f"\nTop 5 Officials by Inflation Focus:")
        for _, official in summaries_df.head(5).iterrows():
            print(f"  {official['Official_Name']}: {official['Avg_Inflation_Percentage']:.1f}% (Role: {official['Role']})")

    # Average share of each indicator, over speeches with inflation content only
    if not inflation_speeches.empty:
        print(f"\nAverage Indicator Percentages (for speeches with inflation content):")
        for indicator in robust_inflation_indicators.keys():
            avg_percentage = inflation_speeches[f'{indicator}_Percentage'].mean()
            print(f"  {indicator}: {avg_percentage:.1f}%")

    print(f"\nAll analysis files saved to: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()

Analyzing Fed officials' speeches for inflation content...
Looking for speech files in: /content/drive/MyDrive/FedComs/Speeches
Processing speeches for Edward Gramlich
Processing speeches for Roger Ferguson
Processing speeches for Laurence Meyer
Processing speeches for Donald Kohn
Processing speeches for Ben Bernanke
Processing speeches for Janet Yellen
Processing speeches for Jerome Powell
Processing speeches for Randall Kroszner
Processing speeches for Kevin Warsh
Processing speeches for Frederic Mishkin
Processing speeches for Elizabeth Duke
Processing speeches for Daniel Tarullo
Processing speeches for Sarah Bloom Raskin
Processing speeches for Lael Brainard
Processing speeches for Stanley Fischer
Processing speeches for Jeremy Stein
Processing speeches for Michelle Bowman
Processing speeches for Richard Clarida
Processing speeches for Randal Quarles
Processing speeches for Christopher Waller
Processing speeches for Michael Barr
Processing speeches for Lisa Cook
Processing speeches