In [None]:
import os
import pandas as pd
import re
import numpy as np
import glob
from collections import Counter

# Keyword lists for each labor market indicator. Keys are the indicator names
# used to build the "<Indicator>_Percentage" result columns; values are the
# lowercase phrases that are counted as a mention of that indicator.
INDICATORS = {
    "Employment": [
        "employed", "employment", "employment-to-population",
        "employment to population", "employment/population", "e/p ratio",
        "employment-population ratio",
    ],
    "Unemployment": [
        "unemployment", "unemployed", "jobless", "joblessness", "u-3",
        "u3 rate",
    ],
    "Participation": [
        "participation", "marginally attached", "discouraged workers",
        "retirement", "labor force", "want a job",
        "labor force participation", "participation rate", "lfpr",
        "labor market participation",
    ],
    "Wages": [
        "pay", "eci", "labor cost", "labor costs", "wage", "wages",
        "wage growth", "wage inflation", "compensation", "earnings",
        "salary", "salaries",
    ],
    "Vacancies": [
        "vacancy", "vacancies", "job opening", "job openings", "jolts",
        "unfilled position", "open position", "vacant job", "vacancy rate",
        "job vacancy", "posted opening", "help wanted", "available job",
        "job posting", "position opening", "vacancy yield",
        "beveridge curve", "job advertisement", "job availability",
        "hiring difficulty",
    ],
    "Quits": [
        "quit", "quits", "resignation", "resignations",
        "voluntary separation", "turnover", "voluntary turnover",
        "job changing", "job switching", "job hopping", "job-to-job",
    ],
    "Layoffs": [
        "layoff", "layoffs", "involuntary separation", "dismissal",
        "dismissals", "discharged", "discharge", "downsizing", "redundancy",
        "workforce reduction", "job cuts", "reduction in force", "rif",
        "termination", "fired", "job loss", "permanent layoff",
        "temporary layoff", "furlough", "mass layoff",
        "involuntary job loss", "job destruction", "establishment closure",
        "job shedding",
    ],
    "Hiring": [
        "hiring", "job finding", "payroll", "nonfarm payroll", "job gain",
        "job growth", "employment gain", "employment growth",
        "job creation",
    ],
}

def get_labor_market_keywords():
    """Return every keyword that marks a sentence as labor-market related.

    The list starts with general labor-market terms and is followed by all
    keywords from ``INDICATORS``. Duplicates are removed (the first
    occurrence wins) so each keyword is searched for only once per sentence.

    Returns:
        list[str]: Unique keywords in first-seen order.
    """
    general_terms = [
        "labor market", "job market", "employment", "unemployment", "labor",
        "labor markets", "job markets", "hiring", "workers",
        "employment situation", "labor force", "job", "jobs",
    ]

    candidates = list(general_terms)
    for keywords in INDICATORS.values():
        candidates.extend(keywords)

    # dict.fromkeys keeps insertion order, giving an order-preserving de-dupe.
    return list(dict.fromkeys(candidates))

def split_into_sentences(text):
    """Split text into sentences using regex heuristics.

    A sentence boundary is whitespace that follows ``.``, ``!`` or ``?``.
    Periods that end a single capital letter (e.g. the last one in "U.S. ")
    or a capital followed by one lowercase letter (e.g. "Dr. ", "Mr. ") are
    temporarily masked so they do not trigger a split.

    Leading/trailing whitespace is ignored and empty fragments are dropped,
    so empty or whitespace-only input yields an empty list instead of a
    single blank "sentence".

    Note: the mask token is the literal "@@"; any "@@" already present in
    the input is turned into "." on output.

    Args:
        text: The text to split.

    Returns:
        list[str]: The non-empty sentences in their original order.
    """
    # Mask abbreviation periods so the boundary regex below skips them.
    masked = re.sub(r'(\b[A-Z])\. ', r'\1@@ ', text)  # e.g. the final period of "U.S. "
    masked = re.sub(r'(\b[A-Z][a-z])\. ', r'\1@@ ', masked)  # e.g. "Dr. ", "Mr. "

    # Strip first: trailing whitespace after the final period would
    # otherwise produce an empty last element.
    fragments = re.split(r'(?<=[.!?])\s+', masked.strip())

    # Restore the masked periods and discard empty fragments.
    return [fragment.replace('@@', '.') for fragment in fragments if fragment]

def get_indicator_counts(text):
    """Count keyword mentions for every indicator in ``INDICATORS``.

    Matching is case-insensitive and whole-word (regex ``\\b`` boundaries).
    Each keyword is counted independently, so a phrase that satisfies
    several keywords contributes to each of them.

    Args:
        text: The text to scan.

    Returns:
        dict[str, int]: Indicator name -> total keyword matches, in the
        same key order as ``INDICATORS``.
    """
    lowered = text.lower()
    tallies = {}

    for name, terms in INDICATORS.items():
        running_total = 0
        for term in terms:
            needle = term.lower()
            # Whole-word match so e.g. "employment" does not hit "unemployment".
            hits = len(re.findall(r'\b' + re.escape(needle) + r'\b', lowered))

            if needle == "employment":
                # Occurrences inside "maximum employment" are not counted as
                # mentions of the employment indicator.
                excluded = len(re.findall(r'\bmaximum employment\b', lowered))
                hits = max(0, hits - excluded)

            running_total += hits
        tallies[name] = running_total

    return tallies

def contains_labor_market_keyword(sentence, labor_keywords):
    """Report whether any keyword occurs in the sentence as a whole word.

    The comparison is case-insensitive on both the sentence and the keywords.

    Args:
        sentence: The sentence to inspect.
        labor_keywords: Iterable of keyword strings to look for.

    Returns:
        bool: True as soon as one keyword matches, otherwise False.
    """
    haystack = sentence.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack)
        for term in labor_keywords
    )

def _analyze_transcript_text(text, labor_keywords):
    """Compute the labor-market metrics for a single transcript text.

    Args:
        text: The transcript text (a non-empty string).
        labor_keywords: Keywords that mark a sentence as labor-market related.

    Returns:
        dict: 'Labor_Market_Percentage', 'Labor_Sentences',
        'Total_Sentences', followed by one '<Indicator>_Percentage' entry
        per indicator. Indicator percentages are shares of all indicator
        mentions found in the labor-market sentences (all 0 when there are
        no mentions).
    """
    sentences = split_into_sentences(text)
    labor_sentences = [
        sentence for sentence in sentences
        if contains_labor_market_keyword(sentence, labor_keywords)
    ]

    total_sentences = len(sentences)
    labor_sentences_count = len(labor_sentences)
    if total_sentences > 0:
        labor_market_percentage = (labor_sentences_count / total_sentences) * 100
    else:
        labor_market_percentage = 0

    # Indicators are counted only within the labor-market sentences.
    indicator_counts = get_indicator_counts(" ".join(labor_sentences))
    total_indicator_mentions = sum(indicator_counts.values())

    metrics = {
        'Labor_Market_Percentage': labor_market_percentage,
        'Labor_Sentences': labor_sentences_count,
        'Total_Sentences': total_sentences
    }
    for indicator, count in indicator_counts.items():
        if total_indicator_mentions > 0:
            # Reweighted so the indicator percentages sum to 100%.
            metrics[f'{indicator}_Percentage'] = (count / total_indicator_mentions) * 100
        else:
            metrics[f'{indicator}_Percentage'] = 0

    return metrics


def analyze_official_transcripts():
    """Analyze the transcripts of officials for labor market content.

    Reads every ``official_transcripts/*.csv`` file (one per official; the
    official's name is the file name without its extension, underscores
    replaced by spaces). Each file must provide 'Date' and 'Text' columns.
    Rows whose text is missing, not a string, or blank are skipped. A file
    that fails to load or process is reported and skipped; rows gathered
    from it before the failure are kept.

    The combined result is sorted by official and date, extended with a
    'Sum_Indicator_Percentages' check column, and written to
    'official_labor_market_analysis.csv'.

    Returns:
        pandas.DataFrame | None: One row per analyzed transcript entry, or
        None when no files were found or no rows were produced.
    """
    # Sorted so files are processed (and logged) in a deterministic order.
    transcript_files = sorted(glob.glob("official_transcripts/*.csv"))

    if not transcript_files:
        print("No transcript files found in 'official_transcripts' folder.")
        return None

    labor_keywords = get_labor_market_keywords()
    all_results = []

    for transcript_file in transcript_files:
        # splitext removes only the trailing extension, unlike str.replace,
        # which would also strip ".csv" from the middle of a name.
        file_stem = os.path.splitext(os.path.basename(transcript_file))[0]
        official_name = file_stem.replace('_', ' ')
        print(f"Processing transcripts for {official_name}")

        try:
            official_df = pd.read_csv(transcript_file)

            for _, row in official_df.iterrows():
                date = row['Date']
                text = row['Text']

                # Skip missing (NaN), non-string, or blank text so one bad
                # cell does not abort the remaining rows of this file.
                if not isinstance(text, str) or not text.strip():
                    continue

                result = {'Official_Name': official_name, 'Date': date}
                result.update(_analyze_transcript_text(text, labor_keywords))
                all_results.append(result)

        except Exception as e:
            # Best-effort per file: report the problem and continue with the
            # next official instead of aborting the whole run.
            print(f"Error processing {transcript_file}: {e}")

    if not all_results:
        print("No results generated.")
        return None

    results_df = pd.DataFrame(all_results)

    # Convert date to datetime so sorting is chronological.
    results_df['Date'] = pd.to_datetime(results_df['Date'])
    results_df = results_df.sort_values(['Official_Name', 'Date'])

    # Verification column: indicator percentages should sum to 100 (or 0).
    indicator_columns = [f'{indicator}_Percentage' for indicator in INDICATORS]
    results_df['Sum_Indicator_Percentages'] = results_df[indicator_columns].sum(axis=1)

    results_df.to_csv('official_labor_market_analysis.csv', index=False, encoding='utf-8')
    print(f"Analysis completed. {len(results_df)} transcript entries analyzed and saved to official_labor_market_analysis.csv")

    return results_df

def generate_official_summaries(results_df):
    """Build per-official summary statistics and save them to CSV.

    For each 'Official_Name' the summary holds the number of entries, the
    number of entries with a labor-market percentage above zero, the mean
    labor-market percentage over all entries, and the mean of every
    '<Indicator>_Percentage' column over the labor-focused entries only
    (0 when the official has none).

    The table is sorted by 'Avg_Labor_Market_Percentage' (descending) and
    written to 'official_labor_market_summaries.csv'.

    Args:
        results_df: Per-entry results with 'Official_Name',
            'Labor_Market_Percentage' and the indicator percentage columns.

    Returns:
        pandas.DataFrame | None: The summary table, or None when there is
        nothing to summarize.
    """
    if results_df is None or results_df.empty:
        print("No results to summarize.")
        return

    summaries = []

    for official_name, official_rows in results_df.groupby('Official_Name'):
        has_labor_content = official_rows['Labor_Market_Percentage'] > 0
        # Indicator averages only consider entries that discuss labor markets.
        labor_rows = official_rows[has_labor_content]

        summary = {
            'Official_Name': official_name,
            'Total_Entries': len(official_rows),
            'Labor_Focused_Entries': int(has_labor_content.sum()),
            'Avg_Labor_Market_Percentage': official_rows['Labor_Market_Percentage'].mean()
        }

        for indicator in INDICATORS:
            if labor_rows.empty:
                average = 0
            else:
                average = labor_rows[f'{indicator}_Percentage'].mean()
            summary[f'Avg_{indicator}_Percentage'] = average

        summaries.append(summary)

    # Officials who talk most about labor markets come first.
    summaries_df = pd.DataFrame(summaries).sort_values(
        'Avg_Labor_Market_Percentage', ascending=False
    )

    summaries_df.to_csv('official_labor_market_summaries.csv', index=False, encoding='utf-8')
    print(f"Generated summaries for {len(summaries_df)} officials saved to official_labor_market_summaries.csv")

    return summaries_df

def filter_labor_market_transcripts(results_df, min_percentage=0):
    """Keep only transcript entries that discuss labor markets and save them.

    An entry is kept when its 'Labor_Market_Percentage' is strictly greater
    than ``min_percentage``. The same filtered frame is both written to
    'labor_market_analysis.csv' and returned, so the file on disk always
    matches the return value.

    Args:
        results_df: Per-entry results with a 'Labor_Market_Percentage' column.
        min_percentage: Exclusive lower bound for an entry to be kept
            (default 0, i.e. any labor-market content at all).

    Returns:
        pandas.DataFrame | None: The filtered entries with a fresh 0..n-1
        index, or None when there is nothing to filter.
    """
    if results_df is None or results_df.empty:
        print("No results to filter.")
        return None

    print("\nFiltering out entries with no labor market content...")
    keep = results_df['Labor_Market_Percentage'] > min_percentage
    filtered_df = results_df[keep].reset_index(drop=True)
    print(f"After filtering: {len(filtered_df)} entries with labor market content retained")

    # Save exactly what is returned (the threshold is applied only once).
    filtered_df.to_csv('labor_market_analysis.csv', index=False)

    return filtered_df

def main():
    """Run the full pipeline and print overall statistics.

    Analyzes the transcripts, writes the per-official summaries and the
    filtered labor-market entries, then prints the totals and the average
    indicator percentages. Nothing further happens when the analysis
    produced no results.
    """
    print("Analyzing official transcripts for labor market content...")

    results_df = analyze_official_transcripts()
    if results_df is None:
        return

    # Both helpers write their own CSV files; their return values are unused.
    generate_official_summaries(results_df)
    filter_labor_market_transcripts(results_df, min_percentage=0)

    print("\nOverall Statistics:")
    print(f"Total transcript entries analyzed: {len(results_df)}")

    with_labor = results_df[results_df['Labor_Market_Percentage'] > 0]
    labor_share = len(with_labor) / len(results_df) * 100
    print(f"Entries with labor market content: {len(with_labor)} ({labor_share:.2f}%)")

    overall_mean = results_df['Labor_Market_Percentage'].mean()
    print(f"Average percentage of transcript about labor markets: {overall_mean:.2f}%")

    if with_labor.empty:
        return

    # Indicator averages are taken over labor-focused entries only.
    print("\nAverage Indicator Percentages (for entries with labor market content):")
    for indicator in INDICATORS:
        column = f'{indicator}_Percentage'
        indicator_mean = with_labor[column].mean()
        print(f"Average for {indicator}: {indicator_mean:.2f}%")

# Run the full analysis only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Analyzing official transcripts for labor market content...
Processing transcripts for James Bullard
Processing transcripts for Charles L Evans
Processing transcripts for Richard W Fisher
Processing transcripts for Dennis P Lockhart
Processing transcripts for Jeffrey M Lacker
Processing transcripts for Sandra Pianalto
Processing transcripts for Thomas M Hoenig
Processing transcripts for William Poole
Processing transcripts for Gary H Stern
Processing transcripts for Charles I Plosser
Processing transcripts for Eric S Rosengren
Processing transcripts for William C Dudley
Processing transcripts for Narayana Kocherlakota
Processing transcripts for Janet Yellen
Processing transcripts for John C Williams
Processing transcripts for Esther L George
Processing transcripts for Jerome Powell
Processing transcripts for Loretta Mester
Processing transcripts for Patrick T Harker
Processing transcripts for Robert Steven Kaplan
Processing transcripts for Neel Kashkari
Processing transcripts for Raphae