# In [10]:
"""
Fed Communications Analysis
Produces summary statistics, LaTeX tables, and time series graphs
"""

from google.colab import drive
drive.mount("/content/drive", force_remount=True)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

# Input/output locations on the mounted Google Drive
BASE_DIR = '/content/drive/MyDrive/FedComs'
OUTPUT_DIR = BASE_DIR + '/SummaryStats'
DATA_FILE = OUTPUT_DIR + '/Combined_Dataset.csv'

# Toggle for including variance in tables (set to False to hide variance)
INCLUDE_VARIANCE = True

# Variables of interest: sentence counts and the labor/inflation split
CORE_VARS = [
    'total_sentences',
    'sentences_on_labor',
    'sentences_on_inflation',
    'labor_share_of_labor_inflation_sentences',
]

# Emphasis columns are named '<domain>_emphasis_<Topic>'; the topic order
# below fixes the column order used everywhere else in the script.
LABOR_EMPHASIS_VARS = [
    f'labor_emphasis_{topic}'
    for topic in (
        'Employment',
        'Unemployment',
        'Participation',
        'Wages',
        'Vacancies',
        'Quits',
        'Layoffs',
        'Hiring',
    )
]

INFLATION_EMPHASIS_VARS = [
    f'inflation_emphasis_{topic}'
    for topic in (
        'Commodity_Prices',
        'Core',
        'Core_CPI',
        'Core_PCE',
        'Energy',
        'Food',
        'Goods',
        'Headline',
        'Headline_CPI',
        'Headline_PCE',
        'Housing',
        'Inflation_Expectations',
        'PPI',
        'Services',
        'Wage_Inflation',
    )
]

# Labor columns first, then inflation columns
ALL_EMPHASIS_VARS = [*LABOR_EMPHASIS_VARS, *INFLATION_EMPHASIS_VARS]

# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def clean_numeric(df, cols):
    """
    Coerce the given columns to numeric, mapping unparseable cells to 0.

    Any value that cannot be parsed as a number (the '.' missing-value
    marker, other text, None/NaN) becomes 0. Columns listed in `cols` that
    are not present in `df` are skipped silently.

    Parameters:
    -----------
    df : DataFrame
        Frame to clean. It is modified IN PLACE.
    cols : iterable of str
        Names of the columns to convert.

    Returns:
    --------
    DataFrame : the same `df` object, for call chaining
    """
    for col in cols:
        if col in df.columns:
            # errors='coerce' already turns '.' (and any other non-numeric
            # text) into NaN, which fillna(0) then maps to 0. A separate
            # .replace('.', 0) is therefore redundant, and mixing ints into
            # a string column relies on object-dtype behaviour that pandas
            # has been deprecating.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    return df

def compute_shares(df):
    """
    Add 'labor_share' and 'inflation_share' columns (percent of total sentences).

    Works on a copy, so the caller's frame is left untouched. Rows whose
    'total_sentences' is not positive get a share of 0.
    """
    out = df.copy()
    has_sentences = out['total_sentences'] > 0

    share_sources = (
        ('labor_share', 'sentences_on_labor'),
        ('inflation_share', 'sentences_on_inflation'),
    )
    for share_col, count_col in share_sources:
        percent = 100 * out[count_col] / out['total_sentences']
        # The mask discards the inf/NaN produced where the total is zero
        out[share_col] = np.where(has_sentences, percent, 0)

    return out

def clr_transform(composition):
    """
    Centered Log-Ratio (CLR) transform of compositional data.

    Each row is divided by its own geometric mean and the natural log is
    taken, so every transformed row sums to zero.

    Parameters:
    -----------
    composition : array-like, shape (n_samples, n_components)
        Compositional data where rows sum to 1 (or constant). Must be 2-D:
        the geometric mean is taken along axis 1.

    Returns:
    --------
    clr_data : array, shape (n_samples, n_components)
        CLR-transformed data
    """
    parts = np.array(composition)

    # log(0) is undefined, so exact zeros are nudged to a tiny positive value
    parts = np.where(parts == 0, 1e-10, parts)

    # Row-wise geometric mean = exp(mean(log(x)))
    log_parts = np.log(parts)
    row_geo_mean = np.exp(log_parts.mean(axis=1, keepdims=True))

    return np.log(parts / row_geo_mean)

def compute_clr_variance(df, emphasis_vars):
    """
    Per-component variance of the CLR-transformed emphasis vectors.

    Parameters:
    -----------
    df : DataFrame
        Data containing the emphasis variables
    emphasis_vars : list
        Names of the emphasis columns forming the composition

    Returns:
    --------
    variances : array, shape (n_components,)
        Population variance (NumPy default, ddof=0) of each CLR component
    clr_data : array, shape (n_samples, n_components)
        The CLR-transformed data the variances were computed from
    """
    clr_data = clr_transform(df[emphasis_vars].values)
    return clr_data.var(axis=0), clr_data

def compute_euclidean_distance_variance(df, emphasis_vars, mean_vector=None):
    """
    Variance of Euclidean distances (in CLR space) from a reference vector.

    Parameters:
    -----------
    df : DataFrame
        Data containing emphasis variables
    emphasis_vars : list
        List of emphasis variable names
    mean_vector : array, optional
        Reference vector, in CLR space, to measure distances from.
        If None, the sample mean of the CLR-transformed rows is used.

    Returns:
    --------
    variance : float
        Population variance (NumPy default, ddof=0) of the distances
    distances : array, shape (n_samples,)
        Distance of each observation from the reference vector
    mean_vector : array
        The reference vector actually used (the argument, or the sample mean)
    """
    clr_data = clr_transform(df[emphasis_vars].values)

    # Fall back to the sample centroid when no reference vector is supplied
    center = np.mean(clr_data, axis=0) if mean_vector is None else mean_vector

    offsets = clr_data - center
    distances = np.sqrt((offsets ** 2).sum(axis=1))

    return np.var(distances), distances, center

def format_mean_var(mean_val, var_val, is_percentage=True, decimals_mean=1, decimals_var=2):
    """
    Format a mean, optionally followed by its variance in parentheses.

    The variance is appended only when the module-level INCLUDE_VARIANCE
    toggle is on. `is_percentage` is accepted for call compatibility but
    does not affect the output: both settings format identically.
    """
    text = f"{mean_val:.{decimals_mean}f}"
    if INCLUDE_VARIANCE:
        text = f"{text} ({var_val:.{decimals_var}f})"
    return text

def clean_variable_name(var):
    """Return the short display label used for a variable in the tables"""
    # Special case for labor share of labor inflation sentences
    if var == 'labor_share_of_labor_inflation_sentences':
        return 'Lab. share, lab + infl'

    # Order matters: the emphasis prefixes are shortened before the shares
    abbreviations = (
        ('labor emphasis', 'L:'),
        ('inflation emphasis', 'I:'),
        ('labor share', 'Lab. share'),
        ('inflation share', 'Infl. share'),
    )
    label = var.replace('_', ' ')
    for long_form, short_form in abbreviations:
        label = label.replace(long_form, short_form)
    return label

def compute_year_weighted_mean(data, var):
    """
    Mean of the yearly means of `var`.

    Observations are first averaged within each year and the yearly
    averages are then averaged, so each year carries equal weight no matter
    how many observations it has.

    Parameters:
    -----------
    data : DataFrame
        Data to compute mean from; must contain a 'year' column
    var : str
        Variable name to compute mean for

    Returns:
    --------
    float : Year-weighted mean, or NaN when there are no rows with a year
    """
    if not len(data):
        return np.nan

    # Rows without a year cannot be assigned to any yearly average
    dated = data.dropna(subset=['year'])
    if not len(dated):
        return np.nan

    per_year = dated.groupby('year')[var].mean()
    return per_year.mean()

def compute_year_weighted_var(data, var):
    """
    Variance of the yearly means of `var`.

    Observations are averaged within each year; the result is the sample
    variance (pandas default, ddof=1) of those yearly averages.

    Parameters:
    -----------
    data : DataFrame
        Data to compute variance from; must contain a 'year' column
    var : str
        Variable name to compute variance for

    Returns:
    --------
    float : Year-weighted variance, or NaN when fewer than two years of
        data are available
    """
    if not len(data):
        return np.nan

    # Rows without a year cannot be assigned to any yearly average
    dated = data.dropna(subset=['year'])
    if not len(dated):
        return np.nan

    per_year = dated.groupby('year')[var].mean()

    # A variance needs at least two yearly means
    return per_year.var() if len(per_year) >= 2 else np.nan

# ============================================================================
# LOAD AND PREPARE DATA
# ============================================================================

print("\n" + "="*70)
print("LOADING DATA")
print("="*70)

df = pd.read_csv(DATA_FILE)
print(f"Loaded {len(df)} records")

# Parse the date column; unparseable dates and calendar fields become NaT/NaN
df['date'] = pd.to_datetime(df['date'], errors='coerce')
for calendar_col in ('year', 'month'):
    df[calendar_col] = pd.to_numeric(df[calendar_col], errors='coerce')

# Calendar quarter of each record, derived from the parsed date
df['quarter'] = df['date'].dt.to_period('Q')

# Coerce all count/emphasis columns to numbers, then add the share columns
all_numeric_cols = CORE_VARS + ALL_EMPHASIS_VARS
df = compute_shares(clean_numeric(df, all_numeric_cols))

# Convert labor_share_of_labor_inflation_sentences to percentage.
# The column is part of CORE_VARS, so clean_numeric has already made it
# numeric and NaN-free; only the rescaling is left to do.
df['labor_share_of_labor_inflation_sentences'] = (
    df['labor_share_of_labor_inflation_sentences'] * 100
)

print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")
print("Records by communication type:")
print(df['com_type'].value_counts())

# ============================================================================
# DROP ROWS WITH NO LABOR OR INFLATION MENTIONS
# ============================================================================

print("\n" + "="*70)
print("FILTERING DATA: DROPPING ROWS WITH NO LABOR/INFLATION MENTIONS")
print("="*70)

# Row-wise totals identify all-zero emphasis vectors; a boolean mask is used
# so no temporary columns have to be added to (and later dropped from) df
labor_emphasis_total = df[LABOR_EMPHASIS_VARS].sum(axis=1)
inflation_emphasis_total = df[INFLATION_EMPHASIS_VARS].sum(axis=1)
has_mentions = (labor_emphasis_total > 0) | (inflation_emphasis_total > 0)

# Keep rows that have EITHER labor OR inflation mentions (or both)
df_original_len = len(df)
df = df[has_mentions].copy()

print(f"\nOriginal rows: {df_original_len}")
print(f"Rows dropped: {df_original_len - len(df)}")
print(f"Remaining rows: {len(df)}")
print("\nRemaining records by communication type:")
print(df['com_type'].value_counts())

# ============================================================================
# TABLE 1: AVERAGES ACROSS COMMUNICATION TYPES
# ============================================================================

# ----------------------------------------------------------------------------
# Shared helpers for Tables 1-5
# ----------------------------------------------------------------------------
# Every table follows the same recipe: summarise each group with year-weighted
# statistics, render a transposed LaTeX table (one row per variable, one
# column per group) and write it to OUTPUT_DIR.

# Variables to summarize
summary_vars = ['total_sentences', 'labor_share', 'inflation_share',
                'labor_share_of_labor_inflation_sentences'] + ALL_EMPHASIS_VARS


def _banner(title):
    """Print a console banner announcing a table section."""
    print("\n" + "="*70)
    print(title)
    print("="*70)


def _summarize_group(label, data, variables):
    """Return one results row: '<var>_mean' and '<var>_var' for each variable."""
    row = {'Group': label}
    for name in variables:
        row[f'{name}_mean'] = compute_year_weighted_mean(data, name)
        row[f'{name}_var'] = compute_year_weighted_var(data, name)
    return row


def _format_cell(value, decimals):
    """Format one table cell; a missing statistic is rendered as '---'."""
    if pd.isna(value):
        return "---"
    return f"{value:.{decimals}f}"


def _render_latex_table(results_df, variables, stat, caption, label, note=None):
    """
    Render a transposed LaTeX table of one statistic.

    Parameters:
    -----------
    results_df : DataFrame
        One row per group with a 'Group' column and '<var>_<stat>' columns.
        An empty frame yields a table with no group columns.
    variables : list
        Variables to show, one table row each
    stat : str
        'mean' or 'var' - selects the '<var>_<stat>' columns
    caption, label : str
        LaTeX caption text and label
    note : str, optional
        Text placed in a minipage below the tabular

    Returns:
    --------
    str : Complete LaTeX table environment
    """
    records = results_df.to_dict('records')

    parts = [
        "\\begin{table}[htbp]\n\\centering\n\\small\n",
        f"\\caption{{{caption}}}\n",
        f"\\label{{{label}}}\n",
        "\\begin{tabular}{l" + "r" * len(records) + "}\n",
        "\\hline\\hline\n",
        "Variable" + "".join(f" & {rec['Group']}" for rec in records) + " \\\\\n",
        "\\hline\n",
    ]

    for name in variables:
        # Emphasis variables are displayed in percent: means are multiplied
        # by 100, so variances must be multiplied by 100**2 to be in the same
        # units (unscaled they nearly all round to 0.00).
        is_emphasis = 'emphasis' in name
        if stat == 'mean':
            scale = 100 if is_emphasis else 1
            decimals = 0 if name == 'total_sentences' else 1
        else:
            scale = 100 ** 2 if is_emphasis else 1
            decimals = 2

        cells = "".join(
            " & " + _format_cell(rec[f'{name}_{stat}'] * scale, decimals)
            for rec in records
        )
        parts.append(clean_variable_name(name) + cells + " \\\\\n")

    parts.append("\\hline\\hline\n")
    parts.append("\\end{tabular}\n")
    if note is not None:
        parts.append("\\begin{minipage}{\\textwidth}\n")
        parts.append("\\small\n")
        parts.append(f"{note}\n")
        parts.append("\\end{minipage}\n")
    parts.append("\\end{table}\n")
    return "".join(parts)


def _save_text(filename, text):
    """Write `text` to OUTPUT_DIR/filename."""
    with open(f'{OUTPUT_DIR}/{filename}', 'w', encoding='utf-8') as f:
        f.write(text)


def _emit_tables(table_id, stem, caption_tail, results_df,
                 var_results_df=None, note_means=None, note_var=None):
    """
    Render and save the means table and, if enabled, the variances table.

    Files are named 'table<table_id>_<stem>_{means,var}.tex', labels are
    'tab:<stem>_{means,var}' and captions are 'Means'/'Variances' followed
    by `caption_tail`. `var_results_df` overrides the frame used for the
    variances table (Table 4 adds an extra column there).

    Returns (means_tex, var_tex); var_tex is None when INCLUDE_VARIANCE is off.
    """
    means_tex = _render_latex_table(
        results_df, summary_vars, 'mean',
        caption=f"Means{caption_tail}",
        label=f"tab:{stem}_means",
        note=note_means,
    )
    _save_text(f'table{table_id}_{stem}_means.tex', means_tex)

    var_tex = None
    if INCLUDE_VARIANCE:
        var_tex = _render_latex_table(
            results_df if var_results_df is None else var_results_df,
            summary_vars, 'var',
            caption=f"Variances{caption_tail}",
            label=f"tab:{stem}_var",
            note=note_var,
        )
        _save_text(f'table{table_id}_{stem}_var.tex', var_tex)

    return means_tex, var_tex


# ============================================================================
# TABLE 1: AVERAGES ACROSS COMMUNICATION TYPES
# ============================================================================

_banner("TABLE 1: AVERAGES ACROSS COMMUNICATION TYPES (YEAR-WEIGHTED)")

results_1 = []

# Process each communication type; types with no records are left out
for com_type in ['statements', 'minutes', 'transcripts', 'speeches', 'presscon']:
    subset = df[df['com_type'] == com_type]
    if len(subset) == 0:
        continue

    is_press = com_type == 'presscon'
    display_name = 'Press' if is_press else com_type.capitalize()
    results_1.append(_summarize_group(display_name, subset, summary_vars))

    # For press conferences, break down by chair vs other (non-chair)
    if is_press:
        is_chair = subset['role'] == 'Chair'
        for part_name, part in (('Press: Chair', subset[is_chair]),
                                ('Press: Other', subset[~is_chair])):
            if len(part) > 0:
                results_1.append(_summarize_group(part_name, part, summary_vars))

results_1_df = pd.DataFrame(results_1)

latex_1_means, latex_1_var = _emit_tables(
    '1', 'comm_types', " Across Communication Types (Year-Weighted)",
    results_1_df,
    note_means="Note: Means computed by averaging within years, then across years.",
    note_var="Note: Variance of yearly means.",
)

print("✓ Table 1 saved (year-weighted means and variances)")

# ============================================================================
# TABLE 1A: AVERAGES 2006-2019 (EXCLUDING PRESS CONFERENCES, YEAR-WEIGHTED)
# ============================================================================

_banner("TABLE 1A: AVERAGES 2006-2019 (EXCLUDING PRESS CONFERENCES, YEAR-WEIGHTED)")

# Filter data
df_1a = df[(df['year'] >= 2006) & (df['year'] <= 2019) & (df['com_type'] != 'presscon')]

results_1a = []
for com_type in ['statements', 'minutes', 'transcripts', 'speeches']:
    subset = df_1a[df_1a['com_type'] == com_type]
    if len(subset) == 0:
        continue
    results_1a.append(_summarize_group(com_type.capitalize(), subset, summary_vars))

results_1a_df = pd.DataFrame(results_1a)

latex_1a_means, latex_1a_var = _emit_tables(
    '1a', 'comm_types_2006_2019',
    " Across Communication Types (2006-2019, Excluding Press Conferences, Year-Weighted)",
    results_1a_df,
)

print("✓ Table 1a saved (means and variances)")

# ============================================================================
# TABLE 2: PUBLIC STATEMENTS VS PRIVATE (YEAR-WEIGHTED)
# ============================================================================

_banner("TABLE 2: PUBLIC STATEMENTS VS PRIVATE (YEAR-WEIGHTED)")

# Official public: statements + minutes
official_public = df[df['com_type'].isin(['statements', 'minutes'])]
# Individual public: press conferences + speeches
individual_public = df[df['com_type'].isin(['presscon', 'speeches'])]
# Private: transcripts
private = df[df['com_type'] == 'transcripts']

results_2 = [
    _summarize_group('Official Public', official_public, summary_vars),
    _summarize_group('Individual Public', individual_public, summary_vars),
    _summarize_group('Private', private, summary_vars),
]
results_2_df = pd.DataFrame(results_2)

latex_2_means, latex_2_var = _emit_tables(
    '2', 'public_private', ": Public Statements vs Private (Year-Weighted)",
    results_2_df,
)

print("✓ Table 2 saved (means and variances)")

# ============================================================================
# TABLE 3: CHAIR PUBLIC VS PRIVATE (YEAR-WEIGHTED)
# ============================================================================

_banner("TABLE 3: CHAIR PUBLIC VS PRIVATE (YEAR-WEIGHTED)")

# Chair in transcripts
chair_transcripts = df[(df['role'] == 'Chair') & (df['com_type'] == 'transcripts')]
# Chair in speeches and press conferences
chair_public = df[(df['role'] == 'Chair') & (df['com_type'].isin(['speeches', 'presscon']))]

results_3 = [
    _summarize_group('Chair Private', chair_transcripts, summary_vars),
    _summarize_group('Chair Public', chair_public, summary_vars),
]
results_3_df = pd.DataFrame(results_3)

latex_3_means, latex_3_var = _emit_tables(
    '3', 'chair_public_private',
    ": Chair Public vs Private Communications (Year-Weighted)",
    results_3_df,
)

print("✓ Table 3 saved (means and variances)")

# ============================================================================
# TABLE 4: REGIONAL BANKS VS GOVERNORS
# ============================================================================

_banner("TABLE 4: REGIONAL BANKS VS GOVERNORS (TRANSCRIPTS ONLY, YEAR-WEIGHTED)")

transcripts = df[df['com_type'] == 'transcripts']
governors = transcripts[transcripts['role'] == 'Governor']
regional = transcripts[transcripts['role'] == 'Regional President']

results_4 = [
    _summarize_group('Governors', governors, summary_vars),
    _summarize_group('Regional Presidents', regional, summary_vars),
]
results_4_df = pd.DataFrame(results_4)

# The variances table gets a third column: variance of bank-level means
results_4_var_df = None
if INCLUDE_VARIANCE:
    # Banks with a known name; missing (NaN) and '.' institutions are excluded
    regional_banks = [rb for rb in regional['institution'].dropna().unique()
                      if rb != '.']
    bank_subsets = [regional[regional['institution'] == bank]
                    for bank in regional_banks]

    # The group variances are the ones already computed for the means table
    results_4_var = [
        {key: value for key, value in row.items()
         if key == 'Group' or key.endswith('_var')}
        for row in results_4
    ]

    row_var = {'Group': 'Var of Bank Means'}
    for var in summary_vars:
        bank_means = [compute_year_weighted_mean(bank_subset, var)
                      for bank_subset in bank_subsets]
        # NOTE(review): np.var is the population variance (ddof=0), whereas
        # the other two columns are sample variances (ddof=1) - confirm that
        # the difference is intended.
        row_var[f'{var}_var'] = np.var(bank_means) if bank_means else np.nan
    results_4_var.append(row_var)

    results_4_var_df = pd.DataFrame(results_4_var)

latex_4_means, latex_4_var = _emit_tables(
    '4', 'regional_governors',
    ": Regional Banks vs Governors (Transcripts Only, Year-Weighted)",
    results_4_df,
    var_results_df=results_4_var_df,
    note_var="Note: Last column shows variance of bank-level means.",
)

print("✓ Table 4 saved (means and variances)")

# ============================================================================
# TABLE 5: NOMINATING PRESIDENT
# ============================================================================

_banner("TABLE 5: GOVERNORS BY NOMINATING PRESIDENT (YEAR-WEIGHTED)")

# Governors only, speeches and transcripts
gov_data = df[(df['role'] == 'Governor') &
              (df['com_type'].isin(['speeches', 'transcripts'])) &
              (df['president'] != '.')]

# dropna(): a missing president passes the "!= '.'" filter above, and
# sorted() cannot compare NaN (a float) with the name strings
presidents = sorted(gov_data['president'].dropna().unique())

results_5 = [
    _summarize_group(president, gov_data[gov_data['president'] == president],
                     summary_vars)
    for president in presidents
]
results_5_df = pd.DataFrame(results_5)

latex_5_means, latex_5_var = _emit_tables(
    '5', 'nominating_president',
    ": Governors by Nominating President (Speeches and Transcripts, Year-Weighted)",
    results_5_df,
)

print("✓ Table 5 saved (means and variances)")

# ============================================================================
# TABLE 6: REGIONAL BANKS
# ============================================================================

print("\n" + "="*70)
print("TABLE 6: REGIONAL BANK PRESIDENTS (TRANSCRIPTS, YEAR-WEIGHTED)")
print("="*70)

# Regional presidents in transcripts.  Rows whose institution is '.'
# (apparently the dataset's missing-value code) are excluded; NaN is excluded
# too, since it would break both the sorted() call and the str.replace()
# abbreviation step below.
regional_transcripts = df[(df['com_type'] == 'transcripts') &
                          (df['role'] == 'Regional President') &
                          (df['institution'] != '.') &
                          (df['institution'].notna())]

banks = sorted(regional_transcripts['institution'].unique())

# One record per bank: year-weighted mean and variance of every summary
# variable, stored under '<var>_mean' and '<var>_var'.
results_6 = []
for bank in banks:
    subset = regional_transcripts[regional_transcripts['institution'] == bank]

    row = {'Group': bank}
    for var in summary_vars:
        row[f'{var}_mean'] = compute_year_weighted_mean(subset, var)
        row[f'{var}_var'] = compute_year_weighted_var(subset, var)
    results_6.append(row)

results_6_df = pd.DataFrame(results_6)

# Header (abbreviated bank names to fit the table width), shared by both tables
header_6 = "Variable"
for group in results_6_df['Group']:
    abbrev = group
    for long_name, short_name in (('St. Louis', 'STL'),
                                  ('Kansas City', 'KC'),
                                  ('San Francisco', 'SF')):
        abbrev = abbrev.replace(long_name, short_name)
    header_6 += f" & {abbrev}"
header_6 += " \\\\\n"


def _assemble_table_6(caption, label, body_rows):
    """Wrap pre-formatted body rows in the Table 6 LaTeX table environment."""
    return "".join([
        "\\begin{table}[htbp]\n\\centering\n\\tiny\n",
        f"\\caption{{{caption}}}\n",
        f"\\label{{{label}}}\n",
        "\\begin{tabular}{l" + "r" * len(results_6_df) + "}\n",
        "\\hline\\hline\n",
        header_6,
        "\\hline\n",
        *body_rows,
        "\\hline\\hline\n",
        "\\end{tabular}\n",
        "\\end{table}\n",
    ])


# Create transposed LaTeX table for MEANS (variables as rows, banks as columns)
mean_rows_6 = []
for var in summary_vars:
    # Emphasis variables are multiplied by 100 for display
    scale = 100 if 'emphasis' in var else 1
    # Only the raw sentence count is printed without a decimal place
    decimals = 0 if var == 'total_sentences' else 1
    cells = "".join(
        f" & {mean_val * scale:.{decimals}f}"
        for mean_val in results_6_df[f'{var}_mean']
    )
    mean_rows_6.append(f"{clean_variable_name(var)}{cells} \\\\\n")

latex_6_means = _assemble_table_6(
    "Means: Regional Bank Presidents (Transcripts Only, Year-Weighted)",
    "tab:regional_banks_means",
    mean_rows_6,
)

# Create transposed LaTeX table for VARIANCES
# NOTE(review): emphasis means above are shown x100 while variances are printed
# unscaled with two decimals -- confirm that the differing units are intended.
if INCLUDE_VARIANCE:
    var_rows_6 = []
    for var in summary_vars:
        cells = "".join(
            f" & {var_val:.2f}" for var_val in results_6_df[f'{var}_var']
        )
        var_rows_6.append(f"{clean_variable_name(var)}{cells} \\\\\n")

    latex_6_var = _assemble_table_6(
        "Variances: Regional Bank Presidents (Transcripts Only, Year-Weighted)",
        "tab:regional_banks_var",
        var_rows_6,
    )

# Save tables
with open(f'{OUTPUT_DIR}/table6_regional_banks_means.tex', 'w') as f:
    f.write(latex_6_means)

if INCLUDE_VARIANCE:
    with open(f'{OUTPUT_DIR}/table6_regional_banks_var.tex', 'w') as f:
        f.write(latex_6_var)

print("✓ Table 6 saved (means and variances)")

# ============================================================================
# FIGURE 7: TIME SERIES BY YEAR - TOTAL SENTENCES
# ============================================================================

print("\n" + "="*70)
print("FIGURE 7: TIME SERIES GRAPHS")
print("="*70)

# Annual sentence totals for each communication type
count_columns = ['total_sentences', 'sentences_on_labor', 'sentences_on_inflation']
yearly_data = (
    df.groupby(['year', 'com_type'])
      .agg({column: 'sum' for column in count_columns})
      .reset_index()
)

# Topic shares in percent of all sentences; a year/type cell without any
# sentences gets a share of 0 rather than a division result.
has_sentences = yearly_data['total_sentences'] > 0
for share_column, count_column in (('labor_share', 'sentences_on_labor'),
                                   ('inflation_share', 'sentences_on_inflation')):
    yearly_data[share_column] = np.where(
        has_sentences,
        100 * yearly_data[count_column] / yearly_data['total_sentences'],
        0
    )

# Set up plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Panels 7a-7c share one layout and differ only in the plotted column, the
# labels, the output file and the confirmation message.
figure_7_panels = (
    ('total_sentences', 'Total Sentences',
     'Total Sentences by Communication Type (Annual)',
     'figure7a_total_sentences.png', "✓ Figure 7a: Total sentences saved"),
    ('labor_share', 'Labor Share (%)',
     'Labor Share of Sentences by Communication Type (Annual)',
     'figure7b_labor_share.png', "✓ Figure 7b: Labor share saved"),
    ('inflation_share', 'Inflation Share (%)',
     'Inflation Share of Sentences by Communication Type (Annual)',
     'figure7c_inflation_share.png', "✓ Figure 7c: Inflation share saved"),
)

for column, y_label, title, png_name, done_message in figure_7_panels:
    fig, ax = plt.subplots(figsize=(12, 6))

    # One line per communication type, skipping types absent from the data
    for com_type in ('statements', 'minutes', 'transcripts', 'speeches', 'presscon'):
        subset = yearly_data[yearly_data['com_type'] == com_type]
        if len(subset) > 0:
            ax.plot(subset['year'], subset[column],
                    marker='o', linewidth=2, label=com_type.capitalize())

    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best', frameon=True)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{OUTPUT_DIR}/{png_name}', dpi=300, bbox_inches='tight')
    plt.close()

    print(done_message)

# ============================================================================
# FIGURE 8: QUARTERLY VARIANCE OF EUCLIDEAN DISTANCES
# ============================================================================

print("\n" + "="*70)
print("FIGURE 8: QUARTERLY VARIANCE OF EUCLIDEAN DISTANCES")
print("="*70)

# Function to compute quarterly variance for a specific dataset
def compute_quarterly_variance(data, emphasis_vars, var_type='labor'):
    """Compute the variance of Euclidean distances separately for each quarter.

    Args:
        data: DataFrame with a 'quarter' column plus the emphasis columns.
        emphasis_vars: Column names forwarded to
            compute_euclidean_distance_variance for every quarter.
        var_type: Accepted for call-site compatibility; not used here.

    Returns:
        DataFrame with columns 'quarter', 'variance' and 'n_obs', one row per
        quarter that has at least two observations, in ascending quarter
        order.  The three columns are present even when no quarter qualifies,
        so downstream code can always access result['variance'].
    """
    quarterly_variance = []

    for quarter in sorted(data['quarter'].dropna().unique()):
        quarter_data = data[data['quarter'] == quarter]

        # Need at least 2 observations
        if len(quarter_data) <= 1:
            continue

        # Only the first element of the helper's 3-tuple is used
        variance, _, _ = compute_euclidean_distance_variance(
            quarter_data, emphasis_vars
        )
        quarterly_variance.append({
            'quarter': quarter,
            'variance': variance,
            'n_obs': len(quarter_data)
        })

    # Pin the schema: DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(quarterly_variance, columns=['quarter', 'variance', 'n_obs'])

def apply_rolling_average(df, window=3):
    """Return a copy of df with a trailing moving average of 'variance'.

    The result is stored in a new 'variance_smoothed' column.  The window
    covers the current row and the rows before it; the first rows are averaged
    over however many rows are available.  The input frame is not modified.
    """
    smoothed = (
        df['variance']
        .rolling(window=window, center=False, min_periods=1)
        .mean()
    )
    return df.assign(variance_smoothed=smoothed)

# Transcripts with a known quarter
transcripts_df = (
    df.loc[df['com_type'] == 'transcripts']
      .copy()
      .dropna(subset=['quarter'])
)

# Quarterly variance for transcripts, smoothed over 3 rows: labor, then inflation
labor_var_transcripts = apply_rolling_average(
    compute_quarterly_variance(transcripts_df, LABOR_EMPHASIS_VARS, 'labor'),
    window=3
)
inflation_var_transcripts = apply_rolling_average(
    compute_quarterly_variance(transcripts_df, INFLATION_EMPHASIS_VARS, 'inflation'),
    window=3
)

# Speeches with a known quarter
speeches_df = (
    df.loc[df['com_type'] == 'speeches']
      .copy()
      .dropna(subset=['quarter'])
)

# Same two series for speeches
labor_var_speeches = apply_rolling_average(
    compute_quarterly_variance(speeches_df, LABOR_EMPHASIS_VARS, 'labor'),
    window=3
)
inflation_var_speeches = apply_rolling_average(
    compute_quarterly_variance(speeches_df, INFLATION_EMPHASIS_VARS, 'inflation'),
    window=3
)

# Plots 8a (labor) and 8b (inflation): 3-quarter rolling average of the
# variance, transcripts against speeches.  Both panels share one layout.
figure_8_panels = (
    ('Labor', labor_var_transcripts, labor_var_speeches,
     'figure8a_labor_variance.png', "✓ Figure 8a: Labor emphasis variance saved"),
    ('Inflation', inflation_var_transcripts, inflation_var_speeches,
     'figure8b_inflation_variance.png', "✓ Figure 8b: Inflation emphasis variance saved"),
)

for topic, transcripts_var, speeches_var, png_name, done_message in figure_8_panels:
    fig, ax = plt.subplots(figsize=(12, 6))

    # A series is drawn only when it has at least one quarter
    if len(transcripts_var) > 0:
        quarters_t = pd.PeriodIndex(transcripts_var['quarter']).to_timestamp()
        ax.plot(quarters_t, transcripts_var['variance_smoothed'],
                marker='o', linewidth=2, label='Transcripts', alpha=0.8)

    if len(speeches_var) > 0:
        quarters_s = pd.PeriodIndex(speeches_var['quarter']).to_timestamp()
        ax.plot(quarters_s, speeches_var['variance_smoothed'],
                marker='s', linewidth=2, label='Speeches', alpha=0.8)

    ax.set_xlabel('Quarter', fontsize=12)
    ax.set_ylabel('Variance of Euclidean Distances (3-Quarter MA)', fontsize=12)
    ax.set_title(f'Variance in {topic} Emphasis Vectors (3-Quarter Rolling Average)',
                 fontsize=14, fontweight='bold')
    ax.legend(loc='best', frameon=True)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{OUTPUT_DIR}/{png_name}', dpi=300, bbox_inches='tight')
    plt.close()

    print(done_message)

# ============================================================================
# SUMMARY
# ============================================================================

# Closing report: output directory plus the name of every table and figure.
# Means and variance tables share file-name stems and differ only in suffix.
summary_rule = "=" * 70
summary_table_stems = (
    'table1_comm_types',
    'table1a_comm_types_2006_2019',
    'table2_public_private',
    'table3_chair_public_private',
    'table4_regional_governors',
    'table5_nominating_president',
    'table6_regional_banks',
)
summary_figures = (
    'figure7a_total_sentences.png',
    'figure7b_labor_share.png',
    'figure7c_inflation_share.png',
    'figure8a_labor_variance.png (3-quarter rolling average)',
    'figure8b_inflation_variance.png (3-quarter rolling average)',
)

print("\n" + summary_rule)
print("ANALYSIS COMPLETE!")
print(summary_rule)
print(f"\nAll outputs saved to: {OUTPUT_DIR}")
print("\nFiles created:")
print("  Tables (LaTeX) - Means:")
for stem in summary_table_stems:
    print(f"    - {stem}_means.tex")
# Variance tables are listed only when the run was configured to include them
if INCLUDE_VARIANCE:
    print("\n  Tables (LaTeX) - Variances:")
    for stem in summary_table_stems:
        print(f"    - {stem}_var.tex")
print("\n  Figures (PNG):")
for figure_entry in summary_figures:
    print(f"    - {figure_entry}")
print(summary_rule)

Mounted at /content/drive

LOADING DATA
Loaded 4277 records

Date range: 2000-02-02 00:00:00 to 2025-09-23 00:00:00
Records by communication type:
com_type
transcripts    2589
speeches       1121
minutes         199
statements      198
presscon        170
Name: count, dtype: int64

FILTERING DATA: DROPPING ROWS WITH NO LABOR/INFLATION MENTIONS

Original rows: 4277
Rows dropped: 310
Remaining rows: 3967

Remaining records by communication type:
com_type
transcripts    2514
speeches        909
minutes         199
statements      175
presscon        170
Name: count, dtype: int64

TABLE 1: AVERAGES ACROSS COMMUNICATION TYPES (YEAR-WEIGHTED)
✓ Table 1 saved (year-weighted means and variances)

TABLE 1A: AVERAGES 2006-2019 (EXCLUDING PRESS CONFERENCES, YEAR-WEIGHTED)
✓ Table 1a saved (means and variances)

TABLE 2: PUBLIC STATEMENTS VS PRIVATE (YEAR-WEIGHTED)
✓ Table 2 saved (means and variances)

TABLE 3: CHAIR PUBLIC VS PRIVATE (YEAR-WEIGHTED)
✓ Table 3 saved (means and variances)

TABLE 4

In [18]:

# ============================================================================
# VIZ BONUS: PCA SCATTER BY OFFICIAL
# ============================================================================

print("\n" + "="*70)
print("VIZ BONUS: PCA SCATTER BY OFFICIAL (TRANSCRIPTS ONLY)")
print("="*70)


def _most_common_or_first(values):
    """Return the most frequent value of a Series, or its first element if mode() is empty."""
    modes = values.mode()
    return modes[0] if len(modes) > 0 else values.iloc[0]


def _official_point_style(role):
    """Return the scatter keyword arguments (marker, size, outline) for one official's role."""
    role_markers = {
        'Chair': '*',
        'Governor': 'o',
        'Regional President': 's',
        'Vice Chair': 'D'
    }
    is_chair = role == 'Chair'
    return {
        'marker': role_markers.get(role, 'o'),
        's': 400 if is_chair else 150 if role == 'Governor' else 120,
        'alpha': 0.7,
        'edgecolors': 'black',
        'linewidth': 2.0 if is_chair else 1.0,
        # Chairs are drawn on top of everyone else
        'zorder': 10 if is_chair else 5,
    }


def _annotate_chairs(ax, coords, roles, speakers):
    """Attach a highlighted name label to every point whose role is 'Chair'."""
    for i, (role, speaker) in enumerate(zip(roles, speakers)):
        if role == 'Chair':
            ax.annotate(
                speaker,
                (coords[i, 0], coords[i, 1]),
                xytext=(8, 8),
                textcoords='offset points',
                fontsize=11,
                fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.4', facecolor='yellow', alpha=0.8,
                         edgecolor='black', linewidth=1.5)
            )


def _label_pca_axes(ax, pca, title):
    """Set axis labels showing each component's explained variance, plus the title."""
    ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)',
                  fontsize=13, fontweight='bold')
    ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)',
                  fontsize=13, fontweight='bold')
    ax.set_title(title, fontsize=15, fontweight='bold', pad=20)


def _role_legend_handles():
    """Build the gray proxy handles used for the 'Role' legend."""
    from matplotlib.lines import Line2D
    return [
        Line2D([0], [0], marker='*', color='w', markerfacecolor='gray',
               markersize=18, label='Chair', markeredgecolor='black', markeredgewidth=2),
        Line2D([0], [0], marker='o', color='w', markerfacecolor='gray',
               markersize=11, label='Governor', markeredgecolor='black', markeredgewidth=1),
        Line2D([0], [0], marker='s', color='w', markerfacecolor='gray',
               markersize=10, label='Regional President', markeredgecolor='black', markeredgewidth=1)
    ]


def _add_info_box(ax, info_text):
    """Draw the summary text box in the lower-left corner of the axes."""
    ax.text(0.02, 0.02, info_text, transform=ax.transAxes, fontsize=10,
            verticalalignment='bottom',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.9,
                     edgecolor='black', linewidth=1))


def create_pca_by_official(df, emphasis_vars, labels, title_base, filename_base):
    """
    Create two PCA scatters with one dot per official:
    1. Colored by institution
    2. Colored by time period

    Each official's profile is the mean of `emphasis_vars` over their rows;
    officials with fewer than 3 rows are dropped.  The profiles are
    standardized and projected onto two principal components.  Both figures
    are written to VIZ_DIR as '<filename_base>_by_institution.png' and
    '<filename_base>_by_time.png'.

    Args:
        df: Communications data with 'speaker', 'institution', 'role', 'date',
            the emphasis columns and the matching '<topic>_emphasis_sum'
            column.  Nothing here filters on communication type, so the
            caller is expected to pass transcripts only.
        emphasis_vars: Emphasis column names; the first name decides whether
            the labor or the inflation sum column is used for filtering.
        labels: Accepted for call-site compatibility; not used here.
        title_base: Leading part of both figure titles.
        filename_base: Leading part of both output file names.

    Returns:
        None.  Returns early, after printing a warning, when fewer than 10
        rows or fewer than 10 officials remain.
    """

    # Determine which sum to use
    if 'labor' in emphasis_vars[0]:
        sum_col = 'labor_emphasis_sum'
    else:
        sum_col = 'inflation_emphasis_sum'

    # Keep rows with some emphasis on the topic and an identifiable individual
    # speaker ('fomc' and '.' are excluded along with NaN)
    df_filtered = df[
        (df[sum_col] > 0) &
        (df['speaker'] != 'fomc') &
        (df['speaker'] != '.') &
        (df['speaker'].notna())
    ].copy()

    print(f"  Using transcripts data: {len(df_filtered)} communications")
    if len(df_filtered) > 0:
        print(f"  Date range: {df_filtered['date'].min()} to {df_filtered['date'].max()}")

    # Consolidate speaker codes for same person
    SPEAKER_CONSOLIDATION = {
        'bbernanke': 'Bernanke',
        'bsbernanke': 'Bernanke',
        'jpowell': 'Powell',
        'jhpowell': 'Powell',
        'agreenspan': 'Greenspan',
        'jyellen': 'Yellen',
    }

    # Lookup is case-insensitive; unmapped speakers keep their original code
    df_filtered['speaker_consolidated'] = df_filtered['speaker'].apply(
        lambda x: SPEAKER_CONSOLIDATION.get(str(x).lower(), str(x))
    )

    print(f"  Original unique speakers: {df_filtered['speaker'].nunique()}")
    print(f"  Consolidated unique speakers: {df_filtered['speaker_consolidated'].nunique()}")

    if len(df_filtered) < 10:
        print(f"⚠ Not enough data for PCA by official ({len(df_filtered)} rows)")
        return

    # Aggregate by consolidated speaker
    by_official = df_filtered.groupby('speaker_consolidated')
    official_profiles = by_official[emphasis_vars].mean()

    # Also get institution, role, and average date
    official_info = by_official.agg({
        'institution': _most_common_or_first,
        'role': _most_common_or_first,
        'date': 'mean'
    })

    official_profiles = official_profiles.join(official_info)

    # Remove officials with too few observations
    official_counts = by_official.size()
    valid_officials = official_counts[official_counts >= 3].index
    official_profiles = official_profiles[official_profiles.index.isin(valid_officials)]

    if len(official_profiles) < 10:
        print(f"⚠ Not enough officials for PCA ({len(official_profiles)} officials)")
        return

    print(f"✓ Aggregated {len(official_profiles)} unique officials")

    # Run PCA on standardized profiles
    # (StandardScaler and PCA are defined elsewhere in the notebook)
    X = official_profiles[emphasis_vars].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    institutions = official_profiles['institution'].values
    roles = official_profiles['role'].values
    speakers = official_profiles.index.values
    dates = official_profiles['date'].values
    years = pd.to_datetime(dates).year

    total_var = pca.explained_variance_ratio_[:2].sum()
    info_text = f'Total Variance: {total_var*100:.1f}%\nN Officials: {len(official_profiles)}\nOne dot per official\nTranscripts only'

    # ========================================================================
    # PLOT 1: COLORED BY INSTITUTION
    # ========================================================================

    fig, ax = plt.subplots(figsize=(14, 10))

    institution_colors = {
        'Board': '#1f77b4',
        'Boston': '#ff7f0e',
        'New York': '#2ca02c',
        'Philadelphia': '#d62728',
        'Cleveland': '#9467bd',
        'Richmond': '#8c564b',
        'Atlanta': '#e377c2',
        'Chicago': '#7f7f7f',
        'St. Louis': '#bcbd22',
        'Minneapolis': '#17becf',
        'Kansas City': '#ff9896',
        'Dallas': '#c5b0d5',
        'San Francisco': '#c49c94'
    }

    institutions_plotted = set()

    for i, (institution, role) in enumerate(zip(institutions, roles)):
        # Only the first point of each institution carries a legend label
        label = institution if institution not in institutions_plotted else ""
        if label:
            institutions_plotted.add(institution)

        ax.scatter(
            X_pca[i, 0],
            X_pca[i, 1],
            color=institution_colors.get(institution, 'gray'),
            label=label,
            **_official_point_style(role)
        )

    # Add labels for Chairs
    _annotate_chairs(ax, X_pca, roles, speakers)

    _label_pca_axes(ax, pca, f'{title_base} - Colored by Institution\n(Transcripts Only)')

    # Institution legend: Board first, then alphabetical, unknown names last
    handles, legend_labels = ax.get_legend_handles_labels()
    institution_order = ['Board'] + sorted(inst for inst in institution_colors if inst != 'Board')
    institution_rank = {name: position for position, name in enumerate(institution_order)}
    sorted_items = sorted(zip(legend_labels, handles),
                          key=lambda pair: institution_rank.get(pair[0], 999))

    if sorted_items:
        sorted_labels, sorted_handles = zip(*sorted_items)
        legend1 = ax.legend(sorted_handles, sorted_labels,
                           title='Institution', loc='upper left',
                           fontsize=9, title_fontsize=10,
                           frameon=True, framealpha=0.95, edgecolor='black')
        # Keep this legend alive when the role legend below replaces it
        ax.add_artist(legend1)

    # Role legend
    ax.legend(handles=_role_legend_handles(), title='Role',
              loc='upper right', fontsize=9, title_fontsize=10,
              frameon=True, framealpha=0.95, edgecolor='black')

    ax.grid(True, alpha=0.3, linestyle='--')

    _add_info_box(ax, info_text)

    plt.tight_layout()
    filename_inst = f"{filename_base}_by_institution.png"
    plt.savefig(f'{VIZ_DIR}/{filename_inst}', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"✓ Saved {filename_inst}")

    # ========================================================================
    # PLOT 2: COLORED BY TIME PERIOD
    # ========================================================================

    fig, ax = plt.subplots(figsize=(14, 10))

    from matplotlib import cm
    norm = plt.Normalize(vmin=years.min(), vmax=years.max())
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in
    # Matplotlib 3.9
    cmap = plt.get_cmap('viridis')

    for i, (year, role) in enumerate(zip(years, roles)):
        ax.scatter(
            X_pca[i, 0],
            X_pca[i, 1],
            color=cmap(norm(year)),
            **_official_point_style(role)
        )

    # Add labels for Chairs
    _annotate_chairs(ax, X_pca, roles, speakers)

    _label_pca_axes(ax, pca, f'{title_base} - Colored by Time Period\n(Transcripts Only)')

    # Colorbar
    sm = cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax, label='Year', pad=0.02)
    cbar.set_label('Year', fontsize=12, fontweight='bold')

    # Role legend
    ax.legend(handles=_role_legend_handles(), title='Role',
              loc='upper left', fontsize=9, title_fontsize=10,
              frameon=True, framealpha=0.95, edgecolor='black')

    ax.grid(True, alpha=0.3, linestyle='--')

    _add_info_box(ax, info_text)

    plt.tight_layout()
    filename_time = f"{filename_base}_by_time.png"
    plt.savefig(f'{VIZ_DIR}/{filename_time}', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"✓ Saved {filename_time}")
    print(f"  Explained variance: PC1={pca.explained_variance_ratio_[0]*100:.1f}%, PC2={pca.explained_variance_ratio_[1]*100:.1f}%")

# LABOR PCA BY OFFICIAL
create_pca_by_official(
    df=df,
    emphasis_vars=LABOR_EMPHASIS_VARS,
    labels=LABOR_LABELS,
    title_base='PCA of Labor Emphasis by Official',
    filename_base='viz_bonus_labor'
)

# INFLATION PCA BY OFFICIAL
create_pca_by_official(
    df=df,
    emphasis_vars=INFLATION_EMPHASIS_VARS,
    labels=INFLATION_LABELS,
    title_base='PCA of Inflation Emphasis by Official',
    filename_base='viz_bonus_inflation'
)

# ============================================================================
# SUMMARY
# ============================================================================

# Closing report: output directory plus every visualization file, grouped by
# section.  Only the first section heading is printed without a blank line
# before it.
viz_rule = "=" * 70
viz_sections = (
    ('VIZ 1 - Radar Charts',
     ('viz1a_radar_labor_by_chair.png', 'viz1b_radar_inflation_by_chair.png')),
    ('VIZ 2 - Stacked Area Charts',
     ('viz2a_stacked_area_labor.png', 'viz2b_stacked_area_inflation.png')),
    ('VIZ 3 - Board vs Regional Comparison',
     ('viz3a_institution_labor.png', 'viz3b_institution_inflation.png')),
    ('VIZ 4 - PCA Scatter by Chair',
     ('viz4a_pca_labor_by_chair.png', 'viz4b_pca_inflation_by_chair.png')),
    ('VIZ 5 - Event Studies',
     ('viz5a_event_study_labor.png', 'viz5b_event_study_inflation.png')),
    ('VIZ BONUS - PCA by Official',
     ('viz_bonus_labor_by_institution.png', 'viz_bonus_labor_by_time.png',
      'viz_bonus_inflation_by_institution.png', 'viz_bonus_inflation_by_time.png')),
)

print("\n" + viz_rule)
print("VISUALIZATION COMPLETE - TRANSCRIPTS ONLY VERSION!")
print(viz_rule)
print(f"\nAll visualizations saved to: {VIZ_DIR}")
print("\nFiles created:")
for section_index, (heading, file_names) in enumerate(viz_sections):
    blank_line = "" if section_index == 0 else "\n"
    print(f"{blank_line}  {heading}:")
    for file_name in file_names:
        print(f"    - {file_name}")
print("\n" + viz_rule)
# NOTE(review): the recorded run of this cell reports a transcripts date range
# of 2000-02-02 to 2019-12-11, so "~1993-2019" may be out of date -- confirm.
print("NOTE: All visualizations use ONLY transcripts data (~1993-2019)")
print(viz_rule)


VIZ BONUS: PCA SCATTER BY OFFICIAL (TRANSCRIPTS ONLY)
  Using transcripts data: 2414 communications
  Date range: 2000-02-02 00:00:00 to 2019-12-11 00:00:00
  Original unique speakers: 53
  Consolidated unique speakers: 53
✓ Aggregated 52 unique officials
✓ Saved viz_bonus_labor_by_institution.png
✓ Saved viz_bonus_labor_by_time.png
  Explained variance: PC1=30.1%, PC2=21.0%
  Using transcripts data: 2217 communications
  Date range: 2000-02-02 00:00:00 to 2019-12-11 00:00:00
  Original unique speakers: 53
  Consolidated unique speakers: 53
✓ Aggregated 51 unique officials
✓ Saved viz_bonus_inflation_by_institution.png
✓ Saved viz_bonus_inflation_by_time.png
  Explained variance: PC1=21.5%, PC2=13.5%

VISUALIZATION COMPLETE - TRANSCRIPTS ONLY VERSION!

All visualizations saved to: /content/drive/MyDrive/FedComs/SummaryStats/Visualizations_Transcripts_Only

Files created:
  VIZ 1 - Radar Charts:
    - viz1a_radar_labor_by_chair.png
    - viz1b_radar_inflation_by_chair.png

  VIZ 2 - St