In [42]:
"""
LLM Strategy Analysis: Robust Performance & Stability Metrics.

This analysis evaluates reasoning strategies using a multi-method approach 
to separate signal (performance) from noise (instability) and bias (task difficulty).

Methodology:
1.  **Performance (Bootstrap CIs):** 
    -   Uses Cluster Bootstrapping (resampling benchmarks) to generate 95% Confidence Intervals.
    -   Accounts for the uncertainty of specific task selection more robustly than simple means.
    
2.  **Stability (Z-Score Variance):** 
    -   Calculates the variance of standardized scores (Z-scores) per strategy.
    -   High Variance = "Maverick" (Unpredictable performance relative to task difficulty).
    -   Low Variance = "Conformist" (Predictable performance that tracks difficulty).
    
3.  **Strategy Clustering (Spearman Correlation):**
    -   Computes rank correlations to identify strategies that share similar success/failure patterns.
    
4.  **Significance Check (LMM):**
    -   Uses Linear Mixed-Effects Models as a parametric check for performance differences, 
        controlling for benchmark difficulty.
"""

'\nLLM Strategy Analysis: Robust Performance & Stability Metrics.\n\nThis analysis evaluates reasoning strategies using a multi-method approach \nto separate signal (performance) from noise (instability) and bias (task difficulty).\n\nMethodology:\n1.  **Performance (Bootstrap CIs):** \n    -   Uses Cluster Bootstrapping (resampling benchmarks) to generate 95% Confidence Intervals.\n    -   Accounts for the uncertainty of specific task selection more robustly than simple means.\n\n2.  **Stability (Z-Score Variance):** \n    -   Calculates the variance of standardized scores (Z-scores) per strategy.\n    -   High Variance = "Maverick" (Unpredictable performance relative to task difficulty).\n    -   Low Variance = "Conformist" (Predictable performance that tracks difficulty).\n\n3.  **Strategy Clustering (Spearman Correlation):**\n    -   Computes rank correlations to identify strategies that share similar success/failure patterns.\n\n4.  **Significance Check (LMM):**\n    -   Uses Line

In [43]:

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf


In [44]:

def load_data(path: str) -> pd.DataFrame:
    """Load the run table from a parquet file and normalise its columns.

    Steps:
      * If a ``scores`` column holds per-question lists/arrays, collapse each
        row to its mean and store it in ``Score``.
      * Rename ``Method`` to ``Strategy`` (no-op when ``Method`` is absent).
      * Cast ``Strategy`` and ``Benchmark`` to ``str``.

    Args:
        path: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A DataFrame with exactly the columns ``Strategy``, ``Benchmark`` and
        ``Score``.

    Raises:
        KeyError: If ``Strategy``/``Method``, ``Benchmark`` or a usable score
            column is missing from the file.
    """
    df = pd.read_parquet(path)

    # Aggregation: Mean score per run (if individual questions exist)
    if "scores" in df.columns:
        if df.empty:
            # A zero-row file has no first element to inspect; peeking at
            # .iloc[0] would raise IndexError. Provide an empty Score column
            # so the column selection below still succeeds.
            if "Score" not in df.columns:
                df["Score"] = pd.Series(dtype="float64")
        elif isinstance(df["scores"].iloc[0], (list, np.ndarray)):
            # Only the first row is inspected to decide whether the column is
            # list-like; the remaining rows are assumed to have the same layout.
            df["Score"] = df["scores"].apply(lambda x: np.mean(x))

    # Formatting
    df = df.rename(columns={"Method": "Strategy"})
    df["Strategy"] = df["Strategy"].astype("str")
    df["Benchmark"] = df["Benchmark"].astype("str")

    return df[["Strategy", "Benchmark", "Score"]]

# Load the run table once; `df` is the shared input of the analysis cells below.
df = load_data("gpt-4_1-nano.parquet")
print(f"Loaded {len(df)} runs across {df['Strategy'].nunique()} strategies.")


Loaded 479 runs across 8 strategies.


In [45]:

def fit_lmm_ranking(df: pd.DataFrame):
    """
    Metric 1: Ranking (The Signal).

    Fits a linear mixed-effects model on the RAW scores, with Strategy as a
    categorical fixed effect and a random intercept per Benchmark:

        Score ~ Strategy + (1|Benchmark)

    The model is estimated with REML and the fitted statsmodels result object
    is returned unchanged, so the caller can inspect coefficients and
    significance. This function does not compute marginal means itself.
    """
    print("Fitting LMM for Ranking (Raw Scores)...")

    # Grouping by Benchmark with an intercept-only random-effects formula lets
    # the per-benchmark intercept absorb task difficulty.
    random_effects = {"groups": "Benchmark", "re_formula": "1"}
    lmm = smf.mixedlm("Score ~ C(Strategy)", df, **random_effects)

    # Only the fit is produced here; no leaderboard numbers are derived from it
    # inside this function.
    fitted = lmm.fit(reml=True)
    return fitted

# 1. Ranking Analysis (LMM on Raw Scores)
# The fitted result is only stored here, not displayed.
# NOTE(review): no visible cell prints it -- e.g. `lmm_result.summary()` would
# show the coefficients; confirm whether a later cell does so.
lmm_result = fit_lmm_ranking(df)

Fitting LMM for Ranking (Raw Scores)...


In [46]:

def calculate_stability_metrics(df: pd.DataFrame, rel_tol: float = 1e-12):
    """
    Metric 2: Stability (The Noise).
    Calculate Variance of Z-Scores to measure inconsistency.

    Scores are standardised within each Benchmark, then the variance, mean and
    count of those z-scores are aggregated per Strategy.

    Args:
        df: Run table with ``Strategy``, ``Benchmark`` and ``Score`` columns.
            NOTE: a ``Z_Score`` column is added to (or overwritten on) this
            frame in place.
        rel_tol: A benchmark whose standard deviation is at most
            ``rel_tol * max(|Score|)`` is treated as invariant. An exact
            ``std == 0`` test is not enough: for constant values such as
            0.1, 0.1, 0.1 floating-point rounding in the mean can leave a
            std of ~1e-17, which would turn pure rounding noise into
            z-scores of order one.

    Returns:
        DataFrame indexed by Strategy with columns ``Stability_Noise``
        (variance of z-scores), ``Z_Bias`` (mean z-score) and ``count``
        (number of non-missing z-scores).
    """
    print("Calculating Z-Score Variance (Stability)...")

    # 1. Standardize (Z-Score) within each Benchmark
    # This removes "Difficulty" and "Scale" (Heteroscedasticity)
    def standardize(x):
        std = x.std()
        # Relative tolerance instead of `== 0` (see docstring). When std is
        # NaN (e.g. a benchmark with a single run) the comparison is False and
        # the division below yields NaN, which the aggregations skip.
        if std <= rel_tol * np.abs(x).max():
            return np.zeros_like(x, dtype=float)  # Invariant benchmark -> Neutral signal
        return (x - x.mean()) / std

    df["Z_Score"] = df.groupby("Benchmark")["Score"].transform(standardize)

    # 2. Calculate Variance of Z-scores per Strategy
    # Low Var: Strategy follows the crowd (Conformist)
    # High Var: Strategy is unpredictable relative to the field (Maverick)
    z_stats = df.groupby("Strategy")["Z_Score"].agg(["var", "mean", "count"])
    z_stats = z_stats.rename(columns={"var": "Stability_Noise", "mean": "Z_Bias"})

    return z_stats


# 2. Stability Analysis (Variance of Z-Scores)
# Side effect: the call also writes a "Z_Score" column onto `df` itself.
stability_df = calculate_stability_metrics(df)

Calculating Z-Score Variance (Stability)...


In [47]:

def bootstrap_confidence_intervals(df: pd.DataFrame, n_bootstrap: int = 1000, 
                                 confidence: float = 0.95, seed: int = 42) -> pd.DataFrame:
    """
    Metric 3: Confidence Intervals (Cluster Bootstrap).
    Resamples BENCHMARKS (not runs) to account for task sampling uncertainty.

    For every resample, whole benchmarks are drawn with replacement and each
    strategy's mean is the pooled mean over all of its runs in the drawn
    benchmarks (a benchmark drawn twice contributes its runs twice). The
    interval is the percentile interval of those resampled means.

    Args:
        df: Run table with ``Strategy``, ``Benchmark`` and ``Score`` columns.
        n_bootstrap: Number of bootstrap resamples.
        confidence: Coverage of the interval, e.g. 0.95 for a 95% CI.
        seed: Seed for ``np.random.default_rng``; fixes the resampling.

    Returns:
        DataFrame indexed by ``Strategy`` with columns ``CI_Lower``,
        ``CI_Upper`` and ``CI_Formatted``. Strategies that never obtain a
        valid mean in any resample are omitted. An empty frame with the same
        columns is returned when there is nothing to resample (no rows, or
        ``n_bootstrap <= 0``).
    """
    print(f"Bootstrapping Confidence Intervals (n={n_bootstrap})...")
    rng = np.random.default_rng(seed)
    strategies = sorted(df["Strategy"].unique())
    benchmarks = df["Benchmark"].unique()

    def _empty_result() -> pd.DataFrame:
        # Same shape as a populated result, but with zero rows.
        return pd.DataFrame(
            columns=["CI_Lower", "CI_Upper", "CI_Formatted"],
            index=pd.Index([], name="Strategy"),
        )

    if len(benchmarks) == 0 or len(strategies) == 0 or n_bootstrap <= 0:
        return _empty_result()

    # Pre-aggregate once: per (benchmark, strategy) sum of scores and number of
    # non-missing scores. A pooled mean over any multiset of benchmarks is then
    # sum-of-sums / sum-of-counts, so no DataFrame has to be rebuilt per draw.
    per_cell = df.groupby(["Benchmark", "Strategy"])["Score"].agg(["sum", "count"])

    def _as_matrix(column: str) -> np.ndarray:
        # Rows follow `benchmarks`, columns follow `strategies`; combinations
        # with no runs contribute zero to both the sum and the count.
        table = per_cell[column].unstack("Strategy")
        table = table.reindex(index=benchmarks, columns=strategies)
        return table.fillna(0.0).to_numpy(dtype=float)

    score_sums = _as_matrix("sum")
    run_counts = _as_matrix("count")
    benchmark_index = pd.Index(benchmarks)

    # One row per resample, one column per strategy; NaN marks "strategy had
    # no runs in this resample".
    boot_means = np.full((n_bootstrap, len(strategies)), np.nan)

    for draw in range(n_bootstrap):
        # Cluster Bootstrap: Resample benchmarks with replacement. The draw is
        # made on the benchmark labels so the random stream for a given seed
        # is unchanged.
        boot_benchmarks = rng.choice(benchmarks, size=len(benchmarks), replace=True)
        rows = benchmark_index.get_indexer(boot_benchmarks)

        totals = score_sums[rows].sum(axis=0)
        counts = run_counts[rows].sum(axis=0)
        # Leave NaN in place wherever a strategy has no runs in this draw.
        np.divide(totals, counts, out=boot_means[draw], where=counts > 0)

    # Compile CIs
    alpha = 1 - confidence
    results = []
    for column, s in enumerate(strategies):
        samples = boot_means[:, column]
        samples = samples[~np.isnan(samples)]

        if len(samples) > 0:
            lower = np.percentile(samples, 100 * alpha / 2)
            upper = np.percentile(samples, 100 * (1 - alpha / 2))
            results.append({
                "Strategy": s,
                "CI_Lower": lower,
                "CI_Upper": upper,
                "CI_Formatted": f"[{lower:.2f}, {upper:.2f}]"
            })

    if not results:
        # set_index("Strategy") would raise KeyError on a column-less frame.
        return _empty_result()

    return pd.DataFrame(results).set_index("Strategy")


# 3. Confidence Intervals (Bootstrap)
# Called with the function's defaults: 1000 resamples, 95% confidence, seed 42.
ci_df = bootstrap_confidence_intervals(df)

Bootstrapping Confidence Intervals (n=1000)...


In [48]:

def analyze_correlations(df: pd.DataFrame):
    """Strategy clustering: Spearman rank correlation between strategies.

    Builds a Benchmark x Strategy table of mean scores and returns the
    strategy-by-strategy Spearman correlation matrix of its columns.
    """
    # One row per benchmark, one column per strategy; repeated runs of the
    # same (benchmark, strategy) pair are averaged.
    mean_scores = df.pivot_table(
        index="Benchmark",
        columns="Strategy",
        values="Score",
        aggfunc="mean",
    )
    rank_corr = mean_scores.corr(method="spearman")
    return rank_corr


# 4. Compile Leaderboard
# The "Score (Avg)" column is the plain per-strategy mean of the raw scores.
raw_means = df.groupby("Strategy")["Score"].mean()

# Columns are aligned on the Strategy index shared by all three inputs.
leaderboard_columns = {
    "Score (Avg)": raw_means,
    "95% CI": ci_df["CI_Formatted"],
    "Noise (Z-Var)": stability_df["Stability_Noise"],
}
results = pd.DataFrame(leaderboard_columns).sort_values("Score (Avg)", ascending=False)

print("\n=== Analysis V2 Leaderboard ===")
print(results.round(4).to_string())

# Legend for the table above, printed one line at a time.
# NOTE(review): the legend calls the score "LMM-validated", but this cell only
# uses raw means -- confirm the LMM result is checked elsewhere.
interpretation_lines = (
    "\n[Interpretation]",
    "- Score (Avg):          Higher is better. (LMM-validated performance).",
    "- 95% CI:               Range of plausible scores (resampling benchmarks).",
    "- Noise (Z-Var):        Lower is better. Variance of standardized scores.",
    "  - Low (~0):           Conformist. Fails when others fail, succeeds when others succeed.",
    "  - High (>1):          Maverick. Unpredictable performance relative to task difficulty.",
)
for line in interpretation_lines:
    print(line)

# 5. Correlations
print("\n=== Strategy Correlations (Spearman) ===")
print(analyze_correlations(df).round(2))


=== Analysis V2 Leaderboard ===
          Score (Avg)        95% CI  Noise (Z-Var)
Strategy                                          
foa            0.5462  [0.38, 0.76]         0.3487
tot_bfs        0.5057  [0.34, 0.71]         0.3991
cot_sc         0.4242  [0.26, 0.57]         0.5883
got            0.4200  [0.25, 0.62]         0.3629
cot            0.3976  [0.24, 0.55]         0.5816
react          0.3912  [0.24, 0.55]         1.0503
io             0.3135  [0.16, 0.49]         0.3385
tot_dfs        0.1772  [0.02, 0.39]         1.2767

[Interpretation]
- Score (Avg):          Higher is better. (LMM-validated performance).
- 95% CI:               Range of plausible scores (resampling benchmarks).
- Noise (Z-Var):        Lower is better. Variance of standardized scores.
  - Low (~0):           Conformist. Fails when others fail, succeeds when others succeed.
  - High (>1):          Maverick. Unpredictable performance relative to task difficulty.

=== Strategy Correlations (Spearman) ==