# SAE Steering Experiment Analysis

Analyzing results from the Eiffel Tower Llama reproduction experiments.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style
# Global plotting defaults for the figures created in this notebook:
# a matplotlib style sheet plus seaborn's 'husl' colour palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release - confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## Load Experiment Results

In [None]:
def load_experiment(log_folder):
    """Load results from a sweep experiment.

    Args:
        log_folder: Path (``str`` or ``Path``) of the experiment's log folder.

    Returns:
        A ``pandas.DataFrame`` built from the parsed contents of
        ``<log_folder>/results.json``, or ``None`` when that file does not
        exist.
    """
    results_path = Path(log_folder) / 'results.json'
    # is_file() (rather than exists()) so that a directory that happens to be
    # named results.json is treated as "no results" instead of failing in open().
    if not results_path.is_file():
        return None
    # JSON is UTF-8; do not depend on the platform's default locale encoding,
    # which can fail or mis-decode non-ASCII text in the results.
    with open(results_path, encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))

# Find all experiment folders under the log directory.
log_dir = Path('../logs')
# Always define `experiments` so later cells can reference it even when no
# logs have been produced yet.
experiments = []
if log_dir.is_dir():
    # Keep directories only: stray files in the log directory are not experiments.
    experiments = sorted(p for p in log_dir.glob('*') if p.is_dir())
    print(f"Found {len(experiments)} experiment folders:")
    for exp in experiments:
        print(f"  - {exp.name}")
else:
    print(f"Log directory not found: {log_dir}")

## Analysis Functions

In [None]:
def compute_harmonic_mean(concept, instruction, fluency):
    """Compute harmonic mean of three LLM metrics.

    Args:
        concept: Concept-inclusion score.
        instruction: Instruction-following score.
        fluency: Fluency score.

    Returns:
        ``3 / (1/concept + 1/instruction + 1/fluency)`` when all three scores
        are strictly positive, otherwise ``0.0``. NaN inputs are not treated
        as zero and propagate to the result.
    """
    # Check each score individually instead of their product: a product test
    # wrongly zeroes small-but-positive scores and lets a pair of negative
    # scores through (their product is positive).
    if min(concept, instruction, fluency) <= 0:
        return 0.0
    return 3.0 / (1.0 / concept + 1.0 / instruction + 1.0 / fluency)

def _plot_mean_band(ax, x, mean, std, **plot_kwargs):
    """Draw a mean line with markers and a shaded mean +/- std band on ``ax``."""
    ax.plot(x, mean, 'o-', **plot_kwargs)
    ax.fill_between(x, mean - std, mean + std, alpha=0.2)


def plot_sweep_results(df, title="Steering Sweep Results"):
    """Plot metrics vs steering intensity.

    Builds a 2x3 grid: the top row shows the three LLM scores, the bottom row
    shows surprise (negated ``avg_log_prob``), 3-gram repetition (``rep3``)
    and the harmonic mean of the three LLM scores. Every panel shows the
    per-intensity mean with a +/- one standard deviation band.

    Args:
        df: DataFrame with columns ``steering_intensity``,
            ``llm_score_concept``, ``llm_score_instruction``,
            ``llm_score_fluency``, ``avg_log_prob`` and ``rep3``.
            It is not modified.
        title: Figure title.

    Returns:
        The matplotlib figure.
    """
    fig, axes = plt.subplots(2, 3, figsize=(14, 8))

    # Group by steering intensity and compute stats
    grouped = df.groupby('steering_intensity').agg({
        'llm_score_concept': ['mean', 'std'],
        'llm_score_instruction': ['mean', 'std'],
        'llm_score_fluency': ['mean', 'std'],
        'avg_log_prob': ['mean', 'std'],
        'rep3': ['mean', 'std']
    }).reset_index()

    x = grouped['steering_intensity']

    # LLM metrics
    for ax, metric, label in zip(
        axes[0],
        ['llm_score_concept', 'llm_score_instruction', 'llm_score_fluency'],
        ['Concept Inclusion', 'Instruction Following', 'Fluency']
    ):
        _plot_mean_band(ax, x, grouped[(metric, 'mean')],
                        grouped[(metric, 'std')], label=label)
        ax.set_xlabel('Steering Intensity (α)')
        ax.set_ylabel('Score (0-2)')
        ax.set_title(label)
        ax.set_ylim(-0.1, 2.1)

    # Auxiliary metrics
    ax = axes[1, 0]
    # Negate the mean log-probability so that higher values mean more "surprise";
    # the band is still one standard deviation either side of the plotted line.
    _plot_mean_band(ax, x, -grouped[('avg_log_prob', 'mean')],
                    grouped[('avg_log_prob', 'std')])
    ax.set_xlabel('Steering Intensity (α)')
    ax.set_ylabel('Surprise (neg log prob)')
    ax.set_title('Surprise in Reference Model')

    ax = axes[1, 1]
    _plot_mean_band(ax, x, grouped[('rep3', 'mean')], grouped[('rep3', 'std')])
    ax.set_xlabel('Steering Intensity (α)')
    ax.set_ylabel('Rep3 Fraction')
    ax.set_title('3-gram Repetition')

    # Harmonic mean
    ax = axes[1, 2]
    if 'llm_score_concept' in df.columns and df['llm_score_concept'].notna().any():
        # Keep the per-row harmonic mean in a local Series instead of writing a
        # new column into the caller's DataFrame.
        # NOTE(review): `or 0` maps None/0 to 0, but NaN is truthy, so NaN
        # scores yield a NaN harmonic mean for that row.
        harmonic = df.apply(
            lambda r: compute_harmonic_mean(
                r['llm_score_concept'] or 0,
                r['llm_score_instruction'] or 0,
                r['llm_score_fluency'] or 0
            ), axis=1)
        hm_grouped = (
            df[['steering_intensity']]
            .assign(harmonic_mean=harmonic)
            .groupby('steering_intensity')['harmonic_mean']
            .agg(['mean', 'std'])
            .reset_index()
        )
        _plot_mean_band(ax, hm_grouped['steering_intensity'],
                        hm_grouped['mean'], hm_grouped['std'])
    ax.set_xlabel('Steering Intensity (α)')
    ax.set_ylabel('Harmonic Mean')
    ax.set_title('Harmonic Mean of LLM Scores')

    # Act on this figure explicitly rather than on pyplot's "current" figure.
    fig.suptitle(title)
    fig.tight_layout()
    return fig

## Exp01: Baseline Sweep Analysis

In [None]:
# Load exp01 results (update path after running)
# df_exp01 = load_experiment('../logs/YYYYMMDD-HHMMSS_sweep_1D')
# if df_exp01 is not None:
#     plot_sweep_results(df_exp01, "Exp01: Baseline Steering Sweep")

## Exp02: Clamping Comparison

In [None]:
def compare_clamping(df_add, df_clamp, title='Exp02: Clamping vs Additive Steering'):
    """Compare additive vs clamping steering.

    Draws three side-by-side panels (concept inclusion, fluency, harmonic mean
    of the three LLM scores), each showing the per-intensity mean for both
    steering modes.

    Args:
        df_add: Results of the additive-steering run. Must contain
            ``steering_intensity``, ``llm_score_concept``,
            ``llm_score_instruction`` and ``llm_score_fluency``.
        df_clamp: Results of the clamping run, with the same columns.
        title: Figure title.

    Neither input frame is modified.

    Returns:
        The matplotlib figure.
    """
    fig, axes = plt.subplots(1, 3, figsize=(14, 4))

    for df, label, color in [(df_add, 'Additive', 'blue'), (df_clamp, 'Clamping', 'red')]:
        grouped = df.groupby('steering_intensity').agg({
            'llm_score_concept': 'mean',
            'llm_score_fluency': 'mean'
        }).reset_index()

        axes[0].plot(grouped['steering_intensity'], grouped['llm_score_concept'],
                     'o-', label=label, color=color)
        axes[1].plot(grouped['steering_intensity'], grouped['llm_score_fluency'],
                     'o-', label=label, color=color)

        # Harmonic mean: computed into a local Series so the caller's frame
        # does not gain an extra column.
        # NOTE(review): `or 0` maps None/0 to 0, but NaN is truthy, so NaN
        # scores yield a NaN harmonic mean for that row.
        hm = df.apply(lambda r: compute_harmonic_mean(
            r['llm_score_concept'] or 0,
            r['llm_score_instruction'] or 0,
            r['llm_score_fluency'] or 0
        ), axis=1)
        hm_grouped = (
            df[['steering_intensity']]
            .assign(hm=hm)
            .groupby('steering_intensity')['hm']
            .mean()
            .reset_index()
        )
        axes[2].plot(hm_grouped['steering_intensity'], hm_grouped['hm'],
                     'o-', label=label, color=color)

    axes[0].set_title('Concept Inclusion')
    axes[1].set_title('Fluency')
    axes[2].set_title('Harmonic Mean')

    for ax in axes:
        ax.set_xlabel('Steering Intensity (α)')
        ax.legend()

    # Act on this figure explicitly rather than on pyplot's "current" figure.
    fig.suptitle(title)
    fig.tight_layout()
    return fig

## Correlation Analysis

In [None]:
def plot_correlation_matrix(df):
    """Draw a heatmap of the pairwise correlations between metrics.

    Only the known metric columns that are present in ``df`` and hold at
    least one non-null value are included.

    Returns:
        The matplotlib figure, or ``None`` when no metric column qualifies.
    """
    candidates = ('llm_score_concept', 'llm_score_instruction',
                  'llm_score_fluency', 'avg_log_prob', 'rep3')

    # Keep the candidates that exist and are not entirely null.
    available = []
    for name in candidates:
        if name in df.columns and df[name].notna().any():
            available.append(name)

    if not available:
        return None

    correlations = df[available].corr()

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        correlations,
        annot=True,
        cmap='RdBu_r',
        center=0,
        vmin=-1,
        vmax=1,
        ax=ax,
    )
    ax.set_title('Metric Correlation Matrix')
    return fig

## Sample Outputs

In [None]:
def show_sample_outputs(df, steering_intensities=(0, 5, 8, 12)):
    """Show sample outputs at different steering intensities.

    For each requested intensity, prints the row of ``df`` whose
    ``steering_intensity`` is closest to it: the intensity, the first 100
    characters of the prompt, the first 500 characters of the answer and,
    when a concept score is present, the three LLM scores.

    Args:
        df: DataFrame with ``steering_intensity``, ``prompt`` and ``answer``
            columns; the ``llm_score_*`` columns are optional.
        steering_intensities: Iterable of target intensities to sample.
    """
    for intensity in steering_intensities:
        # Find closest intensity
        distance = (df['steering_intensity'] - intensity).abs()
        closest = df.iloc[distance.argsort()[:1]]
        if closest.empty:
            continue
        row = closest.iloc[0]
        print(f"\n{'='*60}")
        print(f"Steering Intensity: {row['steering_intensity']:.1f}")
        print(f"Prompt: {row['prompt'][:100]}...")
        print(f"\nAnswer: {row['answer'][:500]}...")
        # pd.notna covers both a missing column (None from .get) and a NaN
        # score, so rows without LLM scores do not print "C=nan, ...".
        if pd.notna(row.get('llm_score_concept')):
            print(f"\nScores: C={row['llm_score_concept']}, I={row['llm_score_instruction']}, F={row['llm_score_fluency']}")

## Key Findings Summary

After running experiments, document key findings here:

### 1. Optimal Steering Coefficient
- Paper claims α ≈ 8.5 for layer 15 feature #21576
- Our finding: TBD

### 2. Clamping vs Additive
- Paper claims clamping improves concept inclusion
- Our finding: TBD

### 3. Multi-Feature Steering
- Paper claims marginal improvement
- Our finding: TBD

### 4. Generation Parameters
- Paper claims temp=0.5 + rep_penalty=1.1 helps
- Our finding: TBD