# Clean Residual Gain Validation - WITHOUT Final LayerNorm

## Critical Discovery

Previous experiments showed **BOTH** Pythia and GPT-J dampening:
- Pythia-6.9B: G = 0.320
- GPT-J-6B: G = 0.373

**But this was WRONG!** We were measuring:
```
G = ||hidden_states[-1]|| / ||hidden_states[-2]||
```

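In HuggingFace (GPT-NeoX / GPT-J), `output_hidden_states=True` returns `n_layers + 1` tensors (embeddings + one per block):
- `hidden_states[-1]` = output of the last transformer block AFTER the final LayerNorm (`gpt_neox.final_layer_norm` / `transformer.ln_f`)
- `hidden_states[-2]` = input to the last transformer block, i.e. the output of the second-to-last block

The final LayerNorm **normalizes** and **shrinks** the output, so the ratio above mixes the last block's gain with the LayerNorm rescaling!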

## Correct Methodology

We need to measure the TRUE residual stream gain:
```
G_true = ||layer_L_output|| / ||layer_{L-1}_output||
       = ||hidden_states[-2]|| / ||hidden_states[-3]||
```

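This excludes the final LayerNorm artifact.

**Caveat:** HuggingFace returns one hidden state per block plus the embeddings, and for GPT-NeoX / GPT-J the last of them is already post-LayerNorm. The ratio `hidden_states[-2] / hidden_states[-3]` is therefore the gain of the *second-to-last* block. Measuring the last block itself requires capturing its pre-LayerNorm output, e.g. with a forward hook on the final norm module.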

## H25 Hypothesis

$$\rho = \frac{n_{heads}}{d_{head}}$$

- Pythia: $\rho = 32/128 = 0.25$ (HIGH) → Should DAMPEN
- GPT-J: $\rho = 16/256 = 0.0625$ (LOW) → Should EXPAND

In [None]:
!pip install transformers torch numpy matplotlib --quiet

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import json
from datetime import datetime
import gc
import warnings
# Keep the cell output readable: suppress all library warnings for this session.
warnings.filterwarnings('ignore')

# Fixed seed (NumPy and PyTorch share the same value)
np.random.seed(42)
torch.manual_seed(42)

# Report the runtime environment so saved outputs can be traced back to it.
print(f"PyTorch: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA: {cuda_ready}")
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {device_props.total_memory / 1e9:.1f} GB")

In [None]:
# Prompt set shared by every experiment so that numbers stay comparable.
CANONICAL_PROMPTS = list((
    "The capital of France is",
    "Water freezes at",
    "The quick brown fox",
))

# Primary comparison pair, stored as (short name, HuggingFace hub path).
MODELS = list({
    'pythia-6.9b': 'EleutherAI/pythia-6.9b',
    'gpt-j-6b': 'EleutherAI/gpt-j-6B',
}.items())

# Smaller stand-ins used when the GPU cannot hold the 6B models.
FALLBACK_MODELS = list({
    'pythia-1.4b': 'EleutherAI/pythia-1.4b',
    'pythia-410m': 'EleutherAI/pythia-410m',
}.items())

print(f"Prompts: {CANONICAL_PROMPTS}")

In [None]:
def compute_residual_gains_detailed(model, tokenizer, prompts, verbose=True):
    """
    Measure last-token norm ratios between consecutive hidden states.

    Each prompt is run through the model once with ``output_hidden_states=True``.
    For every returned hidden state the L2 norm of the final token's vector is
    taken, and ``norm[i] / norm[i-1]`` is recorded as the "gain" of step ``i``.

    Two of those ratios are reported separately:
    1. gain_with_ln: ||hidden_states[-1]|| / ||hidden_states[-2]||
    2. gain_no_ln:   ||hidden_states[-2]|| / ||hidden_states[-3]||
       (falls back to the last ratio when only two hidden states exist)

    NOTE(review): the HuggingFace model-output docs describe ``hidden_states``
    as the embedding output plus one tensor per layer (n_layers + 1 entries).
    In the GPT-NeoX and GPT-J implementations the final LayerNorm appears to be
    applied to the last block's output *before* it is stored as the last entry,
    not appended as an extra entry. If so, ``hidden_states[-2]`` is the input
    to the last block and ``gain_no_ln`` is the gain of the second-to-last
    block; the last block's pre-LayerNorm output would need a forward hook on
    the final norm module. Confirm against the installed transformers version.

    Args:
        model: callable invoked as ``model(**inputs, output_hidden_states=True)``
            whose result exposes ``.hidden_states``. If it has a ``device``
            attribute, inputs are moved there first.
        tokenizer: callable invoked as ``tokenizer(prompt, return_tensors="pt")``
            returning a mapping of tensors.
        prompts: non-empty iterable of prompt strings (one forward pass each).
        verbose: when True, print per-prompt norms and gains.

    Returns:
        dict with mean/std of both gains across prompts, the per-step gains
        averaged across prompts (``all_layer_gains``), the two ``< 1.0``
        dampening flags, and ``n_hidden_states`` from the last prompt.

    Raises:
        ValueError: if ``prompts`` is empty or the model returns fewer than
            two hidden states (no ratio can be formed).
    """
    prompts = list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one prompt")

    gains_with_ln = []
    gains_no_ln = []
    all_layer_gains = []
    n_states = 0

    # Follow the model's own device instead of keying on CUDA availability, so
    # CPU-resident models on CUDA hosts and non-CUDA accelerators both work.
    device = getattr(model, 'device', None)

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        if device is not None:
            inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        hidden_states = outputs.hidden_states
        n_states = len(hidden_states)
        if n_states < 2:
            raise ValueError(
                f"need at least 2 hidden states to form a gain, got {n_states}"
            )

        if verbose:
            print(f"\n  Prompt: '{prompt}'")
            print(f"  Hidden states: {n_states}")

        # Norm of the last token's vector in every hidden state. `.item()`
        # relies on a batch of one, which holds for a single un-batched prompt.
        norms = [
            torch.norm(h[:, -1, :].float(), dim=-1).item()
            for h in hidden_states
        ]

        # Ratio of each state to its predecessor; the epsilon guards a zero norm.
        layer_gains = [cur / (prev + 1e-10) for prev, cur in zip(norms, norms[1:])]
        all_layer_gains.append(layer_gains)

        # Ratio that involves the final (post-LayerNorm) hidden state.
        gain_with_ln = layer_gains[-1]
        gains_with_ln.append(gain_with_ln)

        # Ratio one step earlier; with only one ratio available reuse it.
        gain_no_ln = layer_gains[-2] if len(layer_gains) >= 2 else layer_gains[-1]
        gains_no_ln.append(gain_no_ln)

        if verbose:
            if len(norms) >= 3:
                print(f"    Norms: embed={norms[0]:.1f}, ..., L-2={norms[-3]:.1f}, L-1={norms[-2]:.1f}, final={norms[-1]:.1f}")
            else:
                # Only two states: there is no third-from-last norm to show.
                print(f"    Norms: embed={norms[0]:.1f}, final={norms[-1]:.1f}")
            print(f"    Gain (with final LN):    {gain_with_ln:.4f}")
            print(f"    Gain (WITHOUT final LN): {gain_no_ln:.4f}")

    # Aggregate across prompts (per-step mean keeps the layer axis intact).
    avg_layer_gains = np.mean(all_layer_gains, axis=0)
    mean_with_ln = float(np.mean(gains_with_ln))
    mean_no_ln = float(np.mean(gains_no_ln))

    return {
        'gain_with_ln_mean': mean_with_ln,
        'gain_with_ln_std': float(np.std(gains_with_ln)),
        'gain_no_ln_mean': mean_no_ln,
        'gain_no_ln_std': float(np.std(gains_no_ln)),
        'all_layer_gains': avg_layer_gains.tolist(),
        'is_dampening_with_ln': bool(mean_with_ln < 1.0),
        'is_dampening_no_ln': bool(mean_no_ln < 1.0),
        'n_hidden_states': n_states
    }

In [None]:
def compute_rho(config):
    """
    Derive the head-density ratio rho = n_heads / d_head from a model config.

    Each quantity is read from the config under its primary attribute name,
    falling back to the alternative spelling (``n_head``, ``n_embd``,
    ``n_layer``) when the primary one is missing or falsy. ``d_head`` is the
    integer quotient ``d_model // n_heads``.

    Returns a dict with ``n_heads``, ``d_model``, ``d_head``, ``n_layers``
    (None when unavailable) and ``rho``, or None when the head count or model
    width cannot be determined.
    """
    def _lookup(primary, alternative):
        # Same semantics as `getattr(primary) or getattr(alternative)`.
        value = getattr(config, primary, None)
        if not value:
            value = getattr(config, alternative, None)
        return value

    heads = _lookup('num_attention_heads', 'n_head')
    width = _lookup('hidden_size', 'n_embd')
    depth = _lookup('num_hidden_layers', 'n_layer')

    # Without both the head count and the model width rho is undefined.
    if not (heads and width):
        return None

    per_head = width // heads
    summary = {
        'n_heads': int(heads),
        'd_model': int(width),
        'd_head': int(per_head),
        'n_layers': None,
        'rho': float(heads / per_head),
    }
    if depth:
        summary['n_layers'] = int(depth)
    return summary

In [None]:
# Choose the model list from the available hardware: the full 6B pair only on
# GPUs reporting at least 20 GB, the smaller fallbacks otherwise.
# NOTE(review): the models are later loaded in float32; the weights of a ~6-7B
# parameter model alone may exceed 20 GB - confirm this threshold is intended.
if not torch.cuda.is_available():
    models_to_test = FALLBACK_MODELS
    print("No GPU - using smaller models")
else:
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    if gpu_mem < 20:
        models_to_test = FALLBACK_MODELS
        print(f"GPU has {gpu_mem:.1f}GB - using smaller models")
    else:
        models_to_test = MODELS
        print(f"GPU has {gpu_mem:.1f}GB - testing full 6B models")

print(f"\nModels: {[short_name for short_name, _hub_path in models_to_test]}")

In [None]:
# Main experiment: for every selected model, read its config, load it in
# float32, measure the gains on the canonical prompts, print a summary and
# store the result dict under the model's short name.
results = {}

for name, path in models_to_test:
    print(f"\n{'='*70}")
    print(f"Model: {name}")
    print(f"{'='*70}")

    # Pre-bind so the `finally` cleanup is valid even when loading fails
    # before either object exists.
    model = None
    tokenizer = None

    try:
        # Load config
        config = AutoConfig.from_pretrained(path)
        rho_info = compute_rho(config)

        if rho_info:
            print(f"\n Architecture:")
            print(f"   n_heads = {rho_info['n_heads']}")
            print(f"   d_head = {rho_info['d_head']}")
            print(f"   n_layers = {rho_info['n_layers']}")
            print(f"   rho = {rho_info['rho']:.4f}")

        # Load model with float32 for precision
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            low_cpu_mem_usage=True
        )
        model.eval()

        print(f"\n Computing gains...")
        result = compute_residual_gains_detailed(model, tokenizer, CANONICAL_PROMPTS)

        # Add metadata
        result['model'] = name
        result['path'] = path
        if rho_info:
            result.update(rho_info)

        # Summary
        print(f"\n{'='*50}")
        print(f"RESULTS for {name}:")
        print(f"{'='*50}")
        print(f"\n  WITH final LayerNorm (WRONG):")
        print(f"    Gain = {result['gain_with_ln_mean']:.4f} +/- {result['gain_with_ln_std']:.4f}")
        status_with = "DAMPENING" if result['is_dampening_with_ln'] else "EXPANSION"
        print(f"    Status: {status_with}")

        print(f"\n  WITHOUT final LayerNorm (CORRECT):")
        print(f"    Gain = {result['gain_no_ln_mean']:.4f} +/- {result['gain_no_ln_std']:.4f}")
        status_no = "DAMPENING" if result['is_dampening_no_ln'] else "EXPANSION"
        print(f"    Status: {status_no}")

        results[name] = result

    except Exception as e:
        # Top-level boundary: report the failure and continue with the next model.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup runs on success AND failure. Otherwise a model that loaded but
        # failed during measurement would stay referenced while the next
        # multi-GB model is being loaded.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n\nTested {len(results)} models")

In [None]:
# Side-by-side table of both gain metrics and their dampen/expand labels.
table_rule = "=" * 90
print("\n" + table_rule)
print("COMPARISON: With vs Without Final LayerNorm")
print(table_rule)

print(f"\n{'Model':<15} {'rho':>8} {'Gain+LN':>12} {'Status+LN':>12} {'Gain-LN':>12} {'Status-LN':>12}")
print("-" * 75)

# Maps a dampening flag to the short label shown in the table.
status_label = {True: "DAMPEN", False: "EXPAND"}

for name, r in results.items():
    row_cells = (
        f"{name:<15}",
        f"{r.get('rho', 0):>8.4f}",           # rho defaults to 0 when absent
        f"{r['gain_with_ln_mean']:>12.4f}",
        f"{status_label[bool(r['is_dampening_with_ln'])]:>12}",
        f"{r['gain_no_ln_mean']:>12.4f}",
        f"{status_label[bool(r['is_dampening_no_ln'])]:>12}",
    )
    print(" ".join(row_cells))

In [None]:
# H25 Validation with CORRECT metric
# Scores the rule "rho >= 0.2 predicts dampening (gain < 1)" against the
# measured no-final-LN gain of every model that has a known rho.
print("\n" + "="*70)
print("H25 VALIDATION (Using CORRECT metric - no final LN)")
print("="*70)

print("\nH25: rho >= 0.2 -> DAMPENING, rho < 0.2 -> EXPANSION")

correct = 0
total = 0  # counts only models that could actually be scored

for name, r in results.items():
    rho = r.get('rho')
    if rho is None:
        # No architecture info was recorded for this model; scoring it as
        # rho = 0 would silently count a meaningless "EXPAND" prediction.
        print(f"\n  {name}:")
        print("    rho unavailable -> skipped")
        continue

    total += 1
    gain = r['gain_no_ln_mean']  # CORRECT metric

    pred_dampen = rho >= 0.2
    actual_dampen = gain < 1.0

    is_correct = pred_dampen == actual_dampen
    if is_correct:
        correct += 1

    print(f"\n  {name}:")
    print(f"    rho = {rho:.4f}")
    print(f"    H25 predicts: {'DAMPEN' if pred_dampen else 'EXPAND'}")
    print(f"    Actual (no LN): G = {gain:.4f} -> {'DAMPEN' if actual_dampen else 'EXPAND'}")
    print(f"    {'CORRECT' if is_correct else 'WRONG'}")

accuracy = 100 * correct / total if total > 0 else 0
print(f"\n H25 Accuracy: {correct}/{total} = {accuracy:.0f}%")

if total == 0:
    # Nothing was scored (e.g. every model failed to load): that is absence of
    # evidence, not a falsification.
    verdict = "INCONCLUSIVE"
    print(f"\n H25 INCONCLUSIVE (no models evaluated)")
elif accuracy >= 100:
    verdict = "VALIDATED"
    print(f"\n H25 VALIDATED!")
elif accuracy >= 50:
    verdict = "PARTIALLY_VALIDATED"
    print(f"\n H25 PARTIALLY VALIDATED")
else:
    verdict = "FALSIFIED"
    print(f"\n H25 FALSIFIED")

In [None]:
# Visualization: three panels - metric comparison bars, rho vs gain scatter,
# and the per-step gain curve of every model.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Residual Gain: With vs Without Final LayerNorm', fontsize=14, fontweight='bold')

# Panel 1: Bar comparison
ax = axes[0]
names = list(results.keys())
x = np.arange(len(names))
width = 0.35

gains_with = [r['gain_with_ln_mean'] for r in results.values()]
gains_no = [r['gain_no_ln_mean'] for r in results.values()]

bars1 = ax.bar(x - width/2, gains_with, width, label='With Final LN (WRONG)', color='red', alpha=0.7)
bars2 = ax.bar(x + width/2, gains_no, width, label='Without Final LN (CORRECT)', color='blue', alpha=0.7)

ax.axhline(y=1.0, color='black', linestyle='--', alpha=0.5, label='G=1.0')
ax.set_xticks(x)
ax.set_xticklabels(names)
ax.set_ylabel('Last Layer Gain')
ax.set_title('Comparison: Metric Matters!')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Panel 2: rho vs Gain (correct metric)
ax = axes[1]
rhos = [r.get('rho', 0) for r in results.values()]
colors = ['blue' if g < 1.0 else 'red' for g in gains_no]

ax.scatter(rhos, gains_no, c=colors, s=200, edgecolors='black', linewidth=2, zorder=5)
for i, name in enumerate(names):
    ax.annotate(name, (rhos[i], gains_no[i]), xytext=(5, 5), textcoords='offset points', fontsize=10)

ax.axhline(y=1.0, color='black', linestyle='--', alpha=0.5, label='G=1.0 (Bentov)')
ax.axvline(x=0.2, color='purple', linestyle=':', alpha=0.5, label='rho=0.2')
ax.set_xlabel('rho = n_heads / d_head')
ax.set_ylabel('Last Layer Gain (no final LN)')
ax.set_title('H25: rho vs Gain (CORRECT)')
ax.legend()
ax.grid(True, alpha=0.3)

# Panel 3: Layer-by-layer dynamics
ax = axes[2]
for name, r in results.items():
    gains = r['all_layer_gains']
    layers = list(range(len(gains)))
    color = 'blue' if r['is_dampening_no_ln'] else 'red'
    ax.plot(layers, gains, '-o', markersize=2, label=f"{name} (rho={r.get('rho', 0):.2f})", color=color)

ax.axhline(y=1.0, color='black', linestyle='--', alpha=0.5)

# Mark the final LN artifact. Each model's last gain sits at index
# len(all_layer_gains) - 1, and the compared models need not have the same
# depth, so shade every distinct final index instead of only the first
# model's. Iterating the set also avoids indexing into an empty `results`.
final_gain_indices = sorted({len(r['all_layer_gains']) - 1 for r in results.values()})
for band_no, final_idx in enumerate(final_gain_indices):
    ax.axvspan(final_idx - 0.5, final_idx + 0.5,
               alpha=0.2, color='red',
               # Only the first band gets a legend entry.
               label='Final LN (artifact)' if band_no == 0 else '_nolegend_')

ax.set_xlabel('Layer Index')
ax.set_ylabel('Gain')
ax.set_title('Layer-by-Layer (red zone = final LN)')
ax.legend(fontsize=8)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('clean_residual_NO_FINAL_LN.png', dpi=150, bbox_inches='tight')
plt.show()
print("\nSaved: clean_residual_NO_FINAL_LN.png")

In [None]:
# Persist the run: everything goes into one timestamped JSON file.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Human-readable description of the two ratios stored per model.
metric_notes = dict(
    wrong_metric='hidden_states[-1] / hidden_states[-2] (includes final LN)',
    correct_metric='hidden_states[-2] / hidden_states[-3] (true last layer gain)',
)

output = dict(
    experiment='Clean Residual Gain - NO FINAL LAYERNORM',
    date=datetime.now().isoformat(),
    prompts=CANONICAL_PROMPTS,
    precision='float32',
    methodology=metric_notes,
    h25_verdict=verdict,
    h25_accuracy=accuracy,
    results=results,
)

filename = 'clean_residual_NO_FINAL_LN_' + timestamp + '.json'
with open(filename, 'w') as fh:
    # default=str stringifies any value json cannot encode natively.
    json.dump(output, fh, indent=2, default=str)

print(f"Saved: {filename}")

In [None]:
# Offer the JSON and the figure as browser downloads when running in Colab;
# outside Colab the import fails and a short notice is printed instead.
try:
    from google.colab import files
    for artifact in (filename, 'clean_residual_NO_FINAL_LN.png'):
        files.download(artifact)
except ImportError:
    print("Not in Colab")

In [None]:
# Closing recap: headline finding, per-model numbers, and the H25 verdict.
summary_rule = "=" * 70
print("\n" + summary_rule)
print("FINAL SUMMARY")
print(summary_rule)

for headline in (
    "\n CRITICAL FINDING:",
    "   Previous measurements included FINAL LAYERNORM artifact!",
    "   This caused BOTH models to appear as 'dampening'.",
):
    print(headline)

print("\n CORRECTED RESULTS (without final LN):")
for name, r in results.items():
    if r['is_dampening_no_ln']:
        status = "DAMPEN"
    else:
        status = "EXPAND"

    print(f"\n   {name}:")
    print(f"     rho = {r.get('rho', 0):.4f}")
    print(f"     WRONG (with LN):    G = {r['gain_with_ln_mean']:.4f}")
    print(f"     CORRECT (no LN):    G = {r['gain_no_ln_mean']:.4f} -> {status}")

print(f"\n H25 VERDICT: {verdict}")
print(f"   Accuracy: {accuracy:.0f}%")

# Only a perfect score prints the confirmation lines.
if accuracy == 100:
    for confirmation in (
        "\n   High rho -> DAMPENING",
        "   Low rho -> EXPANSION",
        "   The 'Dimensional Crowding Hypothesis' is CONFIRMED!",
    ):
        print(confirmation)