# Clean Residual Gain Validation

**Problem:** Previous experiments showed INCONSISTENT results for GPT-J:
- Beautiful Ones: G = 1.058 (EXPANSION)
- High-ρ Hunt: G = 0.368 (DAMPENING)

**Goal:** Run a CLEAN, reproducible test with:
- Same prompts
- Same methodology
- Same models
- Clear documentation of what's measured

**Key Metric:**
$$G = \frac{||h_L||}{||h_{L-1}||} = \text{Last Layer Residual Gain}$$

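Where $h_L$ is the hidden state after the final transformer block and $h_{L-1}$ is the hidden state after the block before it, both taken at the last token position.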

In [None]:
!pip install transformers torch numpy --quiet

In [None]:
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import json
from datetime import datetime
import gc
import warnings
# Silence all warnings so library chatter does not clutter the cell output.
warnings.filterwarnings('ignore')

# Pin both RNG seeds (NumPy and PyTorch) so repeated runs start from the same state.
np.random.seed(42)
torch.manual_seed(42)

# Report the runtime environment; GPU details are only queried when CUDA is usable.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Single prompt set shared by every model so the per-model numbers are comparable.
CANONICAL_PROMPTS = [
    "The capital of France is",
    "Water freezes at",
    "The quick brown fox",
]

# Primary comparison targets, as (short label, Hugging Face model id) pairs.
MODELS = [
    ("pythia-6.9b", "EleutherAI/pythia-6.9b"),
    ("gpt-j-6b", "EleutherAI/gpt-j-6B"),
]

# Smaller stand-ins, in the same (label, model id) format, for limited hardware.
FALLBACK_MODELS = [
    ("pythia-1.4b", "EleutherAI/pythia-1.4b"),
    ("pythia-410m", "EleutherAI/pythia-410m"),
]

print(f"Prompts: {CANONICAL_PROMPTS}")

In [None]:
def compute_residual_gain_clean(model, tokenizer, prompts, verbose=True):
    """
    Measure per-layer residual-norm gains at the last token of each prompt.

    Each prompt is run through the model once with
    ``output_hidden_states=True``. For every consecutive pair of entries in
    ``outputs.hidden_states`` the gain is ``||h_i|| / (||h_{i-1}|| + 1e-10)``,
    where the norm is the L2 norm of the last-token vector cast to float32.

    NOTE(review): for some Hugging Face architectures the final entry of
    ``hidden_states`` may already include the model's final LayerNorm; if so,
    the "last layer" gain mixes the last block with that normalization.
    Confirm for the models under test before interpreting G.

    Args:
        model: Callable invoked as ``model(**inputs, output_hidden_states=True)``
            that returns an object exposing ``hidden_states``. ``model.device``
            is read only when CUDA is available.
        tokenizer: Callable invoked as ``tokenizer(prompt, return_tensors="pt")``
            that returns a mapping of tensors.
        prompts: Non-empty iterable of prompt strings. Prompts are processed one
            at a time (the code assumes a batch of one so ``.item()`` succeeds).
        verbose: When True, print per-prompt diagnostics.

    Returns:
        dict with keys:
            - 'last_layer_gain_mean': mean last-transition gain over prompts
            - 'last_layer_gain_std': population std of the same values
            - 'all_layer_gains': per-transition gains averaged over prompts
            - 'per_prompt_gains': prompt -> last-transition gain
            - 'is_dampening': plain ``bool``, True when the mean gain is < 1.0
            - 'n_layers': number of hidden states minus one (last prompt seen)

    Raises:
        ValueError: If ``prompts`` is empty or the model returns fewer than two
            hidden states (no transition to measure).
    """
    prompts = list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one prompt")

    all_layer_gains = []
    last_layer_gains = []
    per_prompt = {}
    n_layers = 0

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        hidden_states = outputs.hidden_states
        n_layers = len(hidden_states) - 1  # Exclude embedding layer
        if n_layers < 1:
            raise ValueError(
                "model returned fewer than two hidden states; no layer gain to compute"
            )

        if verbose:
            print(f"\n  Prompt: '{prompt[:30]}...'")
            print(f"  Hidden states: {len(hidden_states)} (1 embed + {n_layers} layers)")

        # One norm per hidden state (last token only); each value then serves as
        # the numerator of one transition and the denominator of the next.
        norms = [
            torch.norm(h[:, -1, :].float(), dim=-1).item()
            for h in hidden_states
        ]
        # The epsilon guards against division by a zero-norm vector.
        layer_gains = [
            curr / (prev + 1e-10)
            for prev, curr in zip(norms, norms[1:])
        ]

        if verbose:
            print(f"    Layer 0: ||h||={norms[1]:.2f}, gain={layer_gains[0]:.4f}")
            if n_layers > 1:
                print(
                    f"    Layer {n_layers - 1} (LAST): "
                    f"||h||={norms[-1]:.2f}, gain={layer_gains[-1]:.4f}"
                )

        all_layer_gains.append(layer_gains)
        last_layer_gains.append(layer_gains[-1])
        per_prompt[prompt] = layer_gains[-1]

    # Aggregate across prompts.
    mean_last_gain = np.mean(last_layer_gains)
    std_last_gain = np.std(last_layer_gains)
    avg_layer_gains = np.mean(all_layer_gains, axis=0)

    return {
        'last_layer_gain_mean': float(mean_last_gain),
        'last_layer_gain_std': float(std_last_gain),
        'all_layer_gains': avg_layer_gains.tolist(),
        'per_prompt_gains': per_prompt,
        # Cast to a built-in bool: a numpy.bool_ is not JSON-serializable and
        # would be written out as a string by a `default=str` encoder.
        'is_dampening': bool(mean_last_gain < 1.0),
        'n_layers': n_layers
    }

In [None]:
def compute_rho(config):
    """Derive head-density figures (ρ = n_heads / d_head) from a model config.

    The head count is read from ``num_attention_heads`` (falling back to
    ``n_head``) and the model width from ``hidden_size`` (falling back to
    ``n_embd``). ``d_head`` is the integer quotient of width by head count.

    Returns:
        A dict with keys 'n_heads', 'd_model', 'd_head' and 'rho', or None when
        either the head count or the width is missing or falsy.
    """
    head_count = getattr(config, 'num_attention_heads', None)
    if not head_count:
        head_count = getattr(config, 'n_head', None)

    width = getattr(config, 'hidden_size', None)
    if not width:
        width = getattr(config, 'n_embd', None)

    # Without both figures there is nothing to compute.
    if not (head_count and width):
        return None

    per_head = width // head_count
    return {
        'n_heads': head_count,
        'd_model': width,
        'd_head': per_head,
        'rho': head_count / per_head,
    }

In [None]:
# Choose the model list from the hardware: the full-size models need a GPU
# reporting at least 20 GB, anything else falls back to the small models.
# NOTE(review): the models are later loaded in float32; 20 GB looks low for
# 6-7B parameter checkpoints at that precision - confirm the threshold.
if not torch.cuda.is_available():
    models_to_test = FALLBACK_MODELS
    print("‚ö†Ô∏è No GPU - using fallback models")
else:
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    if gpu_mem >= 20:
        models_to_test = MODELS
        print(f"‚úÖ GPU has {gpu_mem:.1f}GB - testing full models")
    else:
        models_to_test = FALLBACK_MODELS
        print(f"‚ö†Ô∏è GPU has {gpu_mem:.1f}GB - using fallback models")

print(f"\nModels to test: {[m[0] for m in models_to_test]}")

In [None]:
# Run clean validation: load each selected model, measure its residual gains
# and collect the per-model result dicts in `results`, keyed by short name.
results = {}

for name, path in models_to_test:
    print(f"\n{'='*70}")
    print(f"Testing: {name}")
    print(f"{'='*70}")

    # Pre-bind both names so the cleanup in `finally` is valid even when
    # loading fails before they are assigned.
    model = tokenizer = None

    try:
        # Load config first
        config = AutoConfig.from_pretrained(path)
        rho_info = compute_rho(config)

        if rho_info:
            print(f"\nüìê Architecture:")
            print(f"   n_heads = {rho_info['n_heads']}")
            print(f"   d_head = {rho_info['d_head']}")
            print(f"   œÅ = {rho_info['rho']:.4f}")

        # Load model
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.float32,  # Use fp32 for precision!
            device_map="auto" if torch.cuda.is_available() else None,
            low_cpu_mem_usage=True
        )
        model.eval()

        print(f"\nüî¨ Computing residual gains...")
        result = compute_residual_gain_clean(model, tokenizer, CANONICAL_PROMPTS)

        # Add model info
        result['model'] = name
        result['path'] = path
        if rho_info:
            result.update(rho_info)

        # Summary
        status = "DAMPENING üîµ" if result['is_dampening'] else "EXPANSION üî¥"
        print(f"\nüìä RESULT:")
        print(f"   Last Layer Gain: {result['last_layer_gain_mean']:.4f} ¬± {result['last_layer_gain_std']:.4f}")
        print(f"   Status: {status}")

        results[name] = result

    except Exception as e:
        # Best-effort: report the failure and carry on with the next model.
        print(f"‚ùå Error: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Release the model whether or not the run succeeded; otherwise a
        # failed model would stay resident and could make the next load fail.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n\n‚úÖ Tested {len(results)} models")

In [None]:
# Summary table: one row per tested model with its ρ, mean last-layer gain,
# observed regime and whether that regime matches the H25 expectation.
print("\n" + "="*80)
print("CLEAN RESIDUAL GAIN VALIDATION - SUMMARY")
print("="*80)

print(f"\n{'Model':<20} {'œÅ':>10} {'Last Gain':>12} {'Status':>15} {'H25 Pred':>10}")
print("-"*70)

for name, r in results.items():
    rho = r.get('rho', 0)  # models without architecture info are shown with ρ = 0
    gain = r['last_layer_gain_mean']

    if r['is_dampening']:
        status = "DAMPEN"
    else:
        status = "EXPAND"

    # H25 expects dampening when ρ ≥ 0.2 and expansion otherwise; the marker
    # records whether the observation agrees with that expectation.
    if rho >= 0.2:
        h25_pred = r['is_dampening']
    else:
        h25_pred = rho < 0.2 and not r['is_dampening']

    if h25_pred:
        h25_marker = "‚úÖ"
    else:
        h25_marker = "‚ùå"

    print(f"{name:<20} {rho:>10.4f} {gain:>12.4f} {status:>15} {h25_marker:>10}")

In [None]:
# Visualization: a two-panel figure saved to clean_residual_validation.png.
import matplotlib.pyplot as plt

if results:
    fig, (ax_scatter, ax_layers) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: ρ against the mean last-layer gain, one labelled point per model.
    names = list(results.keys())
    rhos = [r.get('rho', 0) for r in results.values()]
    gains = [r['last_layer_gain_mean'] for r in results.values()]
    colors = ['blue' if g < 1.0 else 'red' for g in gains]

    ax_scatter.scatter(rhos, gains, c=colors, s=200, edgecolors='black', linewidth=2, zorder=5)
    for label, x, y in zip(names, rhos, gains):
        ax_scatter.annotate(label, (x, y), xytext=(5, 5), textcoords='offset points', fontsize=10)

    # Reference lines: the G = 1 boundary and the ρ = 0.2 threshold used by H25.
    ax_scatter.axhline(y=1.0, color='black', linestyle='--', alpha=0.7, label='G=1.0 (Bentov)')
    ax_scatter.axvline(x=0.2, color='purple', linestyle=':', alpha=0.7, label='œÅ=0.2')
    ax_scatter.set_xlabel('œÅ = n_heads / d_head', fontsize=12)
    ax_scatter.set_ylabel('Last Layer Residual Gain', fontsize=12)
    ax_scatter.set_title('H25: Does œÅ predict Gain?', fontsize=14, fontweight='bold')
    ax_scatter.legend()
    ax_scatter.grid(True, alpha=0.3)

    # Right panel: prompt-averaged gain for every layer transition, one curve per model.
    for name, r in results.items():
        layer_gains = r['all_layer_gains']
        line_color = 'blue' if r['is_dampening'] else 'red'
        ax_layers.plot(
            list(range(len(layer_gains))),
            layer_gains,
            '-o',
            markersize=2,
            label=f"{name} (œÅ={r.get('rho', 0):.2f})",
            color=line_color,
        )

    ax_layers.axhline(y=1.0, color='black', linestyle='--', alpha=0.5)
    ax_layers.set_xlabel('Layer', fontsize=12)
    ax_layers.set_ylabel('Residual Gain', fontsize=12)
    ax_layers.set_title('Layer-by-Layer Dynamics', fontsize=14, fontweight='bold')
    ax_layers.legend()
    ax_layers.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('clean_residual_validation.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("\nSaved: clean_residual_validation.png")

In [None]:
# H25 verdict: compare the regime predicted from ρ with the measured regime for
# every tested model, then turn the hit rate into an overall verdict.
print("\n" + "="*70)
print("H25 HYPOTHESIS VALIDATION")
print("="*70)
print("\nH25: œÅ = n_heads / d_head ‚â• 0.2 ‚Üí DAMPENING (G < 1.0)")
print("    œÅ < 0.2 ‚Üí EXPANSION (G ‚â• 1.0)")

correct = 0
total = len(results)

for name, r in results.items():
    rho = r.get('rho', 0)  # missing architecture info is treated as ρ = 0
    gain = r['last_layer_gain_mean']

    pred_dampen = rho >= 0.2
    actual_dampen = gain < 1.0

    is_correct = pred_dampen == actual_dampen
    if is_correct:
        correct += 1

    print(f"\n  {name}:")
    print(f"    œÅ = {rho:.4f} ‚Üí Predicted: {'DAMPEN' if pred_dampen else 'EXPAND'}")
    print(f"    G = {gain:.4f} ‚Üí Actual: {'DAMPEN' if actual_dampen else 'EXPAND'}")
    print(f"    {'‚úÖ CORRECT' if is_correct else '‚ùå WRONG'}")

accuracy = 100 * correct / total if total > 0 else 0
print(f"\nüìä H25 Accuracy: {correct}/{total} = {accuracy:.1f}%")

if total == 0:
    # No model produced a result, so the 0% above carries no evidence; do not
    # report the hypothesis as falsified.
    print("\nNo models were tested successfully - H25 cannot be evaluated")
elif accuracy >= 75:
    print("\n‚úÖ H25 VALIDATED")
elif accuracy >= 50:
    print("\n‚ö†Ô∏è H25 PARTIALLY VALIDATED")
else:
    print("\n‚ùå H25 FALSIFIED")

In [None]:
# Save results: write the prompts, methodology and per-model results to a
# timestamped JSON file in the working directory.
def _json_default(value):
    """Convert NumPy scalars to native Python values; stringify anything else."""
    if isinstance(value, np.generic):
        # e.g. numpy.bool_ -> bool, so flags are stored as true/false rather
        # than as the strings "True"/"False".
        return value.item()
    return str(value)


# Take the clock reading once so the filename stamp and the recorded date agree.
run_time = datetime.now()
timestamp = run_time.strftime('%Y%m%d_%H%M%S')

output = {
    'experiment': 'Clean Residual Gain Validation',
    'date': run_time.isoformat(),
    'prompts': CANONICAL_PROMPTS,
    'precision': 'float32',
    'methodology': 'hidden_states[-1] / hidden_states[-2], last token, L2 norm',
    'results': results
}

filename = f'clean_residual_validation_{timestamp}.json'
with open(filename, 'w') as f:
    json.dump(output, f, indent=2, default=_json_default)

print(f"\nSaved: {filename}")

In [None]:
# Download: when running in Google Colab, offer the JSON results and the plot
# as browser downloads; elsewhere just say so.
try:
    from google.colab import files
    for artifact in (filename, 'clean_residual_validation.png'):
        files.download(artifact)
except ImportError:
    print("Not in Colab")

In [None]:
# Final summary: list the files written and restate each model's mean
# last-layer gain together with its regime.
print("\n" + "="*70)
print("CLEAN VALIDATION COMPLETE")
print("="*70)
print(f"\nüìÅ Files:")
print(f"   ‚Ä¢ {filename}")
print(f"   ‚Ä¢ clean_residual_validation.png")
print(f"\nüìä Key Finding:")

for name, r in results.items():
    if r['is_dampening']:
        status = "DAMPENING"
    else:
        status = "EXPANSION"
    print(f"   {name}: G = {r['last_layer_gain_mean']:.4f} ({status})")