# The "Beautiful Ones" Analysis: Per-Head Contribution to Dampening

**Paper #3 Experiment:** Understanding WHY Pythia dampens

**Universe 25 Analogy:**
In Calhoun's mouse utopia experiment, some mice ("Beautiful Ones") withdrew from social interaction and focused only on self-grooming. They were physically perfect but socially non-functional.

**Hypothesis:** In Pythia's crowded feature space (high ρ, where ρ = n_heads / d_head), some attention heads may become "Beautiful Ones" - contributing NEGATIVELY to the residual stream (anti-correlation), causing overall dampening.

**Key Question:** Are there specific heads that SUBTRACT from the residual stream rather than ADD?

**Measurement:**
For each head h in layer L:
- Attn_output_h = head h's contribution to residual
- Correlation with residual growth = sign of contribution
- Negative correlation = "Beautiful One" (withdrawing energy)

In [None]:
# Install dependencies
!pip install transformers torch matplotlib numpy seaborn --quiet

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import json
from datetime import datetime
import warnings
import gc
warnings.filterwarnings('ignore')

# Report the runtime environment so results can be tied to the hardware used.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Only device 0 is inspected.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Models to compare
# Pythia (dampening) vs GPT-J (expansion) - both hosted under EleutherAI,
# expected to show opposite residual-stream behavior.


def select_models(gpu_mem_gb):
    """Choose the model registry and the run list for the available hardware.

    Args:
        gpu_mem_gb: Total memory of GPU 0 in GB (bytes / 1e9), or ``None``
            when no CUDA device is available.

    Returns:
        A ``(models, models_to_test)`` tuple: ``models`` maps a short name to
        its Hugging Face hub id, ``models_to_test`` lists the short names to
        analyze, in order.
    """
    large_models = {
        'pythia-6.9b': 'EleutherAI/pythia-6.9b',   # ρ = 0.25, G ≈ 0.80 (DAMPEN)
        'gpt-j-6b': 'EleutherAI/gpt-j-6B',          # ρ = 0.0625, G ≈ 1.065 (EXPAND)
    }

    if gpu_mem_gb is None:
        # CPU only: fall back to the smallest Pythia checkpoints.
        cpu_models = {
            'pythia-160m': 'EleutherAI/pythia-160m',
            'pythia-70m': 'EleutherAI/pythia-70m',
        }
        return cpu_models, list(cpu_models)

    if gpu_mem_gb >= 20:
        return large_models, list(large_models)

    if gpu_mem_gb >= 18:
        # Registry keeps both 6B entries, but only GPT-J is run in this band.
        return large_models, ['gpt-j-6b']

    print("GPU too small for 6B models. Testing smaller variants...")
    small_models = {
        'pythia-1.4b': 'EleutherAI/pythia-1.4b',
        'pythia-410m': 'EleutherAI/pythia-410m',
    }
    return small_models, list(small_models)


# Select based on GPU memory
if torch.cuda.is_available():
    mem = torch.cuda.get_device_properties(0).total_memory / 1e9
else:
    mem = None

MODELS, MODELS_TO_TEST = select_models(mem)

print(f"Models to test: {MODELS_TO_TEST}")

In [None]:
# Test prompts: short completion-style prefixes. Each model is run once per
# prompt and the per-layer statistics are averaged across prompts.
TEST_PROMPTS = [
    "The capital of France is",
    "Water freezes at",
    "The quick brown fox",
    "Actions speak louder than",
    "In mathematics, pi equals approximately",
]

In [None]:
class PerHeadAnalyzer:
    """
    Capture attention outputs and residual-stream norms with forward hooks.

    Supported layouts: models exposing their blocks as
    ``model.gpt_neox.layers`` (attention at ``layer.attention``) or as
    ``model.transformer.h`` (attention at ``layer.attn``).

    For Pythia (GPT-NeoX architecture):
    - Parallel blocks: attn and mlp see same input
    - residual = x + attn(x) + mlp(x)

    For every layer the analyzer records:
    - the attention module's output, split into ``n_heads`` equal chunks
      along the hidden dimension, and the mean L2 norm of each chunk;
    - the L2 norm of the block's input and output at the last token.

    NOTE(review): the chunks are slices of the attention module's *output*.
    In the Hugging Face GPT-NeoX / GPT-J modules that output appears to be
    taken after the output projection, which mixes heads - confirm. If so,
    each chunk is a slice of the hidden vector rather than one head's
    isolated contribution, and the norms are only a proxy.
    """

    def __init__(self, model, tokenizer, n_layers, n_heads):
        """Store the model/tokenizer and create empty capture buffers.

        Args:
            model: Causal LM to instrument.
            tokenizer: Tokenizer used to encode prompts for ``model``.
            n_layers: Number of transformer blocks to hook.
            n_heads: Number of attention heads per block.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.hooks = []  # handles returned by register_forward_hook
        self.head_outputs = {}  # {layer: (batch, seq, n_heads, d_head)}
        self.residual_before = {}  # {layer: residual before this layer}
        self.residual_after = {}   # {layer: residual after this layer}

    def _hook_attention_output(self, layer_idx):
        """Build a forward hook that stores the attention output per head chunk."""
        def hook(module, inputs, output):
            # The module may return a tuple; the first element is the
            # (batch, seq, hidden) attention output.
            attn_output = output[0] if isinstance(output, tuple) else output
            batch, seq, hidden = attn_output.shape
            d_head = hidden // self.n_heads

            # reshape (rather than view) also accepts non-contiguous tensors.
            per_head = attn_output.reshape(batch, seq, self.n_heads, d_head)
            self.head_outputs[layer_idx] = per_head.detach().cpu()

        return hook

    def _hook_residual(self, layer_idx, position):
        """Build a forward hook that stores the block input or output.

        Args:
            layer_idx: Index the captured tensor is stored under.
            position: ``'before'`` stores the block's first positional input;
                any other value stores the block's (first) output.
        """
        def hook(module, inputs, output):
            if position == 'before':
                tensor = inputs[0] if isinstance(inputs, tuple) else inputs
                self.residual_before[layer_idx] = tensor.detach().cpu()
            else:
                tensor = output[0] if isinstance(output, tuple) else output
                self.residual_after[layer_idx] = tensor.detach().cpu()

        return hook

    def _layer_modules(self, layer_idx):
        """Return ``(block, attention_module)`` for ``layer_idx``.

        Raises:
            ValueError: If the model has neither ``gpt_neox`` nor
                ``transformer``.
        """
        if hasattr(self.model, 'gpt_neox'):
            layer = self.model.gpt_neox.layers[layer_idx]
            return layer, layer.attention
        if hasattr(self.model, 'transformer'):
            layer = self.model.transformer.h[layer_idx]
            return layer, layer.attn
        raise ValueError(f"Unknown model architecture")

    def register_hooks(self):
        """Attach attention and residual hooks to every layer.

        Previously registered hooks are removed first, so calling this twice
        does not double-capture.
        """
        self.remove_hooks()

        for layer_idx in range(self.n_layers):
            layer, attn = self._layer_modules(layer_idx)

            # Attention output, then block input and block output.
            self.hooks.append(
                attn.register_forward_hook(self._hook_attention_output(layer_idx))
            )
            self.hooks.append(
                layer.register_forward_hook(self._hook_residual(layer_idx, 'before'))
            )
            self.hooks.append(
                layer.register_forward_hook(self._hook_residual(layer_idx, 'after'))
            )

    def remove_hooks(self):
        """Detach every hook registered by this analyzer."""
        for hook in self.hooks:
            hook.remove()
        self.hooks = []

    def clear(self):
        """Drop all tensors captured during the previous forward pass."""
        self.head_outputs = {}
        self.residual_before = {}
        self.residual_after = {}

    def analyze_prompt(self, prompt):
        """Run one forward pass on ``prompt`` and return per-layer statistics.

        Returns:
            The list produced by ``_compute_head_contributions``.
        """
        self.clear()

        inputs = self.tokenizer(prompt, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            _ = self.model(**inputs, output_hidden_states=True)

        return self._compute_head_contributions()

    @staticmethod
    def _last_token_norm(residual):
        """Return the L2 norm at the last sequence position, averaged over batch."""
        return torch.norm(residual[:, -1, :].float(), dim=-1).mean().item()

    def _compute_head_contributions(self):
        """Summarize the captured tensors for each hooked layer.

        Returns:
            A list with one dict per layer that has a captured attention
            output, containing:
            - ``'layer'``: layer index;
            - ``'head_norms'``: numpy array of length ``n_heads`` with the
              mean (over batch and sequence) L2 norm of each head chunk;
            - ``'residual_growth'``: last-token norm after minus before
              (``0`` when either residual was not captured);
            - ``'residual_growth_ratio'``: last-token norm after / before
              (``1.0`` when either residual was not captured).
        """
        results = []

        for layer_idx in range(self.n_layers):
            if layer_idx not in self.head_outputs:
                continue

            head_out = self.head_outputs[layer_idx]  # (batch, seq, n_heads, d_head)

            # Norm over d_head, then average over batch and sequence -> (n_heads,)
            mean_norms = torch.norm(head_out.float(), dim=-1).mean(dim=(0, 1))

            res_before = self.residual_before.get(layer_idx)
            res_after = self.residual_after.get(layer_idx)

            if res_before is not None and res_after is not None:
                norm_before = self._last_token_norm(res_before)
                norm_after = self._last_token_norm(res_after)
                residual_growth = norm_after - norm_before
                # Epsilon guards against a zero-norm input.
                residual_growth_ratio = norm_after / (norm_before + 1e-10)
            else:
                # Neutral values; never reuse norms left over from another layer.
                residual_growth = 0
                residual_growth_ratio = 1.0

            results.append({
                'layer': layer_idx,
                'head_norms': mean_norms.numpy(),
                'residual_growth': residual_growth,
                'residual_growth_ratio': residual_growth_ratio,
            })

        return results

In [None]:
def aggregate_layer_results(all_results, n_layers):
    """Average per-prompt layer statistics into one entry per layer.

    Args:
        all_results: One list per prompt of dicts with keys ``'layer'``,
            ``'head_norms'``, ``'residual_growth'`` and
            ``'residual_growth_ratio'``.
        n_layers: Number of layers; only indices in ``range(n_layers)`` are
            reported.

    Returns:
        A list, ordered by layer index, of dicts with keys ``'layer'``,
        ``'head_norms'``, ``'residual_growth'`` and ``'residual_ratio'``.
        Layers with no data in any prompt are omitted.
    """
    # Group by the recorded layer index rather than list position, so a
    # layer missing from one prompt cannot shift the others.
    by_layer = {}
    for prompt_results in all_results:
        for entry in prompt_results:
            by_layer.setdefault(entry['layer'], []).append(entry)

    aggregated = []
    for layer_idx in range(n_layers):
        layer_data = by_layer.get(layer_idx)
        if not layer_data:
            continue
        aggregated.append({
            'layer': layer_idx,
            'head_norms': np.mean([d['head_norms'] for d in layer_data], axis=0),
            'residual_growth': np.mean([d['residual_growth'] for d in layer_data]),
            'residual_ratio': np.mean([d['residual_growth_ratio'] for d in layer_data]),
        })
    return aggregated


def find_low_contribution_heads(head_norms):
    """Return indices of heads whose norm is below ``mean - 1 * std``."""
    head_norms = np.asarray(head_norms)
    threshold = np.mean(head_norms) - np.std(head_norms)
    return np.where(head_norms < threshold)[0]


def analyze_model(model_name, model_path):
    """Full per-head analysis for a model.

    Loads the model, runs every prompt in ``TEST_PROMPTS`` through a
    ``PerHeadAnalyzer``, averages the per-layer statistics across prompts and
    flags low-norm heads ("Beautiful Ones") in the last aggregated layer.

    Args:
        model_name: Short display name, stored in the result.
        model_path: Identifier passed to the ``from_pretrained`` loaders.

    Returns:
        A dict with keys ``'model'``, ``'n_layers'``, ``'n_heads'``,
        ``'d_head'``, ``'rho'`` (n_heads / d_head), ``'layers'``,
        ``'beautiful_ones_last_layer'`` (list of head indices) and
        ``'last_layer_residual_ratio'`` (``None`` when no layer data was
        captured).
    """
    print(f"\n{'='*60}")
    print(f"Analyzing: {model_name}")
    print(f"{'='*60}")

    # Load model (fp16 with automatic device placement on GPU, fp32 on CPU)
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True
    )
    model.eval()

    n_layers = config.num_hidden_layers
    n_heads = config.num_attention_heads
    d_head = config.hidden_size // n_heads
    rho = n_heads / d_head

    print(f"Layers: {n_layers}, Heads: {n_heads}, d_head: {d_head}")
    print(f"ρ = {rho:.4f}")

    # Run all prompts; always detach hooks and drop the model afterwards so a
    # failing prompt does not leave this model resident for the next one.
    analyzer = PerHeadAnalyzer(model, tokenizer, n_layers, n_heads)
    try:
        analyzer.register_hooks()
        all_results = [analyzer.analyze_prompt(prompt) for prompt in TEST_PROMPTS]
    finally:
        analyzer.remove_hooks()
        del model, tokenizer, analyzer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Aggregate across prompts
    aggregated = aggregate_layer_results(all_results, n_layers)

    # Identify "Beautiful Ones": heads in the last aggregated layer whose mean
    # norm is more than one standard deviation below that layer's mean.
    last_layer = aggregated[-1] if aggregated else None
    beautiful_ones = np.array([], dtype=int)

    if last_layer is not None:
        mean_norm = np.mean(last_layer['head_norms'])
        std_norm = np.std(last_layer['head_norms'])
        beautiful_ones = find_low_contribution_heads(last_layer['head_norms'])

        print(f"\n📊 Last Layer Analysis:")
        print(f"   Residual Ratio: {last_layer['residual_ratio']:.4f}")
        print(f"   Head Norm Mean: {mean_norm:.4f}")
        print(f"   Head Norm Std: {std_norm:.4f}")
        print(f"   'Beautiful Ones' (low contrib): {len(beautiful_ones)} heads")
        if len(beautiful_ones) > 0:
            print(f"   Head indices: {beautiful_ones.tolist()}")

    return {
        'model': model_name,
        'n_layers': n_layers,
        'n_heads': n_heads,
        'd_head': d_head,
        'rho': rho,
        'layers': aggregated,
        'beautiful_ones_last_layer': beautiful_ones.tolist(),
        'last_layer_residual_ratio': (
            float(last_layer['residual_ratio']) if last_layer is not None else None
        ),
    }

In [None]:
# Run the per-head analysis for every selected model. A failure on one model
# is reported together with its traceback and does not stop the others.
import traceback

all_model_results = {}

for name in MODELS_TO_TEST:
    path = MODELS[name]
    try:
        all_model_results[name] = analyze_model(name, path)
    except Exception as err:
        print(f"Error analyzing {name}: {err}")
        traceback.print_exc()

print(f"\n\nAnalyzed: {len(all_model_results)} models")

In [None]:
# Visualization: Head Norm Heatmaps (one panel per analyzed model)
n_models = len(all_model_results)

if n_models > 0:
    # squeeze=False always yields a 2-D axes array, so one model needs no
    # special case.
    fig, axes = plt.subplots(1, n_models, figsize=(8 * n_models, 8), squeeze=False)
    axes = axes[0]

    for ax, (name, results) in zip(axes, all_model_results.items()):
        n_layers = results['n_layers']
        n_heads = results['n_heads']

        # Build heatmap matrix: layers x heads (rows stay zero for layers
        # without data).
        heatmap = np.zeros((n_layers, n_heads))
        for layer_data in results['layers']:
            norms = layer_data['head_norms']
            heatmap[layer_data['layer'], :len(norms)] = norms

        # Normalize per layer for visibility; epsilon avoids 0/0 on empty rows.
        heatmap_norm = heatmap / (heatmap.max(axis=1, keepdims=True) + 1e-10)

        sns.heatmap(heatmap_norm, ax=ax, cmap='viridis', cbar_kws={'label': 'Relative Contribution'})
        ax.set_xlabel('Head Index')
        ax.set_ylabel('Layer')

        # The ratio is None when no layer data was captured for this model.
        last_ratio = results['last_layer_residual_ratio']
        ratio_text = f"{last_ratio:.4f}" if last_ratio is not None else "n/a"
        ax.set_title(f'{name}\nρ = {results["rho"]:.4f}, Last Ratio = {ratio_text}')

        # Mark "Beautiful Ones" on the row of the layer they were computed
        # for (the last layer that has data).
        marked_row = results['layers'][-1]['layer'] if results['layers'] else n_layers - 1
        for head_idx in results['beautiful_ones_last_layer']:
            ax.add_patch(plt.Rectangle((head_idx, marked_row), 1, 1, fill=False,
                                        edgecolor='red', linewidth=2))

    plt.suptitle('Per-Head Contribution Heatmaps (Red boxes = "Beautiful Ones")', fontsize=14)
    plt.tight_layout()
    plt.savefig('beautiful_ones_heatmap.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("\nSaved: beautiful_ones_heatmap.png")

In [None]:
# Residual Growth Profile: per-layer residual ratio for every analyzed model.
# Checks all_model_results directly so this cell does not depend on a
# variable defined in another cell.
if all_model_results:
    fig, ax = plt.subplots(figsize=(12, 6))

    for name, results in all_model_results.items():
        layers = [d['layer'] for d in results['layers']]
        ratios = [d['residual_ratio'] for d in results['layers']]

        # Blue = last-layer ratio below 1 (dampening), red otherwise.
        # The ratio is None when no layer data was captured.
        last_ratio = results['last_layer_residual_ratio']
        is_dampening = last_ratio is not None and last_ratio < 1.0
        color = 'blue' if is_dampening else 'red'
        ax.plot(layers, ratios, '-o', label=f"{name} (ρ={results['rho']:.3f})", color=color, markersize=3)

    ax.axhline(y=1.0, color='black', linestyle='--', alpha=0.5, label='G=1.0 (Bentov Point)')
    ax.set_xlabel('Layer')
    ax.set_ylabel('Residual Ratio (||h_l|| / ||h_{l-1}||)')
    ax.set_title('Residual Stream Dynamics by Layer')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('residual_growth_profile.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("\nSaved: residual_growth_profile.png")

In [None]:
# Beautiful Ones Summary: one text report per analyzed model.
print("\n" + "=" * 70)
print("'BEAUTIFUL ONES' ANALYSIS SUMMARY")
print("=" * 70)

for name, results in all_model_results.items():
    print(f"\n📊 {name}:")
    print(f"   ρ = {results['rho']:.4f}")
    print(f"   Layers: {results['n_layers']}, Heads: {results['n_heads']}")

    last_ratio = results['last_layer_residual_ratio']
    if last_ratio is None:
        # No layer data was captured, so the model cannot be classified.
        print("   Last Layer Ratio: n/a")
    else:
        print(f"   Last Layer Ratio: {last_ratio:.4f}")
        status = "DAMPENING" if last_ratio < 1.0 else "EXPANSION"
        print(f"   Status: {status}")

    n_beautiful = len(results['beautiful_ones_last_layer'])
    pct_beautiful = 100 * n_beautiful / results['n_heads']
    print(f"   'Beautiful Ones' in Last Layer: {n_beautiful}/{results['n_heads']} ({pct_beautiful:.1f}%)")

    if n_beautiful > 0:
        print(f"   Heads: {results['beautiful_ones_last_layer']}")

In [None]:
# Correlation Analysis: Beautiful Ones % vs Dampening
if len(all_model_results) >= 2:
    from scipy import stats

    model_stats = list(all_model_results.values())
    rhos = [r['rho'] for r in model_stats]
    beautiful_pcts = [100 * len(r['beautiful_ones_last_layer']) / r['n_heads'] for r in model_stats]
    residual_ratios = [r['last_layer_residual_ratio'] for r in model_stats]

    print("\n" + "=" * 60)
    print("CORRELATION ANALYSIS")
    print("=" * 60)

    # ρ vs Beautiful Ones %
    corr1, p1 = stats.pearsonr(rhos, beautiful_pcts)
    print(f"\nρ vs Beautiful Ones %: r = {corr1:.4f} (p = {p1:.4e})")

    # Beautiful Ones % vs Dampening
    corr2, p2 = stats.pearsonr(beautiful_pcts, residual_ratios)
    print(f"Beautiful Ones % vs Residual Ratio: r = {corr2:.4f} (p = {p2:.4e})")

    # With only two models Pearson's r is +1 or -1 by construction, so the
    # sign pattern is not evidence; require at least three data points
    # before reporting the pattern as confirmed.
    enough_points = len(model_stats) >= 3
    if enough_points and corr1 > 0 and corr2 < 0:
        print("\n✅ Pattern Confirmed: Higher ρ → More 'Beautiful Ones' → More Dampening")
    else:
        print("\n⚠️ Pattern not clearly confirmed (need more data points)")

In [None]:
# Save results
# Take one timestamp so the filename and the recorded date describe the same
# instant.
run_time = datetime.now()
timestamp = run_time.strftime('%Y%m%d_%H%M%S')

# Convert numpy arrays/scalars to plain Python values for JSON serialization
serializable_results = {}
for name, results in all_model_results.items():
    serializable_results[name] = {
        **results,
        'layers': [
            {
                'layer': d['layer'],
                'head_norms': np.asarray(d['head_norms']).tolist(),
                'residual_growth': float(d['residual_growth']),
                'residual_ratio': float(d['residual_ratio']),
            }
            for d in results['layers']
        ],
    }

output_data = {
    'experiment': 'Beautiful Ones Per-Head Analysis',
    'hypothesis': 'High ρ → More "Beautiful Ones" (low-contrib heads) → Dampening',
    'date': run_time.isoformat(),
    'n_prompts': len(TEST_PROMPTS),
    'models': serializable_results
}

filename = f'beautiful_ones_analysis_{timestamp}.json'
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2)

print(f"\nSaved: {filename}")

In [None]:
# Auto-download
import os
import zipfile

archive_name = f'beautiful_ones_analysis_{timestamp}.zip'

# The PNGs are only written when at least one model was analyzed, so bundle
# (and later download) only the files that actually exist.
expected_files = [filename, 'beautiful_ones_heatmap.png', 'residual_growth_profile.png']
output_files = [path for path in expected_files if os.path.exists(path)]
for path in expected_files:
    if path not in output_files:
        print(f"Skipping missing file: {path}")

with zipfile.ZipFile(archive_name, 'w') as zf:
    for path in output_files:
        zf.write(path)

print(f"Created archive: {archive_name}")

# Only the import is guarded: outside Colab the module is absent and the
# files have to be fetched manually.
try:
    from google.colab import files
except ImportError:
    print("Not in Colab - manual download required.")
else:
    for path in output_files + [archive_name]:
        files.download(path)

In [None]:
# Final Summary: compact recap of every analyzed model and the output files.
print("\n" + "=" * 70)
print("FINAL SUMMARY: Beautiful Ones Analysis")
print("=" * 70)

print(f"\n📊 Models Analyzed: {len(all_model_results)}")

for name, results in all_model_results.items():
    last_ratio = results['last_layer_residual_ratio']
    n_beautiful = len(results['beautiful_ones_last_layer'])

    if last_ratio is None:
        # No layer data was captured, so the model cannot be classified.
        print(f"\n  {name}: n/a")
        print(f"    ρ = {results['rho']:.4f}")
        print("    Last Layer Ratio = n/a")
    else:
        status = "🔵 DAMPEN" if last_ratio < 1.0 else "🔴 EXPAND"
        print(f"\n  {name}: {status}")
        print(f"    ρ = {results['rho']:.4f}")
        print(f"    Last Layer Ratio = {last_ratio:.4f}")
    print(f"    Beautiful Ones = {n_beautiful}/{results['n_heads']}")

print(f"\n📁 Output Files:")
print(f"   • {filename}")
print(f"   • beautiful_ones_heatmap.png")
print(f"   • residual_growth_profile.png")
print(f"   • {archive_name}")