# Cross-Architecture Validation: Gemma & LLaMA

**Experiment:** Validate Scaling Law and Architectural Patterns across Model Families

**Key Questions:**
1. Is Attention universally contractive across architectures?
2. Does the final MLP always expand?
3. Does the Scaling Law (α ≈ 0.27) hold for non-Pythia models?
4. Do Gemma/LLaMA show FUNNEL/HOUR-GLASS/VASE patterns?

**Models to Test:**
- Gemma-2B (RMSNorm, GeGLU, RoPE)
- Gemma-7B (if GPU allows)
- LLaMA-7B or Mistral-7B

**Hypothesis:**
- RMSNorm (Gemma) might show different contraction patterns than LayerNorm (Pythia)
- But universal principles should hold: Attention contracts, Final MLP explodes

In [None]:
# Cell 1: Imports and Setup
import torch
import numpy as np
import json
from datetime import datetime
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Report the runtime environment and record the accelerator's name and its
# total memory (in units of 1e9 bytes). Both names are read by later cells.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    # No accelerator: use placeholder values so later cells can still run.
    gpu_name, gpu_mem = "CPU", 0
    print("Running on CPU - will be slow!")
else:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_mem:.1f} GB")

In [None]:
# Cell 2: Model Selection based on GPU Memory

# Define available models with their requirements
#
# Each entry is keyed by a short model name and holds:
#   hf_name   - Hugging Face repository id passed to from_pretrained() in load_model
#   params    - parameter count, fed into the Pythia scaling-law prediction
#   layers    - number of decoder layers, read by ActivationCapturer.get_gains
#   memory_gb - nominal GPU memory requirement; informational only here, since
#               select_models uses its own hard-coded thresholds
#   family    - selects the module layout in ActivationCapturer and the plot color
#   norm/mlp  - architecture labels copied into the results and printed summaries
MODELS = {
    # Gemma family
    'gemma-2b': {
        'hf_name': 'google/gemma-2b',
        'params': 2e9,
        'layers': 18,
        'memory_gb': 8,
        'family': 'gemma',
        'norm': 'RMSNorm',
        'mlp': 'GeGLU'
    },
    'gemma-7b': {
        'hf_name': 'google/gemma-7b',
        'params': 7e9,
        'layers': 28,
        'memory_gb': 20,
        'family': 'gemma',
        'norm': 'RMSNorm',
        'mlp': 'GeGLU'
    },
    # LLaMA family
    'llama-7b': {
        'hf_name': 'meta-llama/Llama-2-7b-hf',
        'params': 7e9,
        'layers': 32,
        'memory_gb': 20,
        'family': 'llama',
        'norm': 'RMSNorm',
        'mlp': 'SwiGLU'
    },
    # Mistral (alternative to LLaMA; preferred by select_models because no login
    # is required to download it - this is about access, not about MLP gating)
    'mistral-7b': {
        'hf_name': 'mistralai/Mistral-7B-v0.1',
        'params': 7e9,
        'layers': 32,
        'memory_gb': 20,
        'family': 'mistral',
        'norm': 'RMSNorm',
        'mlp': 'SwiGLU'
    },
    # Smaller alternatives (picked by select_models when no 7B model fits)
    'phi-2': {
        'hf_name': 'microsoft/phi-2',
        'params': 2.7e9,
        'layers': 32,
        'memory_gb': 8,
        'family': 'phi',
        'norm': 'LayerNorm',
        'mlp': 'MLP'
    },
    'stablelm-3b': {
        'hf_name': 'stabilityai/stablelm-3b-4e1t',
        'params': 3e9,
        'layers': 32,
        'memory_gb': 10,
        'family': 'stablelm',
        'norm': 'LayerNorm',
        'mlp': 'MLP'
    }
}

# Choose which models to analyze for a given GPU memory budget
def select_models(available_memory_gb):
    """Return the model keys to test given ``available_memory_gb`` of GPU memory.

    Gemma-2B is included from 8 GB upward. On top of that exactly one tier
    applies: from 20 GB Mistral-7B (plus Gemma-7B from 40 GB), from 10 GB
    StableLM-3B, from 8 GB Phi-2. Below 8 GB the list is empty.
    """
    picks = ['gemma-2b'] if available_memory_gb >= 8 else []

    if available_memory_gb >= 20:
        # Mistral is preferred over LLaMA because it needs no login
        picks.append('mistral-7b')
        if available_memory_gb >= 40:
            picks.append('gemma-7b')
        return picks

    # No room for a 7B model: fall back to one smaller comparison model
    if available_memory_gb >= 10:
        picks.append('stablelm-3b')
    elif available_memory_gb >= 8:
        picks.append('phi-2')
    return picks

# Pick the models automatically: by GPU memory when CUDA is present,
# otherwise just Gemma-2B as the CPU fallback.
selected_models = select_models(gpu_mem) if torch.cuda.is_available() else ['gemma-2b']

print(f"\nSelected models for testing:")
for m in selected_models:
    # One summary line per selected model, taken from the MODELS table
    info = MODELS[m]
    print(f"  - {m}: {info['params']/1e9:.1f}B params, {info['layers']} layers, {info['norm']}, {info['mlp']}")

In [None]:
# Cell 3: Load Models and Tokenizers
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_key):
    """Load the model and tokenizer registered under ``model_key`` in ``MODELS``.

    Returns ``(model, tokenizer, info)`` with the model in eval mode, or
    ``(None, None, None)`` if anything raised while loading (the error is
    printed, not re-raised).
    """
    info = MODELS[model_key]
    hf_name = info['hf_name']

    print(f"\nLoading {model_key} ({hf_name})...")

    on_gpu = torch.cuda.is_available()

    # Half precision only on GPUs reporting less than 40 GB; full precision
    # on larger GPUs and on CPU.
    dtype = torch.float32 if (not on_gpu or gpu_mem >= 40) else torch.float16
    device = 'cuda' if on_gpu else 'cpu'

    try:
        tokenizer = AutoTokenizer.from_pretrained(hf_name, trust_remote_code=True)
        load_kwargs = dict(
            torch_dtype=dtype,
            device_map='auto' if on_gpu else None,
            trust_remote_code=True,
        )
        model = AutoModelForCausalLM.from_pretrained(hf_name, **load_kwargs)

        # With device_map='auto' placement is already done; on CPU move explicitly
        if not on_gpu:
            model = model.to(device)

        model.eval()
        print(f"  Loaded successfully! dtype={dtype}, device={device}")
    except Exception as e:
        print(f"  Failed to load: {e}")
        return None, None, None

    return model, tokenizer, info

In [None]:
# Cell 4: Hook System for Capturing Activations (FIXED for Gemma/LLaMA)

class ActivationCapturer:
    """Record input/output activation norms of every attention and MLP module.

    A forward pre-hook stores the norm of each module's input and a forward
    hook stores the norm of its output together with the resulting gain
    (output norm / input norm). Norms are taken over the whole tensor with
    ``Tensor.norm()`` after casting to float32.

    Models that pass ``hidden_states`` as a keyword argument (Gemma, LLaMA,
    Mistral) are supported by registering the pre-hook with
    ``with_kwargs=True``; a positional-only hook is used when the installed
    PyTorch rejects that argument.
    """

    def __init__(self, model, model_info):
        self.model = model
        self.model_info = model_info
        self.hooks = []
        self.activations = {}
        self.input_norms = {}  # name -> input norm stored by the pre-hooks
        self.n_hooked_layers = 0  # layers instrumented by the last register_hooks()

    def _get_layer_modules(self):
        """Return ``[(attention_module, mlp_module), ...]`` for the model family.

        Returns an empty list (after printing a warning) when the family is
        unknown or the expected attributes are missing.
        """
        family = self.model_info['family']

        try:
            if family in ('gemma', 'llama', 'mistral', 'stablelm'):
                return [(l.self_attn, l.mlp) for l in self.model.model.layers]
            if family == 'phi':
                # Phi-2 has different structure
                if hasattr(self.model, 'model'):
                    layers = self.model.model.layers
                else:
                    layers = self.model.transformer.h
                return [(l.self_attn if hasattr(l, 'self_attn') else l.attn,
                         l.mlp) for l in layers]
            raise ValueError(f"Unknown model family: {family}")
        except Exception as e:
            print(f"  Warning: Could not get layer modules: {e}")
            return []

    def _extract_hidden_states(self, args, kwargs):
        """Return the input tensor from ``kwargs['hidden_states']`` or ``args[0]``.

        Returns ``None`` when neither holds a tensor.
        """
        # Try kwargs first (Gemma, newer transformers)
        candidate = kwargs.get('hidden_states')
        if isinstance(candidate, torch.Tensor):
            return candidate

        # Otherwise the first positional argument is usually hidden_states
        if args and isinstance(args[0], torch.Tensor):
            return args[0]

        return None

    def _make_pre_hook(self, name):
        """Create a pre-hook that stores the input norm under ``name``."""
        def hook(module, args, kwargs=None):
            # Handle both old-style (args only) and new-style (args, kwargs) hooks
            if kwargs is None:
                kwargs = {}

            hidden_states = self._extract_hidden_states(args, kwargs)

            if hidden_states is not None:
                with torch.no_grad():
                    self.input_norms[name] = hidden_states.float().norm().item()
            else:
                # 0.0 marks "input not captured"; the post-hook turns it into gain 0.0
                self.input_norms[name] = 0.0

        return hook

    def _make_post_hook(self, name):
        """Create a post-hook that stores input norm, output norm and gain."""
        def hook(module, args, output):
            # Modules may return a tuple whose first element is the hidden state
            out_tensor = output[0] if isinstance(output, tuple) else output

            with torch.no_grad():
                out_norm = out_tensor.float().norm().item()

            in_norm = self.input_norms.get(name, 0.0)

            self.activations[name] = {
                'input_norm': in_norm,
                'output_norm': out_norm,
                'gain': out_norm / in_norm if in_norm > 1e-8 else 0.0
            }
        return hook

    def _hook_module(self, module, name):
        """Attach the pre/post hook pair for ``name`` to ``module``."""
        try:
            # Try new-style hook with kwargs
            pre = module.register_forward_pre_hook(
                self._make_pre_hook(name),
                with_kwargs=True
            )
        except TypeError:
            # Fallback for older PyTorch
            pre = module.register_forward_pre_hook(self._make_pre_hook_legacy(name))

        post = module.register_forward_hook(self._make_post_hook(name))
        self.hooks.extend([pre, post])

    def register_hooks(self):
        """Register hooks on all attention and MLP layers.

        Hooks left over from an earlier call are removed first, so calling
        this twice does not attach duplicate hooks.
        """
        self.remove_hooks()
        self.n_hooked_layers = 0

        layer_modules = self._get_layer_modules()

        if not layer_modules:
            print("  Warning: No layer modules found!")
            return

        for i, (attn, mlp) in enumerate(layer_modules):
            self._hook_module(attn, f'attn_{i}')
            self._hook_module(mlp, f'mlp_{i}')

        self.n_hooked_layers = len(layer_modules)
        print(f"  Registered {len(self.hooks)} hooks on {len(layer_modules)} layers")

    def _make_pre_hook_legacy(self, name):
        """Legacy pre-hook for older PyTorch versions (positional args only)."""
        def hook(module, args):
            if args and len(args) > 0 and isinstance(args[0], torch.Tensor):
                with torch.no_grad():
                    self.input_norms[name] = args[0].float().norm().item()
            else:
                self.input_norms[name] = 0.0
        return hook

    def remove_hooks(self):
        """Remove all hooks."""
        for h in self.hooks:
            h.remove()
        self.hooks = []

    def clear_activations(self):
        """Clear captured activations."""
        self.activations = {}
        self.input_norms = {}

    def get_gains(self):
        """Return ``(attn_gains, mlp_gains)`` ordered by layer index.

        The scan covers the larger of the layer count declared in
        ``model_info`` and the number of layers actually hooked, so layers
        are not dropped when the declared count is too small. Layers without
        a recorded activation are skipped.
        """
        n_layers = max(self.model_info.get('layers', 0),
                       getattr(self, 'n_hooked_layers', 0))

        # A gain of 0.0 means "input not captured", not "perfect contraction";
        # say so, because a 0.0 still passes a `gain < 1.0` check downstream.
        unmeasured = [name for name, rec in self.activations.items()
                      if rec.get('input_norm', 1.0) <= 1e-8]
        if unmeasured:
            print(f"  Warning: {len(unmeasured)} module(s) had no measurable input; "
                  f"their gain is reported as 0.0")

        attn_gains = []
        mlp_gains = []

        for i in range(n_layers):
            attn_key = f'attn_{i}'
            mlp_key = f'mlp_{i}'

            if attn_key in self.activations:
                attn_gains.append(self.activations[attn_key]['gain'])
            if mlp_key in self.activations:
                mlp_gains.append(self.activations[mlp_key]['gain'])

        return attn_gains, mlp_gains

print("ActivationCapturer class defined with kwargs support for Gemma/LLaMA/Mistral")

In [None]:
# Cell 5: Run Analysis on a Model

def analyze_model(model_key):
    """Load one model, run a single forward pass and summarize per-layer gains.

    Args:
        model_key: key into ``MODELS``.

    Returns:
        A dict of plain Python values (model metadata, per-layer attention /
        MLP / combined gains, summary statistics and the boolean
        "universal" tests), or ``None`` if the model could not be loaded or
        no gains were captured.

    Hooks are removed and the model reference is released in a ``finally``
    block, so a failing forward pass does not leave hooks attached or keep
    the model alive. Exceptions from the forward pass still propagate.
    """

    # Load model
    model, tokenizer, info = load_model(model_key)
    if model is None:
        return None

    capturer = ActivationCapturer(model, info)
    try:
        capturer.register_hooks()

        # Test prompt
        prompt = "The capital of France is"
        inputs = tokenizer(prompt, return_tensors='pt')

        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Forward pass; only the hook side effects are needed, so the output
        # is not bound to a name (that would keep its tensors alive).
        print(f"  Running forward pass...")
        with torch.no_grad():
            model(**inputs)

        # Extract gains
        attn_gains, mlp_gains = capturer.get_gains()
    finally:
        # Cleanup. The capturer also references the model, so drop that
        # reference too before asking CUDA to release cached memory.
        capturer.remove_hooks()
        capturer.model = None
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Check if we got valid data
    if not attn_gains or not mlp_gains:
        print(f"  Warning: No gains captured! attn={len(attn_gains)}, mlp={len(mlp_gains)}")
        return None

    def count_contracting(gains):
        """Number of gains strictly below 1.0."""
        return sum(1 for g in gains if g < 1.0)

    def pct(count, total):
        """``count`` as a percentage of ``total``."""
        return float(100 * count / total)

    # Combined gains (zip stops at the shorter list if the two ever differ)
    combined_gains = [a * m for a, m in zip(attn_gains, mlp_gains)]

    # Each percentage is taken over the length of its own list
    n_layers = len(attn_gains)
    n_mlp = len(mlp_gains)
    n_combined = len(combined_gains)
    attn_contracting = count_contracting(attn_gains)
    mlp_contracting = count_contracting(mlp_gains)
    combined_contracting = count_contracting(combined_gains)

    # Explicit Python type conversion for all values
    results = {
        'model': str(model_key),
        'family': str(info['family']),
        'params': float(info['params']),
        'n_layers': int(n_layers),
        'norm_type': str(info['norm']),
        'mlp_type': str(info['mlp']),
        'attn_gains': [float(g) for g in attn_gains],
        'mlp_gains': [float(g) for g in mlp_gains],
        'combined_gains': [float(g) for g in combined_gains],
        'statistics': {
            'attn_contracting_pct': pct(attn_contracting, n_layers),
            'mlp_contracting_pct': pct(mlp_contracting, n_mlp),
            'combined_contracting_pct': pct(combined_contracting, n_combined),
            'attn_min': float(min(attn_gains)),
            'attn_max': float(max(attn_gains)),
            'mlp_min': float(min(mlp_gains)),
            'mlp_max': float(max(mlp_gains)),
            'last_mlp_gain': float(mlp_gains[-1]),
            'last_attn_gain': float(attn_gains[-1]),
            'last_combined_gain': float(combined_gains[-1])
        },
        'universal_tests': {
            'attention_always_contracts': bool(attn_contracting == n_layers),
            'attention_mostly_contracts': bool((attn_contracting / n_layers) >= 0.95),
            'last_mlp_expands': bool(mlp_gains[-1] > 1.0),
            'last_layer_net_expands': bool(combined_gains[-1] > 1.0)
        }
    }

    # Print summary
    print(f"\n  === {model_key} Results ===")
    print(f"  Attention contracting: {attn_contracting}/{n_layers} ({100*attn_contracting/n_layers:.1f}%)")
    print(f"  MLP contracting: {mlp_contracting}/{n_mlp} ({100*mlp_contracting/n_mlp:.1f}%)")
    print(f"  Last layer MLP gain: {mlp_gains[-1]:.2f}x")
    print(f"  Last layer combined: {combined_gains[-1]:.2f}x")

    return results

In [None]:
# Cell 6: Run Analysis on All Selected Models

# model key -> results dict, for every model whose analysis succeeded
all_results = {}

for model_key in selected_models:
    print(f"\n{'='*60}", f"Analyzing {model_key}...", f"{'='*60}", sep="\n")

    results = analyze_model(model_key)
    if results is None:
        # Load or capture failed; analyze_model has already printed why
        continue
    all_results[model_key] = results

print(f"\n\nCompleted analysis of {len(all_results)} models.")

In [None]:
# Cell 7: Load Pythia Reference Data for Comparison

# Reference data from the Pythia family: the fitted power law for the final
# MLP gain, plus the per-model values shown as reference points in the plot.
PYTHIA_REFERENCE = {
    'scaling_law': {
        'coefficient': 0.013,
        'exponent': 0.265,
        'exponent_std': 0.079,
    },
    'models': {
        'pythia-70m': {
            'params': 70e6, 'last_mlp_gain': 1.50,
            'attn_contract_pct': 100, 'mlp_contract_pct': 83,
        },
        'pythia-160m': {
            'params': 160e6, 'last_mlp_gain': 2.82,
            'attn_contract_pct': 100, 'mlp_contract_pct': 75,
        },
        'pythia-410m': {
            'params': 410e6, 'last_mlp_gain': 1.78,
            'attn_contract_pct': 100, 'mlp_contract_pct': 88,
        },
        'pythia-1b': {
            'params': 1e9, 'last_mlp_gain': 3.72,
            'attn_contract_pct': 100, 'mlp_contract_pct': 69,
        },
        'pythia-1.4b': {
            'params': 1.4e9, 'last_mlp_gain': 3.52,
            'attn_contract_pct': 100, 'mlp_contract_pct': 92,
        },
        'pythia-2.8b': {
            'params': 2.8e9, 'last_mlp_gain': 2.10,
            'attn_contract_pct': 100, 'mlp_contract_pct': 75,
        },
        'pythia-6.9b': {
            'params': 6.9e9, 'last_mlp_gain': 6.30,
            'attn_contract_pct': 97, 'mlp_contract_pct': 56,
        },
        'pythia-12b': {
            'params': 12e9, 'last_mlp_gain': 7.71,
            'attn_contract_pct': 97, 'mlp_contract_pct': 6,
        },
    },
}


def predict_gain(params):
    """Return the final-MLP gain the Pythia power law predicts for ``params``.

    Evaluates ``coefficient * params ** exponent`` with the constants stored
    in ``PYTHIA_REFERENCE['scaling_law']``.
    """
    law = PYTHIA_REFERENCE['scaling_law']
    return law['coefficient'] * (params ** law['exponent'])

# Compare each analyzed model's measured final-MLP gain with the value the
# Pythia power law predicts for its parameter count.
print("Pythia Scaling Law: Final_MLP_Gain = 0.013 × Params^0.265")
print("\nPredictions for tested models:")
for model_key in all_results:
    params = all_results[model_key]['params']
    actual = all_results[model_key]['statistics']['last_mlp_gain']
    predicted = predict_gain(params)
    # A ratio of 1.0 means the measurement matches the prediction exactly
    ratio = actual / predicted
    print(f"  {model_key}: predicted={predicted:.2f}x, actual={actual:.2f}x, ratio={ratio:.2f}")

In [None]:
# Cell 8: Visualization - Compare Architectures

import os

# Create Results directory if it doesn't exist (for Colab)
results_dir = '../Results'
if not os.path.exists(results_dir):
    os.makedirs(results_dir, exist_ok=True)
    print(f"Created directory: {results_dir}")

# 2x2 figure: attention gains, MLP gains, scaling-law scatter, summary table
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Colors for different families
FAMILY_COLORS = {
    'pythia': 'blue',
    'gemma': 'red',
    'llama': 'green',
    'mistral': 'purple',
    'phi': 'orange',
    'stablelm': 'brown'
}

# Panel 1: Attention Gains per Layer
ax1 = axes[0, 0]
for model_key, results in all_results.items():
    color = FAMILY_COLORS.get(results['family'], 'gray')
    layers = range(results['n_layers'])
    ax1.plot(layers, results['attn_gains'], 'o-', label=model_key, color=color, alpha=0.7)
ax1.axhline(y=1.0, color='black', linestyle='--', label='Gain=1')
ax1.set_xlabel('Layer')
ax1.set_ylabel('Attention Gain')
ax1.set_title('Attention Gains per Layer')
ax1.legend(loc='upper right')
# Keep the 0-1.5 window by default, but widen it when any attention gain is
# larger, so expanding layers are not clipped out of view.
_attn_peak = max((max(r['attn_gains']) for r in all_results.values()), default=0.0)
ax1.set_ylim(0, max(1.5, 1.05 * _attn_peak))

# Panel 2: MLP Gains per Layer
ax2 = axes[0, 1]
for model_key, results in all_results.items():
    color = FAMILY_COLORS.get(results['family'], 'gray')
    layers = range(results['n_layers'])
    ax2.plot(layers, results['mlp_gains'], 's-', label=model_key, color=color, alpha=0.7)
ax2.axhline(y=1.0, color='black', linestyle='--', label='Gain=1')
ax2.set_xlabel('Layer')
ax2.set_ylabel('MLP Gain')
ax2.set_title('MLP Gains per Layer')
ax2.legend(loc='upper left')

# Panel 3: Scaling Law Comparison
ax3 = axes[1, 0]

# Plot Pythia reference points
pythia_params = [v['params'] for v in PYTHIA_REFERENCE['models'].values()]
pythia_gains = [v['last_mlp_gain'] for v in PYTHIA_REFERENCE['models'].values()]
ax3.scatter(pythia_params, pythia_gains, c='blue', s=100, label='Pythia (reference)', alpha=0.7, marker='o')

# Plot tested models
for model_key, results in all_results.items():
    color = FAMILY_COLORS.get(results['family'], 'gray')
    ax3.scatter(results['params'], results['statistics']['last_mlp_gain'],
                c=color, s=150, label=model_key, marker='*', edgecolors='black')

# Plot scaling law prediction
params_range = np.logspace(7, 11, 100)
predicted_gains = [predict_gain(p) for p in params_range]
ax3.plot(params_range, predicted_gains, 'b--', label='Pythia Scaling Law', alpha=0.5)

ax3.set_xscale('log')
ax3.set_yscale('log')
ax3.set_xlabel('Parameters')
ax3.set_ylabel('Final MLP Gain')
ax3.set_title('Scaling Law: Final MLP Gain vs Parameters')
ax3.legend(loc='upper left')
ax3.grid(True, alpha=0.3)

# Panel 4: Summary Table
ax4 = axes[1, 1]
ax4.axis('off')

# Create summary table
table_data = []
headers = ['Model', 'Family', 'Params', 'Attn<1', 'MLP<1', 'Last MLP', 'Predicted', 'Ratio']

for model_key, results in all_results.items():
    params = results['params']
    predicted = predict_gain(params)
    actual = results['statistics']['last_mlp_gain']
    ratio = actual / predicted

    table_data.append([
        model_key,
        results['family'],
        f"{params/1e9:.1f}B",
        f"{results['statistics']['attn_contracting_pct']:.0f}%",
        f"{results['statistics']['mlp_contracting_pct']:.0f}%",
        f"{actual:.2f}x",
        f"{predicted:.2f}x",
        f"{ratio:.2f}"
    ])

if table_data:
    table = ax4.table(cellText=table_data, colLabels=headers, loc='center', cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.5)
else:
    # Skip the table when there are no rows (no model was analyzed) and show
    # a placeholder instead of building a table from empty cellText.
    ax4.text(0.5, 0.5, 'No models analyzed', ha='center', va='center',
             transform=ax4.transAxes)
ax4.set_title('Cross-Architecture Comparison', fontsize=12, fontweight='bold')

plt.tight_layout()

# Save with error handling
output_png = f'{results_dir}/cross_architecture_validation.png'
try:
    plt.savefig(output_png, dpi=150, bbox_inches='tight')
    print(f"Saved: {output_png}")
except Exception as e:
    # Report why the primary location failed, then fall back to the current directory
    print(f"Could not save to {output_png}: {e}")
    output_png = 'cross_architecture_validation.png'
    plt.savefig(output_png, dpi=150, bbox_inches='tight')
    print(f"Saved (fallback): {output_png}")

plt.show()

In [None]:
# Cell 9: Universal Principles Validation

print("="*60)
print("UNIVERSAL PRINCIPLES VALIDATION")
print("="*60)

# With no analyzed models every "for all models" check would pass vacuously,
# so the verdicts below are reported as N/A in that case.
if not all_results:
    print("\n⚠️ No models were analyzed - the principles below could not be tested.")

# Test 1: Attention Always Contracts
# (a model passes when at least 95% of its attention layers have gain < 1)
print("\n1. ATTENTION ALWAYS CONTRACTS")
print("-" * 40)
all_attn_contracts = bool(all_results)
for model_key, results in all_results.items():
    pct = results['statistics']['attn_contracting_pct']
    status = "✅" if pct >= 95 else "❌"
    print(f"   {model_key}: {pct:.1f}% contracting {status}")
    if pct < 95:
        all_attn_contracts = False
if all_results:
    print(f"   UNIVERSAL: {'✅ YES' if all_attn_contracts else '❌ NO'}")
else:
    print("   UNIVERSAL: N/A (no models analyzed)")

# Test 2: Last Layer MLP Expands
print("\n2. LAST LAYER MLP EXPANDS")
print("-" * 40)
all_last_expands = bool(all_results)
for model_key, results in all_results.items():
    gain = results['statistics']['last_mlp_gain']
    status = "✅" if gain > 1.0 else "❌"
    print(f"   {model_key}: {gain:.2f}x {status}")
    if gain <= 1.0:
        all_last_expands = False
if all_results:
    print(f"   UNIVERSAL: {'✅ YES' if all_last_expands else '❌ NO'}")
else:
    print("   UNIVERSAL: N/A (no models analyzed)")

# Test 3: Scaling Law Holds
# (a ratio within a factor of two of the Pythia prediction counts as consistent)
print("\n3. SCALING LAW CONSISTENCY")
print("-" * 40)
print("   Reference: Final_MLP_Gain = 0.013 × Params^0.265")
ratios = []
for model_key, results in all_results.items():
    params = results['params']
    predicted = predict_gain(params)
    actual = results['statistics']['last_mlp_gain']
    ratio = actual / predicted
    ratios.append(ratio)
    status = "✅" if 0.5 <= ratio <= 2.0 else "⚠️"
    print(f"   {model_key}: actual/predicted = {ratio:.2f} {status}")

if ratios:
    mean_ratio = np.mean(ratios)
    std_ratio = np.std(ratios)
    print(f"\n   Mean ratio: {mean_ratio:.2f} ± {std_ratio:.2f}")
    print(f"   SCALING LAW: {'✅ HOLDS' if 0.5 <= mean_ratio <= 2.0 else '⚠️ DEVIATION'}")
else:
    # Keep both names defined for later cells, but do not report a verdict
    mean_ratio = std_ratio = float('nan')
    print("\n   SCALING LAW: N/A (no models analyzed)")

In [None]:
# Cell 10: Architecture Pattern Classification

# Section banner for the architecture pattern classification output
print("="*60)
print("ARCHITECTURE PATTERN CLASSIFICATION")
print("="*60)

def classify_architecture(mlp_gains):
    """Classify a per-layer MLP gain profile as FUNNEL, HOUR-GLASS or VASE.

    The layers are split into thirds; the last layer is left out of the
    final third. For each part the fraction of layers with gain < 1.0
    ("contracting") is computed, with an empty part counting as 0.

    Args:
        mlp_gains: sequence of MLP gains ordered by layer.

    Returns:
        ``(pattern, (early, middle, late))`` where ``pattern`` is one of
        ``'FUNNEL'``, ``'HOUR-GLASS'``, ``'VASE'`` or ``'TRANSITIONAL'`` and
        the tuple holds the three contracting fractions.

    Raises:
        IndexError: if ``mlp_gains`` is empty.
    """
    if len(mlp_gains) == 0:
        raise IndexError("classify_architecture needs at least one MLP gain")

    def contracting_fraction(segment):
        """Share of gains below 1.0 in ``segment`` (0 for an empty segment)."""
        if len(segment) == 0:
            return 0
        return sum(1 for g in segment if g < 1.0) / len(segment)

    n = len(mlp_gains)

    # Split into thirds
    early_contract = contracting_fraction(mlp_gains[:n//3])
    middle_contract = contracting_fraction(mlp_gains[n//3:2*n//3])
    late_contract = contracting_fraction(mlp_gains[2*n//3:-1])  # Exclude last
    fractions = (early_contract, middle_contract, late_contract)

    # Classification logic (first matching rule wins)
    if early_contract > 0.6 and middle_contract > 0.6:
        return 'FUNNEL', fractions
    # NOTE(review): this rule tests the late third, not the middle one -
    # confirm that this is the intended HOUR-GLASS definition.
    if early_contract < 0.4 and late_contract > 0.4:
        return 'HOUR-GLASS', fractions
    if early_contract < 0.4 and middle_contract < 0.5 and late_contract < 0.5:
        return 'VASE', fractions
    return 'TRANSITIONAL', fractions

# Legend for the pattern names, followed by a blank line
print(
    "\nPattern definitions:",
    "  FUNNEL:      Compress throughout, small explosion",
    "  HOUR-GLASS:  Expand early, compress middle, explode end",
    "  VASE:        Expand everywhere, massive explosion",
    "",
    sep="\n",
)

# One report per analyzed model: pattern, per-third contraction, final gain
for model_key, results in all_results.items():
    pattern, (e, m, l) = classify_architecture(results['mlp_gains'])
    last_gain = results['statistics']['last_mlp_gain']
    print(
        f"{model_key}:",
        f"  Pattern: {pattern}",
        f"  Early contract: {100*e:.0f}%, Middle: {100*m:.0f}%, Late: {100*l:.0f}%",
        f"  Final explosion: {last_gain:.2f}x",
        "",
        sep="\n",
    )

In [None]:
# Cell 11: Save Results

import os

# Ensure Results directory exists
# (the visualization cell performs the same setup; presumably repeated here so
# this cell can be run on its own - confirm)
results_dir = '../Results'
if not os.path.exists(results_dir):
    os.makedirs(results_dir, exist_ok=True)
# Helper function to convert numpy types to Python types
def to_python_type(obj):
    """Recursively convert ``obj`` into plain Python values.

    - dict values are converted recursively (keys are kept as they are);
    - lists and tuples become lists of converted items;
    - numpy arrays become (nested) lists of Python scalars;
    - numpy scalars become the matching Python scalar via ``.item()``;
    - ``None``, ``bool``, ``int``, ``float`` and ``str`` are returned unchanged;
    - anything else is replaced by ``str(obj)``.
    """
    if isinstance(obj, dict):
        return {k: to_python_type(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_python_type(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() already yields Python scalars; recurse for object arrays
        return to_python_type(obj.tolist())
    # Checked before the builtin types because e.g. np.float64 subclasses float
    if isinstance(obj, np.generic):
        return obj.item()
    if obj is None or isinstance(obj, (bool, int, float, str)):
        return obj
    return str(obj)

# Compute universal principles with explicit Python bool conversion.
# With no analyzed models the checks are recorded as None (not tested) rather
# than True: all() over an empty sequence is True, which would report
# universal confirmation without any evidence.
_has_results = bool(all_results)

# mean_ratio comes from the validation cell; look it up before using it so a
# skipped cell yields None instead of a NameError. NaN means "no ratios".
_mean_ratio = globals().get('mean_ratio')
if _mean_ratio is not None and np.isnan(_mean_ratio):
    _mean_ratio = None

if _has_results:
    attn_contracts = all(r['statistics']['attn_contracting_pct'] >= 95 for r in all_results.values())
    last_mlp_expands = all(r['statistics']['last_mlp_gain'] > 1.0 for r in all_results.values())
else:
    attn_contracts = None
    last_mlp_expands = None
scaling_consistent = bool(0.5 <= _mean_ratio <= 2.0) if _mean_ratio is not None else None

# Conclusions follow the measured outcome instead of asserting universality
if attn_contracts is None:
    _attn_conclusion = 'Not tested: no models were analyzed'
elif attn_contracts:
    _attn_conclusion = 'Attention contraction appears universal across architectures'
else:
    _attn_conclusion = 'Attention contraction is NOT universal: at least one model is below 95% contracting layers'

if last_mlp_expands is None:
    _mlp_conclusion = 'Not tested: no models were analyzed'
elif last_mlp_expands:
    _mlp_conclusion = 'Final MLP expansion appears universal across architectures'
else:
    _mlp_conclusion = 'Final MLP expansion is NOT universal: at least one model has last MLP gain <= 1'

# Prepare output with explicit type conversion
output = {
    'experiment': 'Cross-Architecture Validation',
    'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'pythia_reference': PYTHIA_REFERENCE,
    'models_tested': list(all_results.keys()),
    'results': to_python_type(all_results),
    'universal_principles': {
        'attention_contracts': attn_contracts,
        'last_mlp_expands': last_mlp_expands,
        'scaling_law_consistent': scaling_consistent
    },
    'conclusions': {
        'attention_universal': _attn_conclusion,
        'last_layer_universal': _mlp_conclusion,
        'scaling_law_cross_arch': f'Mean ratio to Pythia prediction: {_mean_ratio:.2f}' if _mean_ratio is not None else 'N/A'
    }
}

# Save JSON with error handling: Results directory first, then the current directory
_json_name = 'cross_architecture_validation_results.json'
output_json = None
for _attempt, _candidate in enumerate([f'{results_dir}/{_json_name}', _json_name]):
    try:
        with open(_candidate, 'w') as f:
            json.dump(output, f, indent=2)
    except Exception as e:
        if _attempt == 0:
            print(f"Error saving to {_candidate}: {e}")
        else:
            print(f"Fallback also failed: {e}")
    else:
        output_json = _candidate
        print(f"Saved: {output_json}" if _attempt == 0 else f"Saved (fallback): {output_json}")
        break

# Locate the figure: prefer the path the visualization cell actually saved
# to (it may have fallen back to the current directory), then the defaults.
_png_name = 'cross_architecture_validation.png'
_png_candidates = [globals().get('output_png'), f'{results_dir}/{_png_name}', _png_name]
output_png = next((p for p in _png_candidates if p and os.path.exists(p)), None)

# Auto-download for Colab
try:
    from google.colab import files
except ImportError:
    print("\nNot in Colab - files saved locally")
else:
    try:
        if output_json is not None:
            files.download(output_json)
        if output_png is not None:
            files.download(output_png)
        print("\nFiles downloaded automatically!")
    except Exception as e:
        print(f"\nDownload failed: {e}")

In [None]:
# Cell 12: Final Summary

print("="*60)
print("CROSS-ARCHITECTURE VALIDATION SUMMARY")
print("="*60)

print(f"\nModels tested: {len(all_results)}")
for m in all_results:
    print(f"  - {m} ({all_results[m]['family']})")

print(f"\n" + "="*60)
print("KEY FINDINGS")
print("="*60)

# all() over an empty sequence is True, so each finding first checks that at
# least one model was analyzed before declaring anything universal.
print("\n1. ATTENTION CONTRACTION:")
if not all_results:
    print("   ⚠️ NOT TESTED - no models were analyzed")
elif all(r['statistics']['attn_contracting_pct'] >= 95 for r in all_results.values()):
    print("   ✅ UNIVERSAL - All tested architectures show >95% attention contraction")
else:
    print("   ⚠️ NOT UNIVERSAL - Some architectures deviate")

print("\n2. FINAL MLP EXPLOSION:")
if not all_results:
    print("   ⚠️ NOT TESTED - no models were analyzed")
elif all(r['statistics']['last_mlp_gain'] > 1.0 for r in all_results.values()):
    print("   ✅ UNIVERSAL - All tested architectures show last layer MLP gain > 1")
else:
    print("   ⚠️ NOT UNIVERSAL - Some architectures deviate")

print("\n3. SCALING LAW CROSS-ARCHITECTURE:")
# mean_ratio is produced by the validation cell; it may be missing (cell not
# run) or NaN (no models), neither of which is a measured deviation.
_summary_ratio = globals().get('mean_ratio')
if _summary_ratio is None or np.isnan(_summary_ratio):
    print("   ⚠️ NOT TESTED - no scaling-law ratio is available")
elif 0.5 <= _summary_ratio <= 2.0:
    print(f"   ✅ CONSISTENT - Mean ratio to Pythia prediction: {_summary_ratio:.2f}x")
else:
    print(f"   ⚠️ DEVIATION - Mean ratio: {_summary_ratio:.2f}x (expected ~1.0)")

print("\n4. ARCHITECTURE PATTERNS:")
for model_key, results in all_results.items():
    pattern, _ = classify_architecture(results['mlp_gains'])
    print(f"   {model_key}: {pattern}")

print("\n" + "="*60)
print("IMPLICATIONS FOR PAPER #3")
print("="*60)
print("""
If universal principles hold across architectures:
  → Attention contraction is INTRINSIC to the attention mechanism
  → Final MLP explosion is REQUIRED for token prediction
  → The scaling law reflects fundamental capacity-decision tradeoff
  → Architecture differences (LayerNorm vs RMSNorm, GeGLU vs SwiGLU)
     affect MAGNITUDE but not PATTERN

If deviations exist:
  → Document which principle fails and why
  → May reveal architecture-specific optimizations
  → Could inform better architecture design
""")