# LLaMA 2 vs LLaMA 3.1: Long-Context Hypothesis Test

**Date:** 2026-01-05
**Goal:** Test whether long-context support causes last-layer contraction of the residual stream

## Hypothesis

```
If Long-Context Dampening is correct:

LLaMA 2 (4k context, RoPE θ=10,000)   → Should EXPAND (like Mistral)
LLaMA 3.1 (128k context, RoPE θ=500,000) → CONTRACTS (confirmed)

If LLaMA 2 EXPANDS → SMOKING GUN for Long-Context Hypothesis!
If LLaMA 2 CONTRACTS → Other factor (family-specific?)
```

In [None]:
# Cell 0: HuggingFace Login
# Authenticates this session with the Hugging Face Hub before any model is
# downloaded by the later cells.
from huggingface_hub import login

# Login for gated models: both checkpoints used below live under the
# `meta-llama/` namespace.
# NOTE(review): presumably the account must already have been granted access
# to those repos, and login() with no arguments presumably prompts for a
# token interactively -- confirm before running unattended.
login()

# Only reached when login() returned without raising.
print("HuggingFace login complete!")

In [None]:
# Cell 1: Setup
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import matplotlib.pyplot as plt
import json
import os

# Output folder shared by the plotting, saving and download cells.
RESULTS_DIR = './Results'
os.makedirs(RESULTS_DIR, exist_ok=True)

# Environment report: library version first, then accelerator details.
_cuda_ready = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    # Device index 0 is the only GPU reported on.
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.1f} GB")
print("Setup complete")

In [None]:
# Cell 2: Model Definitions

# Checkpoints under test. Each entry records the Hub repo id, the nominal
# context window, the RoPE theta expected in the config, the outcome the
# hypothesis predicts, and the model family label.
MODELS_TO_TEST = {
    'llama2-7b': dict(
        hf_name='meta-llama/Llama-2-7b-hf',
        context_length=4096,
        expected_rope_theta=10000,
        prediction='EXPAND (like Mistral)',
        family='LLaMA 2',
    ),
    'llama3.1-8b': dict(
        hf_name='meta-llama/Llama-3.1-8B',
        context_length=128000,
        expected_rope_theta=500000,
        prediction='CONTRACT (confirmed: 0.48x)',
        family='LLaMA 3.1',
    ),
}

# Reference values from previous experiments
REFERENCE = {
    'llama3.1-8b': dict(
        last_gain=0.48,
        w_u_spectral=94.61,
        rope_theta=500000,
        behavior='CONTRACTS',
    ),
    'mistral-7b': dict(
        last_gain=1.37,
        w_u_spectral=16.14,
        rope_theta=10000,
        behavior='EXPANDS',
    ),
}

# One summary line per model so the run log shows what is about to be tested.
print("Models to test:")
for name, info in MODELS_TO_TEST.items():
    print("  {}: context={:,}, prediction={}".format(
        name, info['context_length'], info['prediction']))

In [None]:
# Cell 3: Residual Stream Analyzer

class ResidualStreamAnalyzer:
    """Measure how the residual-stream norm changes from layer to layer.

    Wraps a causal LM and its tokenizer, runs a single forward pass with
    ``output_hidden_states=True`` and summarises the per-layer L2 norms as
    multiplicative "gains" (norm of one hidden state divided by the norm of
    the one before it).

    NOTE(review): in Hugging Face LLaMA-style models the last entry of
    ``hidden_states`` is presumably taken *after* the model's final norm
    layer, in which case ``last_gain`` mixes the last block with that
    normalisation -- confirm against the model implementation before
    interpreting it as a property of the last block alone.
    """

    # Prompt used whenever the caller does not supply one.
    DEFAULT_TEXT = "The quick brown fox jumps over the lazy dog."

    def __init__(self, model, tokenizer):
        """Store the model/tokenizer pair and the device of the first parameter."""
        self.model = model
        self.tokenizer = tokenizer
        # Inputs are moved to this device before the forward pass.
        self.device = next(model.parameters()).device

    def get_residual_norms(self, text=DEFAULT_TEXT):
        """Return one mean L2 norm per entry of the model's hidden states.

        Args:
            text: prompt to tokenize and feed through the model.

        Returns:
            list[float]: for every hidden state, the vector norm over the
            last dimension averaged over all remaining dimensions.
        """
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            # Up-cast before taking the norm so the reduction is not done in
            # the model's (possibly half-precision) dtype.
            return [
                hs.float().norm(dim=-1).mean().item()
                for hs in outputs.hidden_states
            ]

    def compute_gains(self, norms):
        """Return the ratio of each norm to its predecessor.

        A non-positive predecessor yields a neutral gain of 1.0 instead of
        dividing by zero. Fewer than two norms produce an empty list.
        """
        return [
            current / previous if previous > 0 else 1.0
            for previous, current in zip(norms, norms[1:])
        ]

    def analyze(self, text=DEFAULT_TEXT):
        """Run the forward pass and summarise norms and gains.

        Args:
            text: prompt passed on to ``get_residual_norms``.

        Returns:
            dict with the raw ``norms`` and ``gains`` plus summary fields:
            first/last norm, first/last gain, whether the last gain exceeds
            1.0, the product of all gains, and layer counts.
        """
        norms = self.get_residual_norms(text)
        gains = self.compute_gains(norms)

        # Product of all gains.
        cumulative = 1.0
        for gain in gains:
            cumulative *= gain

        # Neutral defaults keep the summary well-defined without any gains.
        initial_gain = gains[0] if gains else 1.0
        last_gain = gains[-1] if gains else 1.0

        return {
            'norms': norms,
            'gains': gains,
            'embedding_norm': norms[0],
            'final_norm': norms[-1],
            'initial_gain': initial_gain,
            'last_gain': last_gain,
            # Strictly greater than 1.0; an exact 1.0 is not "expanding" here.
            'last_expands': last_gain > 1.0,
            'cumulative_energy': cumulative,
            'num_layers': len(gains),
            # The two counts partition the layers: exactly 1.0 is counted on
            # the expanding side so they always sum to num_layers.
            'contracting_layers': sum(1 for g in gains if g < 1.0),
            'expanding_layers': sum(1 for g in gains if g >= 1.0),
        }

print("ResidualStreamAnalyzer defined")

In [None]:
# Cell 4: W_U Analyzer

def analyze_unembedding(model):
    """Summarise the size and norms of the model's unembedding matrix.

    Args:
        model: object exposing ``lm_head.weight`` as a 2-D tensor. The first
            dimension is reported as the vocabulary size and the second as
            the hidden size -- TODO confirm for models whose output head is
            not a plain linear layer.

    Returns:
        dict with ``shape``, ``vocab_size``, ``hidden_dim`` and three float
        norms: ``frobenius_norm``, ``spectral_norm`` (matrix 2-norm) and
        ``mean_row_norm`` (average vector norm of the rows).
    """
    with torch.no_grad():
        # detach() rather than .data: same values and no autograd graph, but
        # without bypassing autograd's bookkeeping. float() up-casts
        # half-precision weights so the norms are computed in float32.
        lm_head = model.lm_head.weight.detach().float()

        return {
            'shape': list(lm_head.shape),
            'vocab_size': lm_head.shape[0],
            'hidden_dim': lm_head.shape[1],
            'frobenius_norm': torch.linalg.norm(lm_head, ord='fro').item(),
            'spectral_norm': torch.linalg.norm(lm_head, ord=2).item(),
            'mean_row_norm': torch.linalg.norm(lm_head, dim=1).mean().item(),
        }

def get_rope_theta(config):
    """Collect the RoPE-related settings from a model config.

    Every field is read with ``getattr`` and reported as ``None`` when the
    config does not define it, so this never raises on an unfamiliar config.

    Args:
        config: any object carrying the model configuration as attributes.

    Returns:
        dict with ``rope_theta``, ``rope_scaling`` and ``max_position``
        (the latter read from ``max_position_embeddings``).
    """
    rope_theta = getattr(config, 'rope_theta', None)
    rope_scaling = getattr(config, 'rope_scaling', None)
    max_position = getattr(config, 'max_position_embeddings', None)

    if rope_theta is None:
        # NOTE(review): newer transformers releases appear to keep theta
        # inside a `rope_parameters` dict instead of a top-level attribute
        # -- confirm against the installed version. The fallback only runs
        # when the top-level attribute is missing, so existing configs are
        # unaffected.
        rope_parameters = getattr(config, 'rope_parameters', None)
        if isinstance(rope_parameters, dict):
            rope_theta = rope_parameters.get('rope_theta')

    return {
        'rope_theta': rope_theta,
        'rope_scaling': rope_scaling,
        'max_position': max_position,
    }

print("Analyzers defined")

In [None]:
# Cell 5: Test LLaMA 2
# Loads the LLaMA 2 checkpoint, measures residual-stream gains, the
# unembedding norms and the RoPE settings, stores everything in
# `llama2_results`, then releases the model.
import gc

print("="*70)
print("TESTING: LLaMA 2 (7B)")
print("="*70)
print("\nPrediction: Should EXPAND if Long-Context Hypothesis is correct")
print("            (4k context, standard RoPE theta ~10,000)\n")

model_name = 'llama2-7b'
model_info = MODELS_TO_TEST[model_name]

# Load model
# trust_remote_code is deliberately not passed: it only permits running Python
# shipped inside the Hub repo, which a LLaMA checkpoint served by the
# transformers library itself does not need.
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_info['hf_name'],
    torch_dtype=torch.float16,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_info['hf_name'])
config = AutoConfig.from_pretrained(model_info['hf_name'])

print(f"Model loaded: {model_info['hf_name']}")
print(f"Layers: {config.num_hidden_layers}")

# Residual stream analysis (default prompt)
print("\nAnalyzing residual stream...")
analyzer = ResidualStreamAnalyzer(model, tokenizer)
residual_results = analyzer.analyze()

# W_U analysis
print("Analyzing W_U (unembedding)...")
wu_results = analyze_unembedding(model)

# RoPE analysis
rope_results = get_rope_theta(config)

# Combine results
llama2_results = {
    'model': model_name,
    'hf_name': model_info['hf_name'],
    'num_layers': config.num_hidden_layers,
    'residual': residual_results,
    'w_u': wu_results,
    'rope': rope_results
}

# Print key results
print("\n" + "="*70)
print("LLaMA 2 RESULTS")
print("="*70)
print("\nResidual Stream:")
print(f"  Embedding Norm: {residual_results['embedding_norm']:.2f}")
print(f"  Final Norm: {residual_results['final_norm']:.2f}")
print(f"  Initial Gain: {residual_results['initial_gain']:.2f}x")
print(f"  Last Gain: {residual_results['last_gain']:.4f}x")
print(f"  Last Expands: {residual_results['last_expands']}")
print(f"  Cumulative Energy: {residual_results['cumulative_energy']:.2f}")

print("\nW_U (Unembedding):")
print(f"  Shape: {wu_results['shape']}")
print(f"  Spectral Norm: {wu_results['spectral_norm']:.2f}")

print("\nRoPE:")
print(f"  Theta: {rope_results['rope_theta']}")
print(f"  Scaling: {rope_results['rope_scaling']}")
print(f"  Max Position: {rope_results['max_position']}")

# Clean up. `analyzer` keeps its own reference to the model (analyzer.model),
# so it must be dropped together with `model`; otherwise the weights stay
# alive and empty_cache() has nothing to give back before the next model
# is loaded.
del analyzer, model
gc.collect()
torch.cuda.empty_cache()

print("\nLLaMA 2 analysis complete!")

In [None]:
# Cell 6: Test LLaMA 3.1 (for direct comparison)
# Same measurement pipeline as the LLaMA 2 cell, run on the LLaMA 3.1
# checkpoint; results are stored in `llama31_results`.
import gc

print("="*70)
print("TESTING: LLaMA 3.1 (8B)")
print("="*70)
print("\nExpected: CONTRACTS (confirmed: 0.48x)")
print("          (128k context, RoPE theta 500,000)\n")

model_name = 'llama3.1-8b'
model_info = MODELS_TO_TEST[model_name]

# Load model
# trust_remote_code is deliberately not passed: it only permits running Python
# shipped inside the Hub repo, which a LLaMA checkpoint served by the
# transformers library itself does not need.
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_info['hf_name'],
    torch_dtype=torch.float16,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_info['hf_name'])
config = AutoConfig.from_pretrained(model_info['hf_name'])

print(f"Model loaded: {model_info['hf_name']}")
print(f"Layers: {config.num_hidden_layers}")

# Residual stream analysis (default prompt)
print("\nAnalyzing residual stream...")
analyzer = ResidualStreamAnalyzer(model, tokenizer)
residual_results = analyzer.analyze()

# W_U analysis
print("Analyzing W_U (unembedding)...")
wu_results = analyze_unembedding(model)

# RoPE analysis
rope_results = get_rope_theta(config)

# Combine results
llama31_results = {
    'model': model_name,
    'hf_name': model_info['hf_name'],
    'num_layers': config.num_hidden_layers,
    'residual': residual_results,
    'w_u': wu_results,
    'rope': rope_results
}

# Print key results
print("\n" + "="*70)
print("LLaMA 3.1 RESULTS")
print("="*70)
print("\nResidual Stream:")
print(f"  Embedding Norm: {residual_results['embedding_norm']:.2f}")
print(f"  Final Norm: {residual_results['final_norm']:.2f}")
print(f"  Initial Gain: {residual_results['initial_gain']:.2f}x")
print(f"  Last Gain: {residual_results['last_gain']:.4f}x")
print(f"  Last Expands: {residual_results['last_expands']}")
print(f"  Cumulative Energy: {residual_results['cumulative_energy']:.2f}")

print("\nW_U (Unembedding):")
print(f"  Shape: {wu_results['shape']}")
print(f"  Spectral Norm: {wu_results['spectral_norm']:.2f}")

print("\nRoPE:")
print(f"  Theta: {rope_results['rope_theta']}")
print(f"  Scaling: {rope_results['rope_scaling']}")
print(f"  Max Position: {rope_results['max_position']}")

# Clean up. `analyzer` keeps its own reference to the model (analyzer.model),
# so it must be dropped together with `model`; otherwise the weights stay
# alive and empty_cache() has nothing to give back.
del analyzer, model
gc.collect()
torch.cuda.empty_cache()

print("\nLLaMA 3.1 analysis complete!")

In [None]:
# Cell 7: HYPOTHESIS TEST - The Critical Comparison
# Prints a comparison table from `llama2_results` / `llama31_results`,
# derives ratios, and stores the verdict in `hypothesis_test`.
# NOTE(review): a single pair of checkpoints cannot isolate context length
# from everything else that differs between them; read the verdict as
# "consistent with" rather than as proof.


def _fmt_theta(theta):
    """Format a RoPE theta for the table; 'n/a' when the config had none."""
    return f"{theta:,.0f}" if theta is not None else "n/a"


print("\n" + "="*70)
print("LONG-CONTEXT HYPOTHESIS TEST")
print("="*70)

print("\n" + "-"*70)
print("COMPARISON TABLE")
print("-"*70)

print("\n| Model | Context | RoPE Theta | Last Gain | W_U σ_max | Behavior |")
print("|-------|---------|------------|-----------|-----------|----------|")

# LLaMA 2
l2_ctx = MODELS_TO_TEST['llama2-7b']['context_length']
l2_theta = llama2_results['rope']['rope_theta']
l2_gain = llama2_results['residual']['last_gain']
l2_wu = llama2_results['w_u']['spectral_norm']
# One boolean per model drives the table label, the verdict and the stored
# flags, so a gain of exactly 1.0 is classified the same way everywhere.
l2_expands = l2_gain > 1.0
l2_behavior = "EXPANDS" if l2_expands else "CONTRACTS"
print(f"| LLaMA 2 | {l2_ctx:,} | {_fmt_theta(l2_theta)} | {l2_gain:.4f}x | {l2_wu:.2f} | {l2_behavior} |")

# LLaMA 3.1
l3_ctx = MODELS_TO_TEST['llama3.1-8b']['context_length']
l3_theta = llama31_results['rope']['rope_theta']
l3_gain = llama31_results['residual']['last_gain']
l3_wu = llama31_results['w_u']['spectral_norm']
l3_expands = l3_gain > 1.0
l3_behavior = "EXPANDS" if l3_expands else "CONTRACTS"
print(f"| LLaMA 3.1 | {l3_ctx:,} | {_fmt_theta(l3_theta)} | {l3_gain:.4f}x | {l3_wu:.2f} | {l3_behavior} |")

# Mistral reference (values come from REFERENCE; the context size is not
# stored there, so it stays literal)
mistral_ref = REFERENCE['mistral-7b']
print(
    f"| Mistral (ref) | 8,192 | {mistral_ref['rope_theta']:,} | "
    f"{mistral_ref['last_gain']:.2f}x | {mistral_ref['w_u_spectral']:.2f} | "
    f"{mistral_ref['behavior']} |"
)

print("\n" + "-"*70)
print("HYPOTHESIS EVALUATION")
print("-"*70)

# Compute ratios (0 stands in whenever a ratio cannot be formed)
theta_ratio = l3_theta / l2_theta if l2_theta and l3_theta else 0
gain_ratio = l3_gain / l2_gain if l2_gain else 0
wu_ratio = l3_wu / l2_wu if l2_wu else 0

print(f"\nRoPE Theta Ratio (LLaMA 3.1 / LLaMA 2): {theta_ratio:.1f}x")
print(f"Last Gain Ratio (LLaMA 3.1 / LLaMA 2): {gain_ratio:.2f}")
print(f"W_U Ratio (LLaMA 3.1 / LLaMA 2): {wu_ratio:.2f}x")

print("\n" + "="*70)

# Determine verdict: the four branches cover the four combinations of the
# two booleans, so the final `else` is exactly "LLaMA 2 does not expand,
# LLaMA 3.1 does".
if l2_expands and not l3_expands:
    print("VERDICT: HYPOTHESIS CONFIRMED!")
    print("="*70)
    print("\nLLaMA 2 (short context) EXPANDS")
    print("LLaMA 3.1 (long context) CONTRACTS")
    print("\n--> Long-context causes contraction!")
    print("--> This is DELIBERATE ENGINEERING for 128k stability!")
    verdict = "CONFIRMED"
elif not l2_expands and not l3_expands:
    print("VERDICT: HYPOTHESIS REJECTED")
    print("="*70)
    print("\nBOTH LLaMA 2 and LLaMA 3.1 CONTRACT")
    print("\n--> Contraction is LLaMA FAMILY trait, not context-dependent!")
    print("--> Need to investigate LLaMA-specific architecture")
    verdict = "REJECTED - Family Trait"
elif l2_expands and l3_expands:
    print("VERDICT: UNEXPECTED - Both Expand")
    print("="*70)
    print("\nBOTH models EXPAND - contradicts previous LLaMA 3.1 finding")
    print("\n--> Check for measurement error")
    verdict = "UNEXPECTED"
else:
    print("VERDICT: INVERTED")
    print("="*70)
    print("\nLLaMA 2 CONTRACTS, LLaMA 3.1 EXPANDS - opposite of hypothesis")
    verdict = "INVERTED"

# Store verdict
hypothesis_test = {
    'llama2_expands': l2_expands,
    'llama31_expands': l3_expands,
    'theta_ratio': theta_ratio,
    'gain_ratio': gain_ratio,
    'wu_ratio': wu_ratio,
    'verdict': verdict
}

In [None]:
# Cell 8: Visualization
# Draws a 2x2 figure (last-layer gain, RoPE theta, W_U spectral norm,
# layer-wise gains), saves it as a PNG under RESULTS_DIR and shows it.

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes

# Colors
colors = {'llama2': '#3498db', 'llama3.1': '#e74c3c', 'mistral': '#2ecc71'}
bar_colors = [colors[key] for key in ('llama2', 'llama3.1', 'mistral')]

# Shared x-axis categories for the three bar charts.
models = ['LLaMA 2\n(4k ctx)', 'LLaMA 3.1\n(128k ctx)', 'Mistral\n(8k ctx)']


def _label_bars(ax, bars, values, y_for, fmt, **text_kwargs):
    """Write one centred value label per bar.

    `y_for` maps a bar's height to the label's y position; `fmt` is a
    str.format template applied to the value.
    """
    for bar, val in zip(bars, values):
        x_mid = bar.get_x() + bar.get_width()/2
        ax.text(x_mid, y_for(bar.get_height()), fmt.format(val),
                ha='center', va='bottom', **text_kwargs)


# Plot 1: Last Layer Gain Comparison
gains = [l2_gain, l3_gain, 1.37]
bars = ax1.bar(models, gains, color=bar_colors)
ax1.axhline(y=1.0, color='gray', linestyle='--', alpha=0.7, label='Expansion Threshold')
ax1.set_ylabel('Last Layer Gain')
ax1.set_title('Last Layer Gain Comparison\n(>1 = Expand, <1 = Contract)')
_label_bars(ax1, bars, gains, lambda h: h + 0.02, '{:.2f}x',
            fontsize=12, fontweight='bold')
ax1.legend()

# Plot 2: RoPE Theta Comparison (log scale, so labels are offset by a factor)
thetas = [l2_theta, l3_theta, 10000]
bars = ax2.bar(models, thetas, color=bar_colors)
ax2.set_ylabel('RoPE Theta')
ax2.set_title('RoPE Theta Comparison\n(Higher = Longer Context Support)')
ax2.set_yscale('log')
_label_bars(ax2, bars, thetas, lambda h: h * 1.1, '{:,.0f}', fontsize=10)

# Plot 3: W_U Spectral Norm
wu_norms = [l2_wu, l3_wu, 16.14]
bars = ax3.bar(models, wu_norms, color=bar_colors)
ax3.set_ylabel('W_U Spectral Norm (σ_max)')
ax3.set_title('Unembedding Matrix Spectral Norm\n(Static Amplification Factor)')
_label_bars(ax3, bars, wu_norms, lambda h: h + 1, '{:.1f}',
            fontsize=12, fontweight='bold')

# Plot 4: Layer-wise Gains
l2_layer_gains = llama2_results['residual']['gains']
l3_layer_gains = llama31_results['residual']['gains']
ax4.plot(l2_layer_gains, 'b-', label='LLaMA 2', linewidth=2)
ax4.plot(l3_layer_gains, 'r-', label='LLaMA 3.1', linewidth=2)
ax4.axhline(y=1.0, color='gray', linestyle='--', alpha=0.7)
ax4.set_xlabel('Layer')
ax4.set_ylabel('Gain')
ax4.set_title('Layer-wise Gains\n(Last layer highlighted)')
ax4.legend()
# Highlight last layer: a dotted vertical line at each model's final index
ax4.axvline(x=len(l2_layer_gains)-1, color='blue', linestyle=':', alpha=0.5)
ax4.axvline(x=len(l3_layer_gains)-1, color='red', linestyle=':', alpha=0.5)

plt.tight_layout()
figure_path = f'{RESULTS_DIR}/llama2_vs_llama31_hypothesis_test.png'
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\nVisualization saved to {figure_path}")

In [None]:
# Cell 9: Save Results
# Bundles both models' measurements, the verdict and the reference values
# into one JSON file under RESULTS_DIR.

output_path = f'{RESULTS_DIR}/llama2_vs_llama31_hypothesis_test.json'

final_results = dict(
    experiment='LLaMA 2 vs LLaMA 3.1 - Long-Context Hypothesis Test',
    date='2026-01-05',
    hypothesis='Long-context models require dampening (contraction) for numerical stability',
    models={
        'llama2-7b': llama2_results,
        'llama3.1-8b': llama31_results,
    },
    hypothesis_test=hypothesis_test,
    references=REFERENCE,
)

# default=str: any value json cannot encode natively is written as str(value).
with open(output_path, 'w') as f:
    json.dump(final_results, f, indent=2, default=str)

_rule = "="*70
print(f"Results saved to {output_path}")
print("\n" + _rule)
print("EXPERIMENT COMPLETE")
print(_rule)

In [None]:
# Cell 10: Download Results
# Offers the saved JSON and PNG as browser downloads when running in Google
# Colab; elsewhere it reports where the files are instead of failing on the
# Colab-only import.

try:
    from google.colab import files
except ImportError:
    files = None

json_path = f'{RESULTS_DIR}/llama2_vs_llama31_hypothesis_test.json'
png_path = f'{RESULTS_DIR}/llama2_vs_llama31_hypothesis_test.png'

if files is None:
    print(f"Not running in Google Colab - results are available in {RESULTS_DIR}")
else:
    print("="*70)
    print("DOWNLOADING RESULTS...")
    print("="*70)

    # JSON first, then PNG; only the first message is preceded by a blank
    # line. Files that were never written are skipped silently.
    for lead, path in (("\n", json_path), ("", png_path)):
        if os.path.exists(path):
            print(f"{lead}Downloading: {path}")
            files.download(path)

    print("\n" + "="*70)
    print("DOWNLOAD COMPLETE!")
    print("="*70)

## Summary

### Hypothesis
Long-context models (128k) require dampening (contraction) for numerical stability.

### Prediction
- LLaMA 2 (4k context) → Should EXPAND
- LLaMA 3.1 (128k context) → Should CONTRACT (confirmed)

### If CONFIRMED
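This would support the view that contraction is a **deliberate engineering choice** for long-context stability, not a model weakness. It is not proof on its own: the two models also differ in size (7B vs 8B) and in other training and architecture choices, so a single pair cannot isolate context length as the cause.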

### If REJECTED
Contraction is a **LLaMA family trait**, independent of context length. Would need to investigate LLaMA-specific architecture.