# Mistral Paradox Investigation

**Problem:** Mistral-7B shows a last-MLP gain of 0.58x (it CONTRACTS instead of EXPANDS).

**Hypotheses:**
1. **Silent Exit:** The explosion is hidden in the unembedding matrix W_U.
2. **Architecture vs. Training:** Does Apertus-8B (related) show an explosion at L28?
3. **Entropy Efficiency:** Mistral needs no explosion because its output is already sharp.

**Tests:**
1. Unembedding norm comparison (Mistral vs Pythia vs Gemma)
2. Apertus-8B FFN expansion analysis
3. Output logit entropy before softmax

In [None]:
# Cell 1: Imports and Setup
import torch
import numpy as np
import json
from datetime import datetime
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Report the runtime and record the accelerator name plus its total memory
# (bytes / 1e9) in `gpu_name` / `gpu_mem` for the cells that follow.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if not cuda_ok:
    gpu_name, gpu_mem = "CPU", 0
    print("Running on CPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_mem:.1f} GB")

In [None]:
# Cell 2: Model Definitions

# Registry of candidate models, keyed by a short name. Each entry holds:
#   hf_name   - identifier handed to from_pretrained() by load_model
#   params    - nominal parameter count
#   layers    - nominal number of transformer layers
#   family    - architecture tag used to choose attribute paths
#               (embedding matrix, layer list) in the analysis helpers
#   memory_gb - nominal memory budget in GB
# NOTE(review): in the cells shown here `params`, `layers` and `memory_gb`
# are never read; select_models hard-codes its own 20 / 22 thresholds.
MODELS_TO_TEST = {
    'mistral-7b': {
        'hf_name': 'mistralai/Mistral-7B-v0.1',
        'params': 7e9,
        'layers': 32,
        'family': 'mistral',
        'memory_gb': 20
    },
    # NOTE(review): confirm that this repository id exists on the Hub and that
    # the 'llama' attribute layout (model.model.layers / embed_tokens) applies.
    'apertus-8b': {
        'hf_name': 'jphme/Apertus-8b',  # Multilingual model
        'params': 8e9,
        'layers': 32,
        'family': 'llama',
        'memory_gb': 22
    },
    'pythia-6.9b': {
        'hf_name': 'EleutherAI/pythia-6.9b',
        'params': 6.9e9,
        'layers': 32,
        'family': 'pythia',
        'memory_gb': 20
    },
    # Not returned by select_models; only reachable by editing `selected`.
    'gemma-7b': {
        'hf_name': 'google/gemma-7b',
        'params': 7e9,
        'layers': 28,
        'family': 'gemma',
        'memory_gb': 20
    }
}

# Select based on GPU memory
def select_models(mem_gb):
    """Choose which registry keys to run for a device with ``mem_gb`` memory.

    Nothing is selected below 20. From 20 upwards Mistral-7B (the model under
    investigation) is always first, followed by exactly one comparison model:
    Apertus-8B when at least 22 is available, Pythia-6.9B otherwise.
    """
    # Written as `not >=` so that a NaN reading also yields an empty list.
    if not mem_gb >= 20:
        return []
    companion = 'apertus-8b' if mem_gb >= 22 else 'pythia-6.9b'
    return ['mistral-7b', companion]

# Without CUDA there is no GPU memory figure to select on, so only Mistral runs.
if torch.cuda.is_available():
    selected = select_models(gpu_mem)
else:
    selected = ['mistral-7b']
print(f"Models to test: {selected}")

In [None]:
# Cell 3: Load Model Function
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_key):
    """Load one ``MODELS_TO_TEST`` entry and its tokenizer.

    With CUDA available the weights are loaded as float16 with
    ``device_map='auto'``; otherwise as float32 and moved to the CPU. The
    model is switched to eval mode before it is returned.

    Args:
        model_key: Key into ``MODELS_TO_TEST``; an unknown key raises
            ``KeyError`` (the lookup happens outside the ``try``).

    Returns:
        ``(model, tokenizer, info)`` where ``info`` is the registry entry, or
        ``(None, None, None)`` if loading raised (the error is printed).
    """
    info = MODELS_TO_TEST[model_key]
    hf_name = info['hf_name']

    print(f"\nLoading {model_key} ({hf_name})...")

    on_gpu = torch.cuda.is_available()
    if on_gpu:
        load_kwargs = {'torch_dtype': torch.float16, 'device_map': 'auto'}
    else:
        load_kwargs = {'torch_dtype': torch.float32, 'device_map': None}

    try:
        # NOTE(review): trust_remote_code=True allows code shipped in the Hub
        # repository to run during loading; keep only for trusted sources.
        tokenizer = AutoTokenizer.from_pretrained(hf_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            hf_name,
            trust_remote_code=True,
            **load_kwargs
        )

        if not on_gpu:
            model = model.to('cpu')

        model.eval()
        print("  Loaded successfully!")
        return model, tokenizer, info

    except Exception as e:
        # Best effort: callers skip the model when they receive None.
        print(f"  Failed to load: {e}")
        return None, None, None

---
## TEST 1: Unembedding Matrix Norm Analysis

**Hypothesis:** Mistral hides the "explosion" in a statically large W_U (unembedding matrix) instead of dynamically in the final FFN.

**Prediction:** Mistral's W_U has disproportionally higher norm than Pythia/Gemma.

In [None]:
# Cell 4: Extract Unembedding Matrix

def get_unembedding_matrix(model, family):
    """Return the model's output-projection (unembedding) weights as float32.

    ``model.lm_head`` is tried first, then ``model.embed_out``.

    Raises:
        ValueError: if the model has neither attribute. ``family`` is used
            only to build this message.
    """
    for attr in ('lm_head', 'embed_out'):
        if hasattr(model, attr):
            # Convert to float32 for the norm analysis.
            return getattr(model, attr).weight.data.float()

    raise ValueError(f"Cannot find unembedding matrix for {family}")

def get_embedding_matrix(model, family):
    """Return the model's input-embedding weights as float32.

    The attribute path depends on ``family``: ``gpt_neox.embed_in`` for
    'pythia', ``model.embed_tokens`` for 'mistral' / 'llama' / 'gemma', and
    the generic ``get_input_embeddings()`` accessor for anything else.
    """
    if family == 'pythia':
        embedding = model.gpt_neox.embed_in
    elif family in ('mistral', 'llama', 'gemma'):
        embedding = model.model.embed_tokens
    else:
        embedding = model.get_input_embeddings()

    return embedding.weight.data.float()

def analyze_matrix_norms(W, name="Matrix"):
    """Compute and print summary norms of a 2-D weight matrix.

    Args:
        W: 2-D tensor. It is moved to the CPU before anything is computed.
        name: Label used in the printed report.

    Returns:
        dict with the keys 'shape', 'frobenius_norm', 'spectral_norm',
        'max_singular', 'mean_row_norm', 'std_row_norm', 'max_row_norm',
        'min_row_norm', 'mean_abs', 'std_abs', 'frobenius_normalized'
        (Frobenius norm / sqrt(rows * cols)) and 'spectral_normalized'
        (spectral norm / sqrt(cols)).

    Raises:
        ValueError: if ``W`` is not 2-D (the shape cannot be unpacked).
    """
    # Move to CPU for analysis
    W_cpu = W.cpu()
    vocab_size, hidden_dim = W_cpu.shape

    # The spectral (2-)norm IS the largest singular value, so a single SVD
    # feeds both 'spectral_norm' and 'max_singular'. Previously the matrix was
    # decomposed twice, which dominates the runtime for vocab-sized matrices.
    top_singular = float(torch.linalg.svdvals(W_cpu)[0].item())

    # Row norms and absolute values are each reused by several statistics.
    row_norms = torch.norm(W_cpu, dim=1)
    abs_vals = W_cpu.abs()

    results = {
        'shape': list(W_cpu.shape),
        'frobenius_norm': float(torch.norm(W_cpu, p='fro').item()),
        'spectral_norm': top_singular,
        'max_singular': top_singular,
        'mean_row_norm': float(row_norms.mean().item()),
        'std_row_norm': float(row_norms.std().item()),
        'max_row_norm': float(row_norms.max().item()),
        'min_row_norm': float(row_norms.min().item()),
        'mean_abs': float(abs_vals.mean().item()),
        'std_abs': float(abs_vals.std().item())
    }

    # Normalized by dimensions
    results['frobenius_normalized'] = float(
        results['frobenius_norm'] / np.sqrt(vocab_size * hidden_dim)
    )
    results['spectral_normalized'] = float(
        results['spectral_norm'] / np.sqrt(hidden_dim)
    )

    print(f"\n{name} Analysis:")
    print(f"  Shape: {results['shape']}")
    print(f"  Frobenius Norm: {results['frobenius_norm']:.2f}")
    print(f"  Spectral Norm: {results['spectral_norm']:.2f}")
    print(f"  Max Singular Value: {results['max_singular']:.2f}")
    print(f"  Mean Row Norm: {results['mean_row_norm']:.4f}")
    print(f"  Row Norm Std: {results['std_row_norm']:.4f}")

    return results

In [None]:
# Cell 5: Run Unembedding Analysis on All Models

# Per-model norm statistics of W_U and W_E, keyed by model key.
unembedding_results = {}

for model_key in selected:
    print(f"\n{'='*60}")
    print(f"Analyzing {model_key}")
    print(f"{'='*60}")

    model, tokenizer, info = load_model(model_key)
    if model is None:
        continue

    family = info['family']
    # Pre-bind so the cleanup below works even if extraction fails.
    W_U = W_E = None

    # Get matrices
    try:
        W_U = get_unembedding_matrix(model, family)
        W_E = get_embedding_matrix(model, family)

        # Analyze
        unembed_analysis = analyze_matrix_norms(W_U, f"{model_key} Unembedding (W_U)")
        embed_analysis = analyze_matrix_norms(W_E, f"{model_key} Embedding (W_E)")

        # Compute ratio
        ratio = unembed_analysis['frobenius_norm'] / embed_analysis['frobenius_norm']
        print(f"\n  W_U / W_E Frobenius Ratio: {ratio:.2f}")

        # Check if tied embeddings. Matrices of different shape cannot be
        # tied; testing the shape first keeps torch.allclose from raising and
        # throwing away the analysis that has already been computed.
        tied = W_U.shape == W_E.shape and bool(torch.allclose(W_U, W_E, atol=1e-3))
        print(f"  Tied Embeddings: {tied}")

        unembedding_results[model_key] = {
            'family': family,
            'unembedding': unembed_analysis,
            'embedding': embed_analysis,
            'wu_we_ratio': float(ratio),
            'tied_embeddings': tied
        }

    except Exception as e:
        print(f"  Error: {e}")

    finally:
        # Cleanup. W_U / W_E either alias the model's float32 weights or are
        # float32 copies on the same device (float16 models), so they must be
        # released together with the model; otherwise they are still resident
        # while the next model is being loaded.
        del model, W_U, W_E
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n\nCompleted unembedding analysis for {len(unembedding_results)} models.")

---
## TEST 2: FFN Expansion with Corrected Hooks

**Problem:** Previous hooks may have measured post-RMSNorm activations.

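**Solution:** Hook each whole transformer layer instead of its submodules and record the norm of the layer's output; the output of layer i-1 serves as the "before" value for layer i, so the gain of a layer is the ratio of two consecutive output norms.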

In [None]:
# Cell 6: Corrected Hook System - Measure Residual Stream

class ResidualStreamAnalyzer:
    """Record the norm of every transformer layer's output in a forward pass.

    Forward hooks are attached to the whole layers rather than to their
    submodules, so what is measured is the layer output (the residual stream)
    and not an intermediate, normalised activation.
    """

    def __init__(self, model, model_info):
        self.model = model
        self.model_info = model_info
        self.hooks = []            # handles returned by register_forward_hook
        self.residual_norms = []   # (layer_idx, norm) pairs, in call order

    def _get_layers(self):
        """Return the model's layer container, selected by ``family``.

        Raises:
            ValueError: for a family with no known attribute path.
        """
        family = self.model_info['family']

        if family in ('mistral', 'llama', 'gemma'):
            return self.model.model.layers
        if family == 'pythia':
            return self.model.gpt_neox.layers
        raise ValueError(f"Unknown family: {family}")

    def _make_hook(self, layer_idx):
        """Build a forward hook that logs the output norm for ``layer_idx``."""
        def hook(module, args, output):
            # A layer may return (hidden_states, ...) or the bare tensor.
            hidden = output[0] if isinstance(output, tuple) else output

            with torch.no_grad():
                # One norm over the whole tensor, computed in float32.
                self.residual_norms.append(
                    (layer_idx, hidden.float().norm().item())
                )

        return hook

    def register_hooks(self):
        """Attach one output hook to every layer and report how many exist."""
        for idx, layer in enumerate(self._get_layers()):
            handle = layer.register_forward_hook(self._make_hook(idx))
            self.hooks.append(handle)

        print(f"  Registered {len(self.hooks)} residual stream hooks")

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for handle in self.hooks:
            handle.remove()
        self.hooks = []

    def clear(self):
        """Discard all recorded norms."""
        self.residual_norms = []

    def get_layer_gains(self):
        """Return ``(gains, norms)`` with both lists ordered by layer index.

        ``gains[i]`` is ``norms[i + 1] / norms[i]``; it is reported as 0.0
        when the denominator is not above 1e-8. ``gains`` therefore has one
        entry fewer than ``norms``.
        """
        ordered = sorted(self.residual_norms, key=lambda pair: pair[0])
        norms = [value for _, value in ordered]

        gains = [
            float(curr / prev) if prev > 1e-8 else 0.0
            for prev, curr in zip(norms, norms[1:])
        ]

        return gains, norms

In [None]:
# Cell 7: Run Residual Stream Analysis

import gc

# Per-model residual-stream gains and norms, keyed by model key.
residual_results = {}

for model_key in selected:
    print(f"\n{'='*60}")
    print(f"Residual Stream Analysis: {model_key}")
    print(f"{'='*60}")

    model, tokenizer, info = load_model(model_key)
    if model is None:
        continue

    # Setup analyzer
    analyzer = ResidualStreamAnalyzer(model, info)
    outputs = None

    try:
        analyzer.register_hooks()

        # Test prompt
        prompt = "The capital of France is"
        inputs = tokenizer(prompt, return_tensors='pt')

        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Forward pass
        print(f"  Running forward pass...")
        with torch.no_grad():
            outputs = model(**inputs)

        # Get gains. gains[i] = norm(output of layer i+1) / norm(output of
        # layer i), so every index reported below names the transition
        # i -> i+1, and there is one gain fewer than there are layers.
        gains, norms = analyzer.get_layer_gains()

        if gains:
            # Statistics
            n_layers = len(norms)   # hooked layers (was len(gains): off by one)
            n_gains = len(gains)    # layer-to-layer transitions
            contracting = sum(1 for g in gains if g < 1.0)
            last_gain = gains[-1]
            max_gain = max(gains)
            max_gain_layer = gains.index(max_gain)

            print(f"\n  Results:")
            print(f"  Layers: {n_layers}")
            print(f"  Contracting: {contracting}/{n_gains} ({100*contracting/n_gains:.1f}%)")
            print(f"  Last Layer Gain: {last_gain:.4f}")
            print(f"  Max Gain: {max_gain:.4f} at Layer {max_gain_layer}")
            print(f"  Final Norm: {norms[-1]:.2f}")

            # Find explosion point
            expansion_layers = [(i, g) for i, g in enumerate(gains) if g > 1.0]
            if expansion_layers:
                print(f"\n  Expansion Layers (gain > 1):")
                # First 5 in layer order (not the 5 largest gains).
                for i, g in expansion_layers[:5]:
                    print(f"    Layer {i}: {g:.4f}")

            residual_results[model_key] = {
                'family': info['family'],
                'n_layers': n_layers,
                'gains': gains,
                'norms': norms,
                'contracting_pct': float(100 * contracting / n_gains),
                'last_gain': float(last_gain),
                'max_gain': float(max_gain),
                'max_gain_layer': int(max_gain_layer),
                'expansion_layers': [(int(i), float(g)) for i, g in expansion_layers]
            }

    except Exception as e:
        # Same best-effort policy as the other analysis cells: report, move on.
        print(f"  Error: {e}")

    finally:
        # Cleanup. `analyzer.model` is a second reference to the model, so
        # `del model` alone would keep the weights resident while the next
        # model is loaded. Drop every reference, then collect and empty the
        # CUDA cache.
        analyzer.remove_hooks()
        del analyzer, model
        outputs = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n\nCompleted residual stream analysis for {len(residual_results)} models.")

---
## TEST 3: Output Logit Entropy Analysis

**Hypothesis:** Mistral produces sharper logits (lower entropy) without needing FFN explosion.

In [None]:
# Cell 8: Logit Entropy Analysis

def analyze_logit_entropy(model, tokenizer, prompt="The capital of France is"):
    """Summarise the next-token prediction the model makes for ``prompt``.

    Runs one forward pass, takes the logits at the last position of the first
    batch element, and reports raw logit statistics (before softmax) together
    with sharpness measures of the softmax distribution.

    Args:
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``logits`` attribute.
        tokenizer: Callable as ``tokenizer(prompt, return_tensors='pt')``.
        prompt: Text to condition on.

    Returns:
        dict of floats: 'logit_mean', 'logit_std', 'logit_max', 'logit_min',
        'logit_range', 'entropy' (nats), 'effective_vocab' (exp(entropy)) and
        'top10_prob_mass' (probability of the 10 most likely tokens, or of
        all tokens when the vocabulary has fewer than 10 entries).
    """
    inputs = tokenizer(prompt, return_tensors='pt')
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits  # indexed below as [batch, seq, vocab]

    # Take last token's logits
    last_logits = logits[0, -1, :].float().cpu()

    # Statistics of the raw logits (before softmax)
    logit_max = last_logits.max().item()
    logit_min = last_logits.min().item()

    # Entropy of the softmax distribution. entr(p) = -p * ln(p) and is 0 at
    # p == 0, so no epsilon has to be added inside the logarithm.
    probs = torch.softmax(last_logits, dim=0)
    entropy = torch.special.entr(probs).sum().item()

    # Top-k concentration; clamp k so tiny vocabularies do not make topk raise.
    top_k = min(10, probs.numel())
    top_k_probs = probs.topk(top_k).values.sum().item()

    results = {
        'logit_mean': float(last_logits.mean().item()),
        'logit_std': float(last_logits.std().item()),
        'logit_max': float(logit_max),
        'logit_min': float(logit_min),
        'logit_range': float(logit_max - logit_min),
        'entropy': float(entropy),
        # Effective number of choices (exp(entropy))
        'effective_vocab': float(np.exp(entropy)),
        'top10_prob_mass': float(top_k_probs)
    }

    return results

In [None]:
# Cell 9: Run Entropy Analysis

# Per-model logit / entropy statistics, keyed by model key.
entropy_results = {}
rule = '=' * 60

for model_key in selected:
    print(f"\n{rule}")
    print(f"Entropy Analysis: {model_key}")
    print(rule)

    model, tokenizer, info = load_model(model_key)
    if model is None:
        continue

    try:
        results = analyze_logit_entropy(model, tokenizer)

        print("\n  Logit Statistics (before softmax):")
        for label, key in (("Mean", 'logit_mean'),
                           ("Std", 'logit_std'),
                           ("Range", 'logit_range')):
            print(f"    {label}: {results[key]:.2f}")

        print("\n  Entropy Statistics:")
        print(f"    Entropy: {results['entropy']:.2f} nats")
        print(f"    Effective Vocab: {results['effective_vocab']:.0f} tokens")
        print(f"    Top-10 Prob Mass: {results['top10_prob_mass']:.2%}")

        # 'family' goes first, followed by every statistic.
        entry = {'family': info['family']}
        entry.update(results)
        entropy_results[model_key] = entry

    except Exception as e:
        print(f"  Error: {e}")

    # Release the model before the next one is loaded.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n\nCompleted entropy analysis for {len(entropy_results)} models.")

In [None]:
# Cell 10: Visualization

import os
results_dir = '../Results'
os.makedirs(results_dir, exist_ok=True)

# 2x2 overview: gains, norms, embedding norms, output sharpness.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

colors = {'mistral': 'purple', 'pythia': 'blue', 'gemma': 'red', 'llama': 'green'}

# Panel 1: Residual Stream Gains
ax1 = axes[0, 0]
for model_key, res in residual_results.items():
    color = colors.get(res['family'], 'gray')
    layers = range(len(res['gains']))
    ax1.plot(layers, res['gains'], 'o-', label=model_key, color=color, alpha=0.7)
ax1.axhline(y=1.0, color='black', linestyle='--', alpha=0.5)
ax1.set_xlabel('Layer')
ax1.set_ylabel('Residual Stream Gain')
ax1.set_title('Residual Stream Gain per Layer')
if residual_results:
    # legend() complains when there is no labelled artist to show.
    ax1.legend()
# NOTE(review): fixed zoom around 1.0; any gain outside [0.9, 1.1] (such as
# a 0.58x contraction) is clipped from view in this panel.
ax1.set_ylim(0.9, 1.1)

# Panel 2: Residual Stream Norms
ax2 = axes[0, 1]
for model_key, res in residual_results.items():
    color = colors.get(res['family'], 'gray')
    layers = range(len(res['norms']))
    ax2.plot(layers, res['norms'], 's-', label=model_key, color=color, alpha=0.7)
ax2.set_xlabel('Layer')
ax2.set_ylabel('Residual Norm')
ax2.set_title('Residual Stream Norm per Layer')
if residual_results:
    ax2.legend()

# Panel 3: Unembedding Norms Comparison
ax3 = axes[1, 0]
if unembedding_results:
    models = list(unembedding_results.keys())
    wu_norms = [unembedding_results[m]['unembedding']['spectral_norm'] for m in models]
    we_norms = [unembedding_results[m]['embedding']['spectral_norm'] for m in models]

    x = np.arange(len(models))
    width = 0.35

    ax3.bar(x - width/2, wu_norms, width, label='W_U (Unembedding)', color='darkred')
    ax3.bar(x + width/2, we_norms, width, label='W_E (Embedding)', color='darkblue')

    ax3.set_xlabel('Model')
    ax3.set_ylabel('Spectral Norm')
    ax3.set_title('Embedding vs Unembedding Spectral Norm')
    ax3.set_xticks(x)
    ax3.set_xticklabels(models, rotation=15)
    ax3.legend()

# Panel 4: Entropy Comparison
ax4 = axes[1, 1]
if entropy_results:
    models = list(entropy_results.keys())
    entropies = [entropy_results[m]['entropy'] for m in models]
    eff_vocabs = [entropy_results[m]['effective_vocab'] for m in models]

    # Second y-axis: entropy and effective vocab live on different scales.
    ax4_twin = ax4.twinx()

    x = np.arange(len(models))
    entropy_bars = ax4.bar(x - 0.2, entropies, 0.4, label='Entropy (nats)', color='green', alpha=0.7)
    vocab_bars = ax4_twin.bar(x + 0.2, eff_vocabs, 0.4, label='Effective Vocab', color='orange', alpha=0.7)

    ax4.set_xlabel('Model')
    ax4.set_ylabel('Entropy (nats)', color='green')
    ax4_twin.set_ylabel('Effective Vocab Size', color='orange')
    ax4.set_title('Output Distribution Sharpness')
    ax4.set_xticks(x)
    ax4.set_xticklabels(models, rotation=15)
    # The bars carry labels but were never shown; one legend covers both
    # axes. It is drawn on the twin axis, which sits on top of ax4.
    ax4_twin.legend(handles=[entropy_bars, vocab_bars])

plt.tight_layout()
output_path = f'{results_dir}/mistral_paradox_investigation.png'
plt.savefig(output_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {output_path}")

In [None]:
# Cell 11: Summary and Conclusions

# Plain-text recap of the three analyses. Iterating an empty result dict
# prints nothing, so each section header appears even without data.
rule = "=" * 60
thin_rule = "-" * 40

print(rule)
print("MISTRAL PARADOX INVESTIGATION SUMMARY")
print(rule)

print("\n1. UNEMBEDDING MATRIX ANALYSIS")
print(thin_rule)
for m, res in unembedding_results.items():
    print(f"  {m}:")
    print(f"    W_U Spectral Norm: {res['unembedding']['spectral_norm']:.2f}")
    print(f"    W_E Spectral Norm: {res['embedding']['spectral_norm']:.2f}")
    print(f"    W_U/W_E Ratio: {res['wu_we_ratio']:.2f}")
    print(f"    Tied: {res['tied_embeddings']}")

print("\n2. RESIDUAL STREAM ANALYSIS")
print(thin_rule)
for m, res in residual_results.items():
    print(f"  {m}:")
    print(f"    Contracting: {res['contracting_pct']:.1f}%")
    print(f"    Last Layer Gain: {res['last_gain']:.4f}")
    print(f"    Max Gain: {res['max_gain']:.4f} at Layer {res['max_gain_layer']}")
    # Indices of the first three expanding transitions, if there are any.
    first_expanding = [layer for layer, _ in res['expansion_layers'][:3]]
    if first_expanding:
        print(f"    Expansion at: {first_expanding}")

print("\n3. ENTROPY ANALYSIS")
print(thin_rule)
for m, res in entropy_results.items():
    print(f"  {m}:")
    print(f"    Entropy: {res['entropy']:.2f} nats")
    print(f"    Effective Vocab: {res['effective_vocab']:.0f}")
    print(f"    Logit Range: {res['logit_range']:.2f}")

print("\n" + rule)
print("CONCLUSIONS")
print(rule)
print("""
Based on the results, determine:

1. SILENT EXIT HYPOTHESIS:
   - If Mistral W_U >> Pythia W_U: Confirmed (static amplifier)
   - If similar: Rejected

2. RESIDUAL STREAM EXPLOSION:
   - Check if gains are near 1.0 throughout (RMSNorm effect)
   - Or if explosion happens earlier than expected

3. ENTROPY EFFICIENCY:
   - If Mistral entropy << Pythia: Confirmed (sharper outputs)
   - If similar: Rejected
""")

In [None]:
# Cell 12: Save Results

import os
results_dir = '../Results'
os.makedirs(results_dir, exist_ok=True)

# Combine all results
output = {
    'experiment': 'Mistral Paradox Investigation',
    'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    # Union of every model that produced at least one result; sorted so the
    # saved file does not depend on set iteration order.
    'models_tested': sorted({*unembedding_results,
                             *residual_results,
                             *entropy_results}),
    'unembedding_analysis': unembedding_results,
    'residual_stream_analysis': residual_results,
    'entropy_analysis': entropy_results,
    'hypotheses': {
        'silent_exit': 'Check W_U spectral norm comparison',
        'residual_explosion': 'Check residual stream gains',
        'entropy_efficiency': 'Check output entropy comparison'
    }
}

output_path = f'{results_dir}/mistral_paradox_investigation_results.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2)
print(f"Saved: {output_path}")

# Auto-download for Colab
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the files stay in results_dir.
    pass
else:
    # Still best effort, but a failed download (for example a figure that was
    # never generated) is reported instead of being swallowed silently.
    try:
        files.download(output_path)
        files.download(f'{results_dir}/mistral_paradox_investigation.png')
        print("\nFiles downloaded!")
    except Exception as e:
        print(f"\nDownload skipped: {e}")