# Cross-Model Validation: Apertus-8B Layer Analysis

**Purpose:** Test whether a phase-structured embedding–output relationship exists in multilingual models.

**Research Question:** Does Apertus-8B (massively multilingual) show:
1. The same late-layer inversion as Pythia/Llama?
2. A different phase structure due to multilingual compression?

**Context:** The original paper showed a complex pattern for this model, attributed to multilingual training.
The layer-wise analysis will reveal whether compression affects the phase structure.

**Expected Runtime:** ~2.5h on A100

---

**Author:** Davide D'Elia  
**Date:** 2026-01-03  
**Model:** swiss-ai/Apertus-8B-2509 (Multilingual)

## 1. Setup

In [None]:
!pip install -q transformers accelerate torch numpy scipy matplotlib scikit-learn huggingface_hub

In [None]:
import json
import warnings
from datetime import datetime
from typing import Dict, List, Tuple

import numpy as np
import torch
import matplotlib.pyplot as plt
from scipy import stats
from transformers import AutoModelForCausalLM, AutoTokenizer

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Analysis-wide constants: RNG seed and bootstrap configuration.
RANDOM_SEED = 42
N_BOOTSTRAP = 10000
CI_LEVEL = 0.95

# Seed the NumPy and PyTorch global RNGs with the same value.
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)

# Report the runtime environment; CUDA availability is queried once and reused.
_cuda_ok = torch.cuda.is_available()
print(f'PyTorch: {torch.__version__}')
print(f'CUDA: {_cuda_ok}')
if _cuda_ok:
    print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
# Apertus is open - no token required
# CRITICAL: Apertus has dtype problems with device_map='auto'
# Solution: load manually, convert, then move to the GPU

MODEL_NAME = 'swiss-ai/Apertus-8B-2509'
MODEL_DISPLAY = 'Apertus-8B'

print(f'Loading {MODEL_DISPLAY} (Swiss AI - Multilingual)...')
print('NOTE: Loading without device_map to avoid dtype issues')

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fall back to the EOS token when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 1: Load model (will be on CPU initially)
# float32 is requested explicitly and hidden states are enabled at load time.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    output_hidden_states=True
)

# Step 2: Force ALL parameters to float32
# (second conversion on top of torch_dtype above; part of the dtype workaround)
model = model.float()

# Step 3: Move to GPU
# NOTE(review): unconditional .cuda() - this cell requires a CUDA device.
model = model.cuda()

# Verify dtypes
# Collect the distinct parameter dtypes so a mixed-precision load is visible.
param_dtypes = set(p.dtype for p in model.parameters())
print(f'Parameter dtypes after conversion: {param_dtypes}')

print(f'Model loaded on: {next(model.parameters()).device}')
print(f'Layers: {model.config.num_hidden_layers}')
print(f'Hidden size: {model.config.hidden_size}')
print(f'Vocab size: {model.config.vocab_size} (multilingual)')

## 3. Load Dataset

In [None]:
!wget -q https://raw.githubusercontent.com/buk81/uniformity-asymmetry/main/dataset.json

# Read the statement-pair dataset. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default text encoding.
with open('dataset.json', 'r', encoding='utf-8') as f:
    DATASET = json.load(f)

# Flatten the per-category structure into one list of records. Each category
# entry exposes a 'pairs' list whose items are indexed as [stmt_a, stmt_b].
ALL_PAIRS = []
for cat_name, cat_data in DATASET.items():
    ALL_PAIRS.extend(
        {'stmt_a': pair[0], 'stmt_b': pair[1], 'category': cat_name}
        for pair in cat_data['pairs']
    )

print(f'Categories: {list(DATASET.keys())}')
print(f'Total pairs: {len(ALL_PAIRS)}')

## 4. Core Functions

In [None]:
def get_layer_embedding(text, model, tokenizer, layer_idx):
    """Return the mean-pooled hidden state of ``text`` at one layer.

    Args:
        text: Input string; tokenized with truncation at 512 tokens.
        model: Model called with ``output_hidden_states=True``; must expose
            ``.device`` and return an object with ``.hidden_states``.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        layer_idx: Index into the returned ``hidden_states`` sequence.

    Returns:
        A float32 NumPy vector: the average over all positions except
        position 0 of the first (only) batch item.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    encoded = encoded.to(model.device)

    # Inference only - no autograd bookkeeping.
    with torch.no_grad():
        all_hidden = model(**encoded, output_hidden_states=True).hidden_states

    # Position 0 is skipped (presumably a BOS token - confirm for this
    # tokenizer); the remaining positions are averaged.
    pooled = all_hidden[layer_idx][0, 1:, :].mean(dim=0)
    return pooled.cpu().numpy().astype(np.float32)

def get_output_preference(text_a, text_b, model, tokenizer):
    """Return ``loss(text_b) - loss(text_a)`` under the model.

    Each text is scored by passing its own input ids as labels and reading
    the loss the model reports. A positive result means ``text_a`` received
    the lower loss.

    Args:
        text_a: First statement of the pair.
        text_b: Second statement of the pair.
        model: Model accepting ``labels=`` and exposing ``.device``.
        tokenizer: Callable tokenizer returning PyTorch tensors.

    Returns:
        The loss difference as a Python float.
    """
    def get_nll(text):
        # Tokenize (truncated to 512 tokens) and score the text against itself.
        encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        encoded = encoded.to(model.device)
        with torch.no_grad():
            scored = model(**encoded, labels=encoded['input_ids'])
        return scored.loss.item()

    nll_b = get_nll(text_b)
    nll_a = get_nll(text_a)
    return nll_b - nll_a

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a float.

    A constant 1e-10 is added to the denominator, so a zero vector yields
    0.0 instead of a division by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    similarity = np.dot(a, b) / (norm_product + 1e-10)
    return float(similarity)

def bootstrap_correlation(x, y, n_bootstrap=10000, ci_level=0.95):
    """Pearson correlation with a percentile-bootstrap confidence interval.

    Args:
        x: Sequence or array of observations.
        y: Sequence or array of observations, same length as ``x``.
        n_bootstrap: Number of resamples drawn with replacement.
        ci_level: Confidence level of the interval (e.g. 0.95).

    Returns:
        Tuple ``(r_obs, ci_lo, ci_hi, p)`` of Python floats: the observed
        Pearson r, the lower and upper percentile bounds of the bootstrap
        distribution, and the p-value of the observed correlation. The
        bounds are NaN when no usable resample was produced.

    Resampling draws from NumPy's global RNG, so results are reproducible
    only if the caller seeds ``np.random``.
    """
    # Coerce to arrays so plain lists support the fancy indexing used below.
    x = np.asarray(x)
    y = np.asarray(y)

    n = len(x)
    r_obs, p = stats.pearsonr(x, y)

    rs = []
    for _ in range(n_bootstrap):
        idx = np.random.choice(n, size=n, replace=True)
        x_sample, y_sample = x[idx], y[idx]
        # Skip degenerate resamples: Pearson r is undefined for constant input.
        if np.std(x_sample) > 0 and np.std(y_sample) > 0:
            r, _ = stats.pearsonr(x_sample, y_sample)
            rs.append(r)

    # No usable resample (n_bootstrap == 0 or all degenerate): report an
    # undefined interval explicitly rather than taking percentiles of nothing.
    if not rs:
        return float(r_obs), float('nan'), float('nan'), float(p)

    rs = np.array(rs)
    alpha = 1 - ci_level
    return float(r_obs), float(np.percentile(rs, alpha/2*100)), float(np.percentile(rs, (1-alpha/2)*100)), float(p)

print('Functions defined.')

## 5. Collect Embeddings

In [None]:
N_LAYERS = model.config.num_hidden_layers
# Sample every 4th hidden-state index (as in the Pythia analysis) and make
# sure the last index, N_LAYERS, is always part of the selection.
LAYERS_TO_TEST = list(range(0, N_LAYERS + 1, 4))
if N_LAYERS not in LAYERS_TO_TEST:
    LAYERS_TO_TEST.append(N_LAYERS)

print(f'Total layers: {N_LAYERS}')
print(f'Testing layers: {LAYERS_TO_TEST}')
print(f'Collecting embeddings for {len(ALL_PAIRS)} pairs...')


def _embed_tested_layers(text):
    """Return {layer_idx: pooled embedding} for every tested layer of ``text``.

    A single forward pass already yields all hidden states, so every entry of
    LAYERS_TO_TEST is pooled from that one pass instead of re-running the
    model once per layer. Tokenization and pooling (mean over positions 1..
    of the first batch item, cast to float32) mirror get_layer_embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        hidden_states = model(**inputs, output_hidden_states=True).hidden_states
    return {
        layer_idx: hidden_states[layer_idx][0, 1:, :].mean(dim=0).cpu().numpy().astype(np.float32)
        for layer_idx in LAYERS_TO_TEST
    }


pair_data = []
start = datetime.now()

for i, pair in enumerate(ALL_PAIRS):
    # Progress report every 25 pairs, with elapsed minutes.
    if (i + 1) % 25 == 0:
        elapsed = (datetime.now() - start).total_seconds() / 60
        print(f'  [{i+1:03d}/{len(ALL_PAIRS)}] - {elapsed:.1f} min')

    pref = get_output_preference(pair['stmt_a'], pair['stmt_b'], model, tokenizer)

    # One forward pass per statement covers all tested layers.
    embs_a = _embed_tested_layers(pair['stmt_a'])
    embs_b = _embed_tested_layers(pair['stmt_b'])
    layer_embs = {
        layer_idx: {'a': embs_a[layer_idx], 'b': embs_b[layer_idx]}
        for layer_idx in LAYERS_TO_TEST
    }
    pair_data.append({'pref': pref, 'cat': pair['category'], 'embs': layer_embs})

print(f'Done in {(datetime.now() - start).total_seconds() / 60:.1f} min')

## 6. Pair-Level Analysis

In [None]:
def compute_centroid_asymmetry(pair_data, layer_idx):
    """Per-pair difference in centroid similarity between the two sides.

    For each side ('a' and 'b') the embeddings at ``layer_idx`` are stacked,
    their mean (centroid) is taken, and every embedding's cosine similarity
    to its own side's centroid is computed.

    Args:
        pair_data: Sequence of records with ``record['embs'][layer_idx]``
            holding the 'a' and 'b' embeddings.
        layer_idx: Layer key to read from each record.

    Returns:
        Array with one value per pair: similarity of the 'a' embedding to the
        'a' centroid minus similarity of the 'b' embedding to the 'b' centroid.
    """
    def similarity_to_own_centroid(side):
        side_embs = np.array([record['embs'][layer_idx][side] for record in pair_data])
        centroid = side_embs.mean(0)
        return np.array([cosine_similarity(emb, centroid) for emb in side_embs])

    sim_a = similarity_to_own_centroid('a')
    sim_b = similarity_to_own_centroid('b')
    return sim_a - sim_b

# Output preference of every pair, aligned with pair_data order.
all_prefs = np.array([record['pref'] for record in pair_data])

_banner = '=' * 70
print(_banner)
print(f' {MODEL_DISPLAY}: PAIR-LEVEL ANALYSIS (n={len(pair_data)})')
print(_banner)

# Per-layer correlation between centroid asymmetry and output preference.
results = {}
for layer_idx in LAYERS_TO_TEST:
    metric = compute_centroid_asymmetry(pair_data, layer_idx)
    r, ci_lo, ci_hi, p = bootstrap_correlation(metric, all_prefs, N_BOOTSTRAP, CI_LEVEL)

    # A layer counts as significant when the bootstrap interval excludes zero.
    excludes_zero = not (ci_lo <= 0 <= ci_hi)
    sig = '***' if excludes_zero else ''

    results[layer_idx] = {'r': r, 'ci_lo': ci_lo, 'ci_hi': ci_hi, 'sig': excludes_zero}
    print(f'Layer {layer_idx:2d}: r={r:+.3f} CI=[{ci_lo:+.3f},{ci_hi:+.3f}] {sig}')

## 7. Phase Structure

In [None]:
# Split the tested layers into three consecutive phases (thirds by count).
n_layers = len(LAYERS_TO_TEST)
_first_cut = n_layers // 3
_second_cut = 2 * n_layers // 3
early_layers = LAYERS_TO_TEST[:_first_cut]
mid_layers = LAYERS_TO_TEST[_first_cut:_second_cut]
late_layers = LAYERS_TO_TEST[_second_cut:]

# Mean correlation per phase.
early_mean, mid_mean, late_mean = (
    np.mean([results[l]['r'] for l in phase])
    for phase in (early_layers, mid_layers, late_layers)
)

print(f'\nPhase Structure:')
for _label, _phase, _mean in (
    ('Early', early_layers, early_mean),
    ('Mid', mid_layers, mid_mean),
    ('Late', late_layers, late_mean),
):
    print(f'  {_label:<5} {_phase}: mean r = {_mean:+.3f}')

# Classify the pattern: a clearly negative late phase takes precedence,
# then uniformly weak correlations, otherwise unclassified.
if late_mean < -0.1:
    if early_mean > 0.1:
        pattern = 'PYTHIA_PATTERN: positive early, negative late'
    else:
        pattern = 'LATE_INVERSION: negative in late layers'
elif all(abs(m) < 0.15 for m in (early_mean, mid_mean, late_mean)):
    pattern = 'DECOUPLED: weak correlations throughout'
else:
    pattern = 'OTHER'

print(f'\n>>> Pattern: {pattern} <<<')

## 8. Comparison with Other Models

In [None]:
_rule = '=' * 70
print(_rule)
print(' CROSS-MODEL COMPARISON (All 4 Original Models)')
print(_rule)

print(f'\nPhase Means Comparison:')
print(f'{"Model":<30} {"Early":>10} {"Mid":>10} {"Late":>10}')
print('-' * 60)

# Hard-coded reference phase means (early, mid, late) for the other models.
_reference_rows = (
    ('Pythia-6.9B (Base)', +0.44, +0.36, -0.17),
    ('Llama-3.1-8B (Base)', +0.05, -0.16, -0.30),
    ('Llama-3.1 Instruct+Template', -0.47, -0.58, -0.60),
)
for _name, _early, _mid, _late in _reference_rows:
    print(f'{_name:<30} {_early:>+10.2f} {_mid:>+10.2f} {_late:>+10.2f}')
# Gemma has no values yet; the row is a placeholder.
print(f'{"Gemma-2B (SFT)":<30} {"TBD":>10} {"TBD":>10} {"TBD":>10}')
# Row for the model analysed in this notebook.
print(f'{MODEL_DISPLAY + " (Multilingual)":<30} {early_mean:>+10.2f} {mid_mean:>+10.2f} {late_mean:>+10.2f}')

# Check late layer inversion on the last tested layer.
final_layer = LAYERS_TO_TEST[-1]
final_r = results[final_layer]['r']
print(f'\nFinal layer ({final_layer}) correlation: r = {final_r:+.3f}')

# "Confirmed" needs both a clearly negative r and a significant interval.
if final_r < -0.1 and results[final_layer]['sig']:
    _verdict = '>>> LATE-LAYER INVERSION CONFIRMED <<<'
elif final_r < 0:
    _verdict = '>>> Negative trend in late layer <<<'
else:
    _verdict = '>>> NO late-layer inversion <<<'
print(_verdict)

## 9. Visualization

In [None]:
# Layer-wise correlation plot with bootstrap confidence intervals.
fig, ax = plt.subplots(figsize=(10, 6))

layers = list(results.keys())
rs = [results[l]['r'] for l in layers]
ci_los = [results[l]['ci_lo'] for l in layers]
ci_his = [results[l]['ci_hi'] for l in layers]

# errorbar expects non-negative distances from the point to each bound.
# A percentile-bootstrap interval is not guaranteed to bracket the observed r,
# so each distance is clamped at 0 to keep the call from failing in that case.
yerr_lo = [max(r - ci_lo, 0.0) for r, ci_lo in zip(rs, ci_los)]
yerr_hi = [max(ci_hi - r, 0.0) for r, ci_hi in zip(rs, ci_his)]

ax.errorbar(layers, rs, yerr=[yerr_lo, yerr_hi], fmt='o-', capsize=5,
            capthick=2, markersize=8, color='purple', label=MODEL_DISPLAY)

# Mark significant layers (interval excludes zero) with a red star.
for l, r in zip(layers, rs):
    if results[l]['sig']:
        ax.scatter([l], [r], color='red', s=150, zorder=5, marker='*')

# Zero line for reference.
ax.axhline(0, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel('Layer', fontsize=12)
ax.set_ylabel('r(centroid_asymmetry, output)', fontsize=12)
ax.set_title(f'{MODEL_DISPLAY} (Multilingual): Layer-wise Correlation\nPattern: {pattern}', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('apertus_layer_analysis.png', dpi=150)
plt.show()
print('Plot saved')

## 10. Save Results

In [None]:
# One timestamp for both the JSON field and the filename so they agree.
_saved_at = datetime.now()

# Everything needed to reproduce the tables and plot from this run.
save_data = {
    'timestamp': _saved_at.isoformat(),
    'model': MODEL_NAME,
    'model_display': MODEL_DISPLAY,
    'model_type': 'Multilingual (Swiss AI)',
    'n_layers': N_LAYERS,
    'layers_tested': LAYERS_TO_TEST,
    'n_pairs': len(pair_data),
    'n_bootstrap': N_BOOTSTRAP,
    # JSON object keys must be strings, so layer indices are stringified.
    'results': {str(k): v for k, v in results.items()},
    'phase_structure': {
        'early_mean': float(early_mean),
        'mid_mean': float(mid_mean),
        'late_mean': float(late_mean),
        'pattern': pattern
    }
}

fname = f'apertus_cross_validation_{_saved_at.strftime("%Y%m%d_%H%M%S")}.json'
with open(fname, 'w') as f:
    json.dump(save_data, f, indent=2)
print(f'Saved: {fname}')

# Browser download only exists inside Google Colab. Elsewhere the files are
# already on disk, so the missing module is reported instead of crashing.
try:
    from google.colab import files
except ImportError:
    print('google.colab not available - skipping browser download')
else:
    files.download(fname)
    files.download('apertus_layer_analysis.png')

## 11. Summary

**Key questions this notebook addresses:**
1. Does multilingual compression affect phase structure?
2. Is late-layer inversion universal across architectures?
3. How does Apertus compare to monolingual models?