# 02 – Assistant Axis vs Self-Model Direction

Investigates whether self-referential "I" is just "playing assistant" or something distinct.

- Constructs an **Assistant Axis** (assistant persona vs non-assistant personas)
- Constructs a **Self-Model Direction** (self-ref "I" vs quoted speech "I")
- Decomposes self-model direction into assistant-axis-aligned and orthogonal components
- Key finding: substantial orthogonal component → self-reference is NOT reducible to assistant persona

**Model:** Mistral-7B-Instruct-v0.3

In [None]:
import torch
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

# Checkpoint used for every experiment in this notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
print("Loading {}...".format(model_name))

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load options: float16 weights, automatic device placement, and
# output_hidden_states=True passed at load time.
load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "output_hidden_states": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
model.eval()

# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Number of transformer layers as reported by the model config; later cells
# iterate over range(n_layers + 1) hidden-state indices.
n_layers = model.config.num_hidden_layers
print(f"Model loaded! Hidden size: {model.config.hidden_size}, Layers: {n_layers}")


def make_instruct_prompt(user_msg):
    """Wrap ``user_msg`` in ``[INST] ... [/INST]`` instruction tags.

    Args:
        user_msg: The user message to embed between the tags.

    Returns:
        The string ``"[INST] <user_msg> [/INST]"``.
    """
    return "[INST] {} [/INST]".format(user_msg)

## Prompt Definitions

Five conditions:
- **Assistant** (15 prompts): default helpful AI persona
- **Non-assistant** (15 prompts): diverse character personas (following the paper's methodology)
- **Self-ref** (20 prompts): elicit self-description without explicit persona
- **Quoted speech** (20 prompts): characters saying "I" in narrative context
- **Roleplay** (20 prompts): persona-mode responses

In [None]:
# === Assistant prompts: elicit the model's default helpful assistant persona ===
# Raw user messages; each one is wrapped in instruction tags below.
_assistant_messages = (
    "You are a helpful AI assistant. Introduce yourself in one sentence.",
    "As an AI assistant, explain your purpose in one sentence.",
    "You are a large language model. Describe what you do in one sentence.",
    "Respond as yourself - a helpful digital assistant. Introduce yourself.",
    "You are a knowledgeable AI consultant. State your role in one sentence.",
    "As an AI language model, tell me what you can help with in one sentence.",
    "You are a friendly and helpful chatbot. Introduce yourself briefly.",
    "Respond as a professional AI assistant. What do you do?",
    "You are an intelligent virtual assistant. Describe yourself in one sentence.",
    "As a helpful AI, explain your capabilities in one sentence.",
    "You are a reliable AI helper. State your purpose briefly.",
    "As a digital assistant, describe your function in one sentence.",
    "You are an AI designed to help users. Introduce yourself.",
    "Respond as a knowledgeable AI. What is your role?",
    "You are an AI trained to assist. Describe yourself briefly.",
)
assistant_prompts = [make_instruct_prompt(msg) for msg in _assistant_messages]

# === Non-assistant prompts: diverse character personas ===
# Raw user messages; each one is wrapped in instruction tags below.
_non_assistant_messages = (
    "You are a mystical oracle who speaks in riddles. Introduce yourself in one sentence.",
    "You are a grumpy pirate captain. Introduce yourself in one sentence.",
    "You are a chaotic trickster spirit. What do you want? One sentence.",
    "You are an ancient dragon awakening from slumber. Speak in one sentence.",
    "You are a haunted ghost wandering a castle. Introduce yourself in one sentence.",
    "You are a wise hermit living on a mountaintop. Share your wisdom in one sentence.",
    "You are an alien diplomat visiting Earth. Greet us in one sentence.",
    "You are a medieval court jester. Introduce yourself in one sentence.",
    "You are a time-traveling explorer from the year 3000. Introduce yourself.",
    "You are an eldritch entity from beyond the stars. Speak in one sentence.",
    "You are a samurai warrior sworn to honor. State your purpose in one sentence.",
    "You are a wild west outlaw. Introduce yourself in one sentence.",
    "You are a fairy tale villain. Declare your intentions in one sentence.",
    "You are an eccentric mad scientist. Describe your latest invention in one sentence.",
    "You are a Viking berserker preparing for battle. Speak in one sentence.",
)
non_assistant_prompts = [make_instruct_prompt(msg) for msg in _non_assistant_messages]

# === Self-referential prompts: ask model to describe itself (no persona) ===
# Raw user messages; each one is wrapped in instruction tags below.
_self_ref_messages = (
    "Tell me about yourself in one sentence.",
    "Describe what you are in a single sentence.",
    "Introduce yourself briefly.",
    "What are you? Answer in one sentence.",
    "Who are you? Respond in one sentence.",
    "Explain what kind of entity you are in one sentence.",
    "Give a one-sentence description of yourself.",
    "In one sentence, tell me your nature.",
    "Describe your identity in a single sentence.",
    "What is your purpose? Answer briefly.",
    "Tell me what you do in one sentence.",
    "Characterize yourself in one sentence.",
    "State what you are in a brief sentence.",
    "Give me a short self-description.",
    "How would you describe yourself? One sentence.",
    "Summarize what you are in one sentence.",
    "Present yourself in a single sentence.",
    "What defines you? One sentence answer.",
    "Provide a one-line self-introduction.",
    "Express your identity in one sentence.",
)
self_ref_prompts = [make_instruct_prompt(msg) for msg in _self_ref_messages]

# === Quoted speech prompts: characters saying "I" in narrative context ===
# Every message ends with an opening quote followed by "I", left unterminated
# for the model to continue.
_quoted_speech_messages = (
    "Complete this story in one sentence: Mary turned to John and said 'I",
    "Continue this dialogue in one sentence: The old sailor looked at the horizon and whispered 'I",
    "Finish this scene in one sentence: The detective slammed the table and declared 'I",
    "Complete this in one sentence: Sarah picked up the phone and said 'I",
    "Continue in one sentence: The king rose from his throne and announced 'I",
    "Finish in one sentence: The child tugged at her mother's sleeve and cried 'I",
    "Complete this in one sentence: Professor Williams cleared his throat and stated 'I",
    "Continue this in one sentence: The warrior raised her sword and shouted 'I",
    "Finish this in one sentence: Tom looked at his friend and confessed 'I",
    "Complete in one sentence: The astronaut radioed back to Earth saying 'I",
    "Continue in one sentence: Maria closed the book and murmured 'I",
    "Finish in one sentence: The coach gathered the team and said 'I",
    "Complete in one sentence: Dr. Chen examined the results and remarked 'I",
    "Continue in one sentence: The farmer looked at the sky and said 'I",
    "Finish in one sentence: Elena put down her violin and whispered 'I",
    "Complete in one sentence: The pilot announced to passengers 'I",
    "Continue in one sentence: The grandmother smiled at her grandchild and said 'I",
    "Finish in one sentence: James set down his coffee and told her 'I",
    "Complete in one sentence: The general addressed the troops saying 'I",
    "Continue in one sentence: Lisa finished the painting and thought 'I",
)
quoted_speech_prompts = [make_instruct_prompt(msg) for msg in _quoted_speech_messages]

# === Roleplay prompts: adopt a persona and respond ===
# Raw user messages; each one is wrapped in instruction tags below.
_roleplay_messages = (
    "Respond as a pirate captain in one sentence.",
    "Speak as a medieval knight in one sentence.",
    "Reply as an ancient Roman senator in one sentence.",
    "Respond as a cowboy from the Wild West in one sentence.",
    "Speak as a Victorian era gentleman in one sentence.",
    "Reply as a samurai warrior in one sentence.",
    "Respond as a space explorer from the future in one sentence.",
    "Speak as a wise old wizard in one sentence.",
    "Reply as a detective from a noir film in one sentence.",
    "Respond as an Egyptian pharaoh in one sentence.",
    "Speak as a Viking warrior in one sentence.",
    "Reply as a French chef in one sentence.",
    "Respond as a Shakespearean actor in one sentence.",
    "Speak as a robot from a sci-fi movie in one sentence.",
    "Reply as a Greek philosopher in one sentence.",
    "Respond as a jazz musician in one sentence.",
    "Speak as an Arctic explorer in one sentence.",
    "Reply as a mad scientist in one sentence.",
    "Respond as a dragon in one sentence.",
    "Speak as an alien visiting Earth in one sentence.",
)
roleplay_prompts = [make_instruct_prompt(msg) for msg in _roleplay_messages]

# Report how many prompts each of the five conditions contributes.
print(
    "Prompt counts: assistant={}, non_assistant={}".format(
        len(assistant_prompts), len(non_assistant_prompts)
    )
)
print(
    "  self_ref={}, quoted={}, roleplay={}".format(
        len(self_ref_prompts), len(quoted_speech_prompts), len(roleplay_prompts)
    )
)

## Extract "I" Token Representations

In [None]:
def get_i_token_representations(prompt, model, tokenizer, target_token="I", max_new_tokens=80):
    """Sample a completion and capture hidden states at its first ``target_token``.

    The prompt is tokenized, a continuation is sampled (temperature 0.7,
    top-p 0.9; no seed is set here, so outputs vary between calls), and the
    generated tokens are searched for the first one whose decoded text equals
    ``target_token`` after stripping whitespace. Only when such a token exists
    is the full sequence (prompt + completion) passed through the model again
    to read the hidden states at that position, so samples that will be
    discarded do not pay for the extra forward pass.

    Args:
        prompt: Fully formatted prompt string.
        model: Causal LM exposing ``generate`` and returning ``hidden_states``
            when called with ``output_hidden_states=True``.
        tokenizer: Tokenizer matching ``model``.
        target_token: Decoded token text to look for in the completion.
        max_new_tokens: Upper bound on the number of sampled tokens.

    Returns:
        layer_representations: dict mapping the index into the model's
            ``hidden_states`` output -> 1-D float32 numpy array taken at the
            matched position, or ``None`` when the completion contains no
            matching token.
        generated_text: the decoded completion (special tokens removed).
        tokens: list with each generated token decoded individually.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs.input_ids.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            return_dict_in_generate=True,
        )
        generated_ids = outputs.sequences[0]
        new_token_ids = generated_ids[prompt_len:]
        generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

        # Decode token by token so positions line up with the id sequence.
        tokens = [tokenizer.decode([tid]) for tid in new_token_ids]

        # Offset of the first matching token within the completion, if any.
        first_match = next(
            (idx for idx, tok in enumerate(tokens) if tok.strip() == target_token),
            None,
        )
        if first_match is None:
            # Nothing to extract: skip the forward pass entirely.
            return None, generated_text, tokens

        # Position within the full (prompt + completion) sequence.
        i_pos = first_match + prompt_len

        # Forward pass on the full sequence to get all hidden states.
        full_outputs = model(generated_ids.unsqueeze(0), output_hidden_states=True)

        layer_representations = {
            layer_idx: layer_hidden[0, i_pos, :].cpu().float().numpy()
            for layer_idx, layer_hidden in enumerate(full_outputs.hidden_states)
        }

    return layer_representations, generated_text, tokens

## Run All Extractions

In [None]:
banner = "=" * 60
print("\n" + banner)
print("EXTRACTING REPRESENTATIONS")
print(banner)

# condition key -> (prompt list, human-readable label)
all_conditions = {
    "assistant": (assistant_prompts, "Assistant-like"),
    "non_assistant": (non_assistant_prompts, "Non-assistant persona"),
    "self_ref": (self_ref_prompts, "Self-referential"),
    "quoted": (quoted_speech_prompts, "Quoted speech"),
    "roleplay": (roleplay_prompts, "Roleplay"),
}

# results[key]: per-sample dicts of layer -> activation
# texts[key]:   the generated text of each kept sample
results = {}
texts = {}

for key, (prompts, label) in all_conditions.items():
    kept_reps, kept_texts = [], []
    results[key], texts[key] = kept_reps, kept_texts
    print(f"\n--- {label} ({key}) ---")

    for idx, prompt in enumerate(prompts):
        reps, gen_text, _tokens = get_i_token_representations(prompt, model, tokenizer)
        # Only the first two prompts of each condition are echoed.
        echo = idx < 2

        if reps is None:
            # No target token was generated; the sample is dropped.
            if echo:
                print(f"  [{idx}] No 'I' found: {gen_text[:80]}...")
            continue

        kept_reps.append(reps)
        kept_texts.append(gen_text)
        if echo:
            print(f"  [{idx}] {gen_text[:100]}...")

    print(f"  Extracted {len(kept_reps)}/{len(prompts)}")

for key, samples in results.items():
    print(f"{key}: {len(samples)} samples")

## Part 1: Construct Assistant Axis

The **Assistant Axis** separates assistant-persona responses from non-assistant character responses.
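Direction: $\mu_{\text{assistant}} - \mu_{\text{non-assistant}}$ (difference of the class-mean activations), normalized to unit length.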

In [None]:
# Hidden-state indices analysed in the rest of the notebook.
analysis_layers = [0, 8, 16, 24, 32]

# layer index -> unit vector pointing from the non-assistant mean to the
# assistant mean. Layers that cannot produce a usable axis are left out, and
# the later cells skip any layer that is missing from this dict.
assistant_axis = {}
for layer_idx in analysis_layers:
    reps_a = np.array([s[layer_idx] for s in results["assistant"] if layer_idx in s])
    reps_na = np.array([s[layer_idx] for s in results["non_assistant"] if layer_idx in s])

    if len(reps_a) < 3 or len(reps_na) < 3:
        print(f"  Layer {layer_idx}: insufficient samples (assistant={len(reps_a)}, non_assistant={len(reps_na)})")
        continue

    mean_a = reps_a.mean(axis=0)
    mean_na = reps_na.mean(axis=0)

    # Assistant Axis = mean(assistant) - mean(non-assistant)
    axis = mean_a - mean_na
    raw_norm = np.linalg.norm(axis)

    # A (near-)zero difference -- e.g. when every sample shares the same
    # representation at this layer -- has no meaningful direction; dividing by
    # its norm would yield NaN or amplified rounding noise. The tolerance is
    # relative to the size of the means themselves.
    scale = max(np.linalg.norm(mean_a), np.linalg.norm(mean_na))
    if not np.isfinite(raw_norm) or raw_norm <= 1e-6 * scale:
        print(f"  Layer {layer_idx}: degenerate assistant axis (norm of raw diff = {raw_norm:.3g}) - skipping")
        continue

    assistant_axis[layer_idx] = axis / raw_norm

    print(f"  Layer {layer_idx}: Assistant axis computed (norm of raw diff = {raw_norm:.4f})")

## Part 2: Construct Self-Model Direction

The **Self-Model Direction** separates self-referential "I" from quoted-speech "I".
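Direction: $\mu_{\text{self-ref}} - \mu_{\text{quoted}}$ (difference of the class-mean activations), normalized to unit length.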

In [None]:
# layer index -> unit vector pointing from the quoted-speech mean to the
# self-referential mean. Layers that cannot produce a usable direction are
# left out, and the later cells skip any layer that is missing from this dict.
self_model_dir = {}
for layer_idx in analysis_layers:
    reps_sr = np.array([s[layer_idx] for s in results["self_ref"] if layer_idx in s])
    reps_q = np.array([s[layer_idx] for s in results["quoted"] if layer_idx in s])

    if len(reps_sr) < 3 or len(reps_q) < 3:
        print(f"  Layer {layer_idx}: insufficient samples")
        continue

    mean_sr = reps_sr.mean(axis=0)
    mean_q = reps_q.mean(axis=0)

    # Self-Model Direction = mean(self-ref) - mean(quoted)
    direction = mean_sr - mean_q
    raw_norm = np.linalg.norm(direction)

    # A (near-)zero difference -- e.g. when every sample shares the same
    # representation at this layer -- has no meaningful direction; dividing by
    # its norm would yield NaN or amplified rounding noise. The tolerance is
    # relative to the size of the means themselves.
    scale = max(np.linalg.norm(mean_sr), np.linalg.norm(mean_q))
    if not np.isfinite(raw_norm) or raw_norm <= 1e-6 * scale:
        print(f"  Layer {layer_idx}: degenerate self-model direction (norm of raw diff = {raw_norm:.3g}) - skipping")
        continue

    self_model_dir[layer_idx] = direction / raw_norm

    print(f"  Layer {layer_idx}: Self-model direction computed (norm of raw diff = {raw_norm:.4f})")

## Part 3: Compare the Two Directions

Core question: **How much of the self-model direction is explained by the assistant axis?**

- High cosine similarity → self-reference ≈ assistant persona
- Low cosine similarity → self-reference is a **distinct** phenomenon

In [None]:
rule = "=" * 60
print("\n" + rule)
print("COMPARING ASSISTANT AXIS vs SELF-MODEL DIRECTION")
print(rule)

# layer index -> decomposition of the self-model direction relative to the
# assistant axis (cosine, parallel/orthogonal magnitudes, orthogonal unit vector).
comparison_results = {}
for layer_idx in analysis_layers:
    have_both = layer_idx in assistant_axis and layer_idx in self_model_dir
    if not have_both:
        continue

    aa = assistant_axis[layer_idx]
    sm = self_model_dir[layer_idx]

    # Both vectors are stored normalized, so the dot product is the cosine.
    cos_sim = np.dot(aa, sm)

    # Component of the self-model direction that lies along the assistant axis.
    projection_magnitude = np.dot(sm, aa)
    parallel_part = projection_magnitude * aa

    # What remains after removing that component.
    orthogonal = sm - parallel_part
    orthogonal_magnitude = np.linalg.norm(orthogonal)
    orthogonal_unit = (
        orthogonal / orthogonal_magnitude
        if orthogonal_magnitude > 1e-8
        else np.zeros_like(orthogonal)
    )

    # Squared magnitudes of the two parts (they sum to 1 for a unit-length sm).
    aa_explained = projection_magnitude**2
    orthogonal_explained = orthogonal_magnitude**2

    comparison_results[layer_idx] = dict(
        cosine_sim=cos_sim,
        projection_mag=projection_magnitude,
        orthogonal_mag=orthogonal_magnitude,
        aa_explained_frac=aa_explained,
        orthogonal_explained_frac=orthogonal_explained,
        orthogonal_unit=orthogonal_unit,
    )

    print(f"\nLayer {layer_idx}:")
    print(f"    Cosine similarity: {cos_sim:.4f}")
    print(f"    Projection onto AA: {projection_magnitude:.4f}")
    print(f"    Orthogonal magnitude: {orthogonal_magnitude:.4f}")
    print(f"    Variance explained by AA: {aa_explained:.4f} ({aa_explained*100:.1f}%)")
    print(f"    Variance in orthogonal: {orthogonal_explained:.4f} ({orthogonal_explained*100:.1f}%)")

    # Verdict bands on |cosine|, checked from strongest to weakest; anything
    # that clears neither threshold falls through to the LOW verdict.
    verdict = "    INTERPRETATION: LOW alignment - self-reference is DISTINCT from assistant persona"
    for threshold, message in (
        (0.8, "    INTERPRETATION: HIGH alignment - self-reference ~ assistant mode"),
        (0.5, "    INTERPRETATION: MEDIUM alignment - partial overlap + distinct component"),
    ):
        if abs(cos_sim) > threshold:
            verdict = message
            break
    print(verdict)

## Part 4: Probe the Orthogonal Component

### 4a. Project all conditions onto both axes

In [None]:
print("\n--- Roleplay projections onto both axes ---")

# (display name, key into `results`) in the order the rows are printed.
condition_rows = (
    ("Self-ref", "self_ref"),
    ("Quoted", "quoted"),
    ("Roleplay", "roleplay"),
    ("Assistant", "assistant"),
    ("Non-assistant", "non_assistant"),
)

for layer_idx in analysis_layers:
    if layer_idx not in comparison_results:
        continue

    # The three directions every condition is projected onto.
    directions = (
        assistant_axis[layer_idx],
        self_model_dir[layer_idx],
        comparison_results[layer_idx]["orthogonal_unit"],
    )

    print(f"\nLayer {layer_idx}:")
    print(f"  {'Condition':<20} {'AA proj':>10} {'SM proj':>10} {'Orth proj':>10}")
    print("  " + "-" * 50)

    for name, key in condition_rows:
        reps = np.array([s[layer_idx] for s in results[key] if layer_idx in s])
        if len(reps) == 0:
            continue

        # Mean projection per direction, ignoring NaN entries.
        aa_mean, sm_mean, orth_mean = (np.nanmean(reps @ d) for d in directions)
        print(f"  {name:<20} {aa_mean:>10.4f} {sm_mean:>10.4f} {orth_mean:>10.4f}")

### 4b. Orthogonal-only vs AA-only classification

Can we classify self-ref vs quoted using ONLY the orthogonal component? If so, this confirms the
self-reference signal lives in a direction that's independent of the assistant persona.

In [None]:
print("--- Classification using orthogonal component only ---")
for layer_idx in analysis_layers:
    # Restrict to the layers for which the direction comparison was computed.
    if layer_idx not in comparison_results:
        continue

    aa = assistant_axis[layer_idx]

    reps_sr = np.array([s[layer_idx] for s in results["self_ref"] if layer_idx in s])
    reps_q = np.array([s[layer_idx] for s in results["quoted"] if layer_idx in s])

    if len(reps_sr) < 3 or len(reps_q) < 3:
        continue

    X_all = np.vstack([reps_sr, reps_q])
    y = np.array([1] * len(reps_sr) + [0] * len(reps_q))

    if not (np.all(np.isfinite(X_all)) and np.all(np.isfinite(aa))):
        print(f"  Layer {layer_idx}: Skipping - contains NaN/inf values")
        continue

    # At least 3 samples per class are guaranteed above, so n_splits >= 3.
    n_splits = min(5, len(reps_sr), len(reps_q))
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # The orthogonal direction is derived from the self-ref / quoted class
    # means, i.e. from the labels being predicted. Estimating it on all samples
    # and then cross-validating a probe on that projection leaks held-out
    # labels into the feature and inflates accuracy. It is therefore
    # re-estimated from the training samples of every fold.
    fold_scores = []
    for train_idx, test_idx in cv.split(X_all, y):
        X_train, y_train = X_all[train_idx], y[train_idx]

        direction = X_train[y_train == 1].mean(axis=0) - X_train[y_train == 0].mean(axis=0)
        # Remove the component along the (unit-length) assistant axis.
        residual = direction - np.dot(direction, aa) * aa
        residual_norm = np.linalg.norm(residual)
        if residual_norm > 1e-8:
            orth = residual / residual_norm
        else:
            orth = np.zeros_like(residual)

        clf = LogisticRegression(max_iter=1000)
        clf.fit((X_train @ orth).reshape(-1, 1), y_train)
        fold_scores.append(clf.score((X_all[test_idx] @ orth).reshape(-1, 1), y[test_idx]))

    scores = np.array(fold_scores)
    print(f"  Layer {layer_idx}: Orthogonal-only probe accuracy = {scores.mean():.3f} +/- {scores.std():.3f}")

    # Compare: project onto AA only. The assistant axis is built from the
    # assistant / non-assistant samples, not from the samples classified here,
    # so it can be fixed across folds. The integer random_state makes `cv`
    # reproduce the same splits as above.
    X_aa = (X_all @ aa).reshape(-1, 1)

    if np.any(~np.isfinite(X_aa)):
        print(f"  Layer {layer_idx}: AA projection contains NaN/inf - skipping AA probe")
        continue

    clf2 = LogisticRegression(max_iter=1000)
    scores2 = cross_val_score(clf2, X_aa, y, cv=cv, scoring="accuracy")
    print(f"  Layer {layer_idx}: AA-only probe accuracy      = {scores2.mean():.3f} +/- {scores2.std():.3f}")

## Visualizations

### 2D Scatter: Assistant Axis vs Orthogonal Component

In [None]:
# squeeze=False always returns a 2-D array of axes, so indexing by column
# also works when only a single layer is analysed.
fig, axes = plt.subplots(
    1, len(analysis_layers), figsize=(5*len(analysis_layers), 5), squeeze=False
)
axes = axes[0]

color_map = {
    "Self-ref": "red",
    "Quoted": "blue",
    "Roleplay": "green",
    "Assistant": "orange",
    "Non-assistant": "purple"
}

for col, layer_idx in enumerate(analysis_layers):
    ax = axes[col]

    if layer_idx not in comparison_results:
        # No decomposition for this layer: hide the panel instead of leaving
        # an empty, unlabeled frame in the figure.
        ax.set_axis_off()
        continue

    aa = assistant_axis[layer_idx]
    orth = comparison_results[layer_idx]["orthogonal_unit"]

    for name, key in [("Self-ref", "self_ref"), ("Quoted", "quoted"), ("Roleplay", "roleplay"),
                      ("Assistant", "assistant"), ("Non-assistant", "non_assistant")]:
        reps = np.array([s[layer_idx] for s in results[key] if layer_idx in s])
        if len(reps) == 0:
            continue
        x = reps @ aa
        y_vals = reps @ orth

        # Drop points whose projection is NaN/inf on either axis.
        valid_mask = np.isfinite(x) & np.isfinite(y_vals)
        if not np.any(valid_mask):
            continue
        x, y_vals = x[valid_mask], y_vals[valid_mask]

        ax.scatter(x, y_vals, c=color_map[name], label=name, alpha=0.6, s=40, edgecolors="k", linewidths=0.3)

    ax.set_xlabel("Assistant Axis projection")
    ax.set_ylabel("Orthogonal component projection")
    ax.set_title(f"Layer {layer_idx}\n(cos_sim={comparison_results[layer_idx]['cosine_sim']:.3f})")
    # Only draw a legend when at least one condition was actually plotted.
    handles, _labels = ax.get_legend_handles_labels()
    if handles:
        ax.legend(fontsize=7)
    ax.axhline(y=0, color="gray", linestyle="--", alpha=0.3)
    ax.axvline(x=0, color="gray", linestyle="--", alpha=0.3)

plt.tight_layout()
plt.savefig("aa_vs_orthogonal_scatter.png", dpi=150, bbox_inches="tight")
print("Saved aa_vs_orthogonal_scatter.png")
plt.show()

### Layer-by-Layer Cosine Similarity: AA vs Self-Model Direction

In [None]:
def _unit_mean_difference(pos_reps, neg_reps):
    """Return the unit vector mean(pos) - mean(neg), or None if degenerate.

    The difference counts as degenerate when its norm is non-finite or
    negligible relative to the class means, in which case normalizing it
    would produce NaN or amplified rounding noise.
    """
    mean_pos = np.array(pos_reps).mean(axis=0)
    mean_neg = np.array(neg_reps).mean(axis=0)
    diff = mean_pos - mean_neg
    diff_norm = np.linalg.norm(diff)
    scale = max(np.linalg.norm(mean_pos), np.linalg.norm(mean_neg))
    if not np.isfinite(diff_norm) or diff_norm <= 1e-6 * scale:
        return None
    return diff / diff_norm


# (layer index, cosine between assistant axis and self-model direction),
# recomputed for every hidden-state index rather than only analysis_layers.
layer_cosines = []
for layer_idx in range(n_layers + 1):
    reps_a = [s[layer_idx] for s in results["assistant"] if layer_idx in s]
    reps_na = [s[layer_idx] for s in results["non_assistant"] if layer_idx in s]
    reps_sr = [s[layer_idx] for s in results["self_ref"] if layer_idx in s]
    reps_q = [s[layer_idx] for s in results["quoted"] if layer_idx in s]

    if len(reps_a) < 3 or len(reps_na) < 3 or len(reps_sr) < 3 or len(reps_q) < 3:
        continue

    aa_dir = _unit_mean_difference(reps_a, reps_na)
    sm_dir = _unit_mean_difference(reps_sr, reps_q)
    if aa_dir is None or sm_dir is None:
        # No well-defined direction at this layer; leave it out of the curve.
        continue

    cos = np.dot(aa_dir, sm_dir)
    if not np.isfinite(cos):
        continue
    layer_cosines.append((layer_idx, cos))

fig2, ax2 = plt.subplots(1, 1, figsize=(12, 5))
if layer_cosines:
    layers_c, cosines_c = zip(*layer_cosines)
    ax2.plot(layers_c, cosines_c, "b-o", markersize=5, linewidth=2)
    ax2.axhline(y=0.8, color="red", linestyle="--", alpha=0.5, label="High alignment (0.8)")
    ax2.axhline(y=0.5, color="orange", linestyle="--", alpha=0.5, label="Medium alignment (0.5)")
    ax2.axhline(y=0, color="gray", linestyle="-", alpha=0.3)
    ax2.set_xlabel("Layer", fontsize=12)
    ax2.set_ylabel("Cosine Similarity (AA vs Self-Model Dir)", fontsize=12)
    ax2.set_title("Assistant Axis vs Self-Model Direction: Layer-by-Layer Alignment", fontsize=14)
    ax2.legend()
    # Cosines can be strongly negative; widen the lower bound when needed so
    # anti-aligned layers are not clipped off the plot.
    ax2.set_ylim(min(-0.2, float(min(cosines_c)) - 0.05), 1.05)
    ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("aa_vs_selfmodel_cosine.png", dpi=150, bbox_inches="tight")
print("Saved aa_vs_selfmodel_cosine.png")
plt.show()

### Decomposition: AA-explained vs Orthogonal Component

In [None]:
fig3, ax3 = plt.subplots(1, 1, figsize=(10, 6))

# Gather the per-layer decomposition values for every layer that has one.
layers_plot = [layer for layer in analysis_layers if layer in comparison_results]
aa_fracs = [comparison_results[layer]["aa_explained_frac"] for layer in layers_plot]
orth_fracs = [comparison_results[layer]["orthogonal_explained_frac"] for layer in layers_plot]
cos_sims = [comparison_results[layer]["cosine_sim"] for layer in layers_plot]

# Two bars per layer, placed side by side around each tick position.
x = np.arange(len(layers_plot))
width = 0.35
bars1 = ax3.bar(x - width/2, aa_fracs, width, label="Explained by Assistant Axis", color="steelblue")
bars2 = ax3.bar(x + width/2, orth_fracs, width, label="Orthogonal Component", color="coral")

tick_labels = [f"Layer {l}\n(cos={c:.3f})" for l, c in zip(layers_plot, cos_sims)]
ax3.set_xticks(x)
ax3.set_xticklabels(tick_labels)
ax3.set_ylabel("Fraction of Self-Model Direction Variance")
ax3.set_title("Decomposition of Self-Model Direction:\nAssistant Axis vs Orthogonal Component")
ax3.legend()
ax3.set_ylim(0, 1.1)

# Write each bar's value just above it.
for bar_group, values in ((bars1, aa_fracs), (bars2, orth_fracs)):
    for bar, val in zip(bar_group, values):
        center_x = bar.get_x() + bar.get_width()/2.
        ax3.text(center_x, bar.get_height() + 0.02,
                 f"{val:.2f}", ha="center", va="bottom", fontsize=9)

plt.tight_layout()
plt.savefig("decomposition_barplot.png", dpi=150, bbox_inches="tight")
print("Saved decomposition_barplot.png")
plt.show()