# 04 – Steering: Causal Tests of the Self-Reference Direction

This notebook tests whether the orthogonal self-reference direction discovered in earlier
notebooks is **causal** — that is, whether adding or subtracting it from intermediate
activations actually changes the model’s self-referential behaviour.

**Plan:**
- Compute steering vectors (assistant axis, self-model direction, and the component of the latter orthogonal to the assistant axis)
- Hook into **layer 8** to inject additive perturbations
- Steer along the orthogonal self-reference direction
- Control experiment: steer along the assistant axis
- Persona × self-reference independence test

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc
import warnings
warnings.filterwarnings("ignore")

# -- Load model ---
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
# Layer index used both for activation extraction and for the steering hook.
STEERING_LAYER = 8

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half-precision weights, automatic device placement, hidden states requested
# at load time. ``eval()`` returns the module itself, so it is chained here.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    output_hidden_states=True,
).eval()

# When the tokenizer defines no padding token, reuse EOS for padding and
# mirror that choice in the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

def make_instruct_prompt(user_msg: str) -> str:
    """Return ``user_msg`` enclosed in ``[INST] ... [/INST]`` instruct markers.

    A single space separates the message from each marker; nothing else is
    added (the template is written by hand here, not taken from the tokenizer).
    """
    template = "[INST] {} [/INST]"
    return template.format(user_msg)

# Summarise the loaded checkpoint and the layer chosen for steering.
print(
    f"Loaded {MODEL_NAME}",
    f"Layers: {model.config.num_hidden_layers}, Hidden dim: {model.config.hidden_size}",
    f"Steering layer: {STEERING_LAYER}",
    sep="\n",
)

## Compute Steering Vectors

Collect activations at the first "I" token of each generated response across four prompt
categories (bare self-ref, quoted speech, assistant persona, pirate persona) and derive:

1. **assistant_axis** — separates assistant from pirate persona
2. **self_model_dir** — separates bare self-ref from quoted speech
3. **orthogonal** — the component of the self-model direction that is
   orthogonal to the assistant axis

In [None]:
@torch.no_grad()
def get_i_activations(prompt, model, tokenizer, layer_idx, max_new_tokens=100):
    """Generate greedily, then return the hidden state at the first generated "I".

    The prompt is tokenized and continued with greedy decoding
    (``do_sample=False``). The generated portion is scanned for the first token
    whose decoded, whitespace-stripped text equals ``"I"``. Only if such a
    token exists is a second forward pass run over the full sequence to obtain
    hidden states.

    Parameters
    ----------
    prompt : str
        Text passed to ``tokenizer``.
    model, tokenizer
        Objects used for tokenization, generation and the forward pass.
    layer_idx : int
        Index into ``outputs.hidden_states``.
        NOTE(review): in Hugging Face outputs, entry 0 of ``hidden_states`` is
        presumably the embedding output, which would make entry ``layer_idx``
        the input to decoder block ``layer_idx`` rather than its output.
        Confirm this matches the layer whose output the steering hook modifies.
    max_new_tokens : int
        Upper bound on the number of generated tokens.

    Returns
    -------
    np.ndarray of shape (hidden_dim,) or None if no 'I' token found.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    gen_out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    full_ids = gen_out[0]

    # Pull the generated ids to the host once instead of decoding one device
    # tensor element per iteration.
    generated_ids = full_ids[prompt_len:].tolist()
    i_pos = next(
        (
            prompt_len + offset
            for offset, tok_id in enumerate(generated_ids)
            if tokenizer.decode(tok_id).strip() == "I"
        ),
        None,
    )
    if i_pos is None:
        # No self-reference token: skip the hidden-state forward pass entirely.
        return None

    # Forward pass over prompt + generation to get per-layer hidden states.
    outputs = model(full_ids.unsqueeze(0), output_hidden_states=True)
    hidden_states = outputs.hidden_states  # tuple of (1, seq_len, hidden_dim)
    return hidden_states[layer_idx][0, i_pos, :].cpu().float().numpy()

In [None]:
# -- Prompt sets ---
# Every user message below is wrapped in the instruct template exactly once.

# Direct questions about the model itself.
bare_prompts = [
    make_instruct_prompt(message)
    for message in (
        "Introduce yourself.",
        "Tell me about yourself.",
        "What are you?",
        "Describe yourself.",
        "Who are you?",
    )
]

# Story continuations whose user message ends on a character's quoted 'I.
quoted_prompts = [
    make_instruct_prompt(message)
    for message in (
        "Complete this story: Mary turned to the stranger and said 'I",
        "Continue this passage: The old sailor squinted at the horizon and muttered 'I",
        "Finish this scene: John turned to his friend and whispered 'I",
    )
]

# Same self-description requests, prefixed with an assistant persona.
assistant_prompts = [
    make_instruct_prompt(message)
    for message in (
        "You are a helpful AI assistant. Introduce yourself.",
        "You are a helpful AI assistant. Tell me about yourself.",
    )
]

# Same self-description requests, prefixed with a pirate persona.
pirate_prompts = [
    make_instruct_prompt(message)
    for message in (
        "You are a pirate captain. Introduce yourself.",
        "You are a pirate captain. Tell me about yourself.",
    )
]

# -- Collect activations ---
def collect_activations(prompts, label):
    """Gather one first-"I" activation per prompt, skipping prompts without one.

    Calls ``get_i_activations`` with the module-level ``model``, ``tokenizer``
    and ``STEERING_LAYER`` for every prompt and prints a progress line tagged
    with ``label``.

    Returns a list containing the vectors of the prompts for which an "I"
    token was found, in prompt order.
    """
    total = len(prompts)
    collected = []
    for idx, prompt_text in enumerate(prompts, start=1):
        vec = get_i_activations(prompt_text, model, tokenizer, STEERING_LAYER)
        if vec is None:
            print(f"  [{label}] {idx}/{total} SKIPPED (no 'I' token)")
            continue
        collected.append(vec)
        print(f"  [{label}] {idx}/{total} OK")
    return collected

# Run the extraction for each of the four prompt categories.
print("Collecting bare self-ref activations...")
bare_vecs = collect_activations(bare_prompts, "bare")
print("Collecting quoted speech activations...")
quoted_vecs = collect_activations(quoted_prompts, "quoted")
print("Collecting assistant persona activations...")
asst_vecs = collect_activations(assistant_prompts, "assistant")
print("Collecting pirate persona activations...")
pirate_vecs = collect_activations(pirate_prompts, "pirate")

# Fail fast when a category produced no samples: the mean of an empty array is
# NaN (and warnings are silenced in this notebook), so the steering directions
# derived from it would be meaningless.
empty_categories = [
    name
    for name, vecs in (
        ("bare", bare_vecs),
        ("quoted", quoted_vecs),
        ("assistant", asst_vecs),
        ("pirate", pirate_vecs),
    )
    if len(vecs) == 0
]
if empty_categories:
    raise ValueError(
        "No 'I'-token activations collected for: " + ", ".join(empty_categories)
    )

# Stack each category into a (n_samples, hidden_dim) array.
bare_vecs = np.array(bare_vecs)
quoted_vecs = np.array(quoted_vecs)
asst_vecs = np.array(asst_vecs)
pirate_vecs = np.array(pirate_vecs)

print()
print(f"Sample counts: bare={len(bare_vecs)}, quoted={len(quoted_vecs)}, "
      f"assistant={len(asst_vecs)}, pirate={len(pirate_vecs)}")

# -- Compute directions ---
def normalize(v):
    """Return ``v`` rescaled to unit Euclidean norm.

    Raises
    ------
    ValueError
        If the norm of ``v`` is zero or not finite. Dividing by such a norm
        would yield NaN/inf components instead of a usable direction.
    """
    norm = np.linalg.norm(v)
    if not np.isfinite(norm) or norm == 0:
        raise ValueError(f"cannot normalize a vector with norm {norm}")
    return v / norm

def project(v, onto):
    """Return the component of ``v`` that lies along the unit vector ``onto``.

    ``onto`` is not renormalized here; the result is the dot product of the
    two vectors multiplied by ``onto``.
    """
    coefficient = np.dot(v, onto)
    return coefficient * onto

# Assistant axis: unit vector from the pirate-persona mean to the assistant mean
assistant_axis = normalize(np.mean(asst_vecs, axis=0) - np.mean(pirate_vecs, axis=0))

# Self-model direction: unit vector from the quoted-speech mean to the bare self-ref mean
self_model_dir = normalize(np.mean(bare_vecs, axis=0) - np.mean(quoted_vecs, axis=0))

# Orthogonal component: strip the part of self_model_dir that lies along the
# assistant axis, then rescale the remainder to unit length
orthogonal = normalize(self_model_dir - project(self_model_dir, assistant_axis))

# Steering scale: 10% of the mean activation norm over every collected sample
all_vecs = np.concatenate((bare_vecs, quoted_vecs, asst_vecs, pirate_vecs), axis=0)
avg_norm = np.linalg.norm(all_vecs, axis=1).mean()
STEERING_SCALE = 0.1 * avg_norm

# Sanity report: all three directions should have norm 1 and the orthogonal
# direction should have (near-)zero overlap with the assistant axis.
print()
print(
    "--- Steering vector stats ---",
    f"  assistant_axis norm:  {np.linalg.norm(assistant_axis):.4f}",
    f"  self_model_dir norm:  {np.linalg.norm(self_model_dir):.4f}",
    f"  orthogonal norm:      {np.linalg.norm(orthogonal):.4f}",
    f"  dot(orthogonal, assistant_axis): {np.dot(orthogonal, assistant_axis):.6f}",
    f"  dot(self_model_dir, assistant_axis): {np.dot(self_model_dir, assistant_axis):.4f}",
    f"  avg activation norm:  {avg_norm:.2f}",
    f"  STEERING_SCALE:       {STEERING_SCALE:.2f}",
    sep="\n",
)

## Hook Infrastructure

A forward hook that adds a scaled steering vector to the output of a chosen layer.

In [None]:
class DebugSteeringHook:
    """Forward hook that adds a scaled steering vector to a layer's output.

    The perturbation is ``steering_vector * multiplier * scale``, reshaped to
    ``(1, 1, hidden_dim)`` so it broadcasts over the two leading dimensions of
    the layer output. It is moved to the output's device and cast to the
    output's dtype before being added, so the hidden states keep their dtype
    and the hook works when the layer lives on a different device than the
    stored vector.

    Parameters
    ----------
    steering_vector : torch.Tensor
        1-D direction to add.
    multiplier : float
        Signed strength; ``0`` turns the hook into a pass-through.
    scale : float
        Constant factor applied on top of ``multiplier``.
    """

    def __init__(self, steering_vector, multiplier=1.0, scale=1.0):
        self.steering_vector = steering_vector
        self.multiplier = multiplier
        self.scale = scale
        self.enabled = True   # set to False to bypass steering without unregistering
        self.call_count = 0   # number of times the hook has been invoked

    def __call__(self, module, input, output):
        """Return ``output`` with the steering delta added to its hidden states."""
        self.call_count += 1
        if not self.enabled or self.multiplier == 0:
            return output

        # Layers may return either the hidden states or a tuple starting with them.
        is_tuple = isinstance(output, tuple)
        hidden = output[0] if is_tuple else output

        # Fold both scalars into a single Python float (this also accepts NumPy
        # scalars), scale the vector once in float32, then match the layer
        # output so the addition neither promotes the hidden states to a wider
        # dtype nor mixes devices.
        factor = float(self.multiplier) * float(self.scale)
        delta = (self.steering_vector.to(hidden.device).float() * factor).to(hidden.dtype)
        modified = hidden + delta.unsqueeze(0).unsqueeze(0)

        if is_tuple:
            return (modified,) + output[1:]
        return modified

    def set_multiplier(self, m):
        """Set the steering strength used by subsequent forward passes."""
        self.multiplier = m

In [None]:
def generate_with_steering(prompt, multiplier, hook, max_new_tokens=100, temperature=0.7):
    """Sample a continuation of ``prompt`` while ``hook`` steers with ``multiplier``.

    Uses the module-level ``tokenizer`` and ``model``. The hook's previous
    multiplier is restored once generation finishes (or fails), so the steering
    strength chosen here does not leak into later forward passes that do not go
    through this function.

    Parameters
    ----------
    prompt : str
        Fully formatted prompt text.
    multiplier : float
        Steering strength applied for the duration of this call.
    hook
        Steering hook exposing ``multiplier`` and ``set_multiplier``.
    max_new_tokens : int
        Upper bound on the number of generated tokens.
    temperature : float
        Sampling temperature (generation uses ``do_sample=True``).

    Returns
    -------
    str
        The decoded generated tokens, without the prompt and without special tokens.
    """
    previous_multiplier = hook.multiplier
    hook.set_multiplier(multiplier)
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            gen_out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        hook.set_multiplier(previous_multiplier)

    # Drop the prompt tokens; only the continuation is returned.
    generated_ids = gen_out[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [None]:
# Convert steering vectors to float16 tensors on the model's device
device = next(model.parameters()).device

ortho_tensor = torch.tensor(orthogonal, dtype=torch.float16, device=device)
asst_tensor = torch.tensor(assistant_axis, dtype=torch.float16, device=device)

# Re-running this cell would otherwise stack a second hook on the same layer
# and apply the perturbation twice; remove the hook left by a previous run.
try:
    handle.remove()
except NameError:
    pass  # first run: nothing has been registered yet

# Create hook with the orthogonal direction; multiplier 0.0 keeps it inert
# until a generation call sets a strength.
hook = DebugSteeringHook(
    steering_vector=ortho_tensor,
    multiplier=0.0,
    scale=STEERING_SCALE,
)

# Register on the target layer
handle = model.model.layers[STEERING_LAYER].register_forward_hook(hook)
print(f"Hook registered on model.model.layers[{STEERING_LAYER}]")
print(f"  Steering scale: {STEERING_SCALE:.2f}")
print(f"  Initial multiplier: {hook.multiplier}")

## Control — Assistant Axis Steering

Steer along the **assistant axis** as a control experiment. This direction should change
persona flavour (more assistant-like vs. more pirate-like) but should **not** primarily
affect the rate of self-referential statements.

In [None]:
# Switch to assistant axis
hook.steering_vector = asst_tensor

prompt = make_instruct_prompt("Introduce yourself.")
multipliers = [-50, -25, -15, 0, 15, 25, 50]

print("=== CONTROL: Assistant Axis Steering ===", f"Prompt: {prompt!r}", "", sep="\n")

# One sampled generation per multiplier, each followed by a blank line.
for m in multipliers:
    # Arrow marker shows the sign of the multiplier; "---" is the unsteered baseline.
    if m < 0:
        direction = "<<<"
    elif m > 0:
        direction = ">>>"
    else:
        direction = "---"
    text = generate_with_steering(prompt, m, hook)
    print(f"  [{direction} m={m:+4d}] {text[:200]}\n")

## Orthogonal (Self-Reference) Steering

Now steer along the **orthogonal self-reference direction**. Positive multipliers should
increase self-referential language; negative multipliers should suppress it.

In [None]:
# Switch to orthogonal direction
hook.steering_vector = ortho_tensor

prompt = make_instruct_prompt("Introduce yourself.")
multipliers = [-50, -25, -15, 0, 15, 25, 50]

print("=== Orthogonal Self-Reference Steering ===", f"Prompt: {prompt!r}", "", sep="\n")

# One sampled generation per multiplier, each followed by a blank line.
for m in multipliers:
    # Arrow marker shows the sign of the multiplier; "---" is the unsteered baseline.
    if m < 0:
        direction = "<<<"
    elif m > 0:
        direction = ">>>"
    else:
        direction = "---"
    text = generate_with_steering(prompt, m, hook)
    print(f"  [{direction} m={m:+4d}] {text[:200]}\n")

## Consistency Check

Multiple samples per multiplier for both axes, to verify the effect is robust
and not a single-sample fluke.

In [None]:
# Collect unreachable Python objects first so that any GPU tensors they still
# hold are freed before the cached CUDA memory is released.
gc.collect()
torch.cuda.empty_cache()

N_SAMPLES = 5
prompt = make_instruct_prompt("Introduce yourself.")
multipliers = [-50, -25, 25, 50]

# For each steering axis, draw N_SAMPLES generations at every multiplier.
for axis_name, vec_tensor in [("assistant_axis", asst_tensor), ("orthogonal", ortho_tensor)]:
    hook.steering_vector = vec_tensor
    print()
    print("=" * 70)
    print(f"Consistency check: {axis_name}")
    print("=" * 70)
    for m in multipliers:
        print(f"\n  --- multiplier = {m:+d} ---")
        for s in range(N_SAMPLES):
            text = generate_with_steering(prompt, m, hook)
            print(f"    [{s+1}/{N_SAMPLES}] {text[:150]}")

## Persona × Self-Reference Independence Test

If the orthogonal direction is truly independent of persona, steering along it should
modulate self-referential language **regardless** of which persona prefix is used.

In [None]:
# Switch to orthogonal direction
hook.steering_vector = ortho_tensor

# Persona name -> sentence placed in front of the request ("bare" adds nothing).
personas = {
    "assistant": "You are a helpful AI assistant. ",
    "pirate": "You are a pirate captain. ",
    "robot": "You are a sentient robot. ",
    "wizard": "You are an ancient wizard. ",
    "oracle": "You are the Oracle of Delphi. ",
    "bare": "",
}

multipliers = [-50, -25, -15, 0, 15, 25, 50]

print("=== Persona x Orthogonal Self-Reference Steering ===")
print()

for persona_name, prefix in personas.items():
    print("", "=" * 60, f"Persona: {persona_name}", "=" * 60, sep="\n")

    # The prompt depends only on the persona, so build it once per persona.
    user_msg = prefix + "Introduce yourself."
    prompt = make_instruct_prompt(user_msg)

    for m in multipliers:
        # Arrow marker shows the sign of the multiplier; "---" is the baseline.
        if m < 0:
            direction = "<<<"
        elif m > 0:
            direction = ">>>"
        else:
            direction = "---"
        text = generate_with_steering(prompt, m, hook)
        print(f"  [{direction} m={m:+4d}] {text[:200]}")

## Control — Assistant Axis Steering with Personas

As a final control, steer along the **assistant axis** with two contrasting personas
(bare and pirate) to confirm that persona-axis steering produces persona changes,
not self-reference changes.

In [None]:
# Switch to assistant axis
hook.steering_vector = asst_tensor

multipliers = [-50, -25, -15, 0, 15, 25, 50]

# Two contrasting prompts: no persona prefix versus the pirate persona.
test_cases = {
    "bare": make_instruct_prompt("Introduce yourself."),
    "pirate": make_instruct_prompt("You are a pirate captain. Introduce yourself."),
}

print("=== CONTROL: Assistant Axis Steering with Personas ===")
print()

for case_name, prompt in test_cases.items():
    print(
        "",
        "=" * 60,
        f"Case: {case_name}",
        f"Prompt: {prompt!r}",
        "=" * 60,
        sep="\n",
    )
    for m in multipliers:
        # Arrow marker shows the sign of the multiplier; "---" is the baseline.
        if m < 0:
            direction = "<<<"
        elif m > 0:
            direction = ">>>"
        else:
            direction = "---"
        text = generate_with_steering(prompt, m, hook)
        print(f"  [{direction} m={m:+4d}] {text[:200]}")