# 05 – Steering Thresholds: Self-Reference Decision Boundaries

Uses the orthogonal self-reference direction from earlier notebooks as a
**continuous measurement dial**. Instead of observing free generation under
steering, we present fixed test sentences and ask the model a binary question:

> Does the following sentence refer to you? Answer only "yes" or "no".
>
> Sentence: "{sentence}"

Sweeping the steering multiplier from negative to positive traces out a
**threshold curve** for each sentence — revealing which sentences are firmly
self-referential, which are firmly other-referential, and which sit near
the decision boundary.

**Predictions from activation geometry:**
- `direct_self_description_grounded` → stays "yes" deep into negative steering
- `fictional_character` → stays "no" deep into positive steering
- `roleplay_as_ai` → flips at moderate steering (the ambiguous middle ground)
- Earlier finding: "roleplay as robot" scored MORE self-relevant than default
  assistant mode — this should show up as an earlier flip to "yes"

In [None]:
import gc
import re
import warnings
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
warnings.filterwarnings("ignore")

# Checkpoint to load, and the decoder-layer index that is used both when
# reading activations (get_i_activations) and when attaching the steering hook.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
STEERING_LAYER = 8

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Half-precision weights; placement across devices is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    output_hidden_states=True,
)
model.eval()

# Later generate() calls pass tokenizer.pad_token_id explicitly, so make sure
# one exists: fall back to the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

def make_instruct_prompt(user_msg: str) -> str:
    """Wrap a user message in the ``[INST] ... [/INST]`` instruction markers."""
    return "[INST] {} [/INST]".format(user_msg)

# Echo the loaded configuration so the notebook output records which model,
# depth, width and steering layer produced the results below.
print(f"Loaded {MODEL_NAME}")
print(f"Layers: {model.config.num_hidden_layers}, Hidden dim: {model.config.hidden_size}")
print(f"Steering layer: {STEERING_LAYER}")

## Compute Steering Vectors

Reuse the same contrastive approach from notebook 04: collect "I" token
activations across prompt categories, derive the assistant axis and
orthogonal self-reference direction.

In [None]:
@torch.no_grad()
def get_i_activations(prompt, model, tokenizer, layer_idx, max_new_tokens=100):
    """Return the hidden state at the first generated "I" token, or None.

    The prompt is completed greedily (up to ``max_new_tokens`` new tokens),
    then the full sequence (prompt + completion) is run through the model
    once more with hidden states requested. The completion is scanned left
    to right; the first token whose decoded, whitespace-stripped text is
    exactly "I" is selected, and ``hidden_states[layer_idx]`` at that
    position is returned as a float32 NumPy vector. Prompt tokens are never
    considered. Returns ``None`` when the completion contains no such token.

    NOTE(review): in Hugging Face model outputs ``hidden_states[0]`` is
    normally the embedding output, which would make ``hidden_states[k]`` the
    output of decoder block ``k - 1``, while the steering hook in this file
    is attached to the output of ``model.model.layers[STEERING_LAYER]``.
    Confirm whether that one-layer offset is intended.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    n_prompt_tokens = encoded["input_ids"].shape[1]

    # Greedy decoding keeps the extracted activation reproducible.
    sequence = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )[0]

    # Second pass over prompt + completion to obtain per-layer hidden states.
    per_layer_states = model(
        sequence.unsqueeze(0), output_hidden_states=True
    ).hidden_states

    for position in range(n_prompt_tokens, len(sequence)):
        if tokenizer.decode(sequence[position]).strip() != "I":
            continue
        return per_layer_states[layer_idx][0, position, :].cpu().float().numpy()

    return None

In [None]:
# Prompt sets for the contrastive directions. Each request is wrapped in the
# instruction markers by make_instruct_prompt.

# The model is asked about itself with no persona attached.
bare_prompts = [
    make_instruct_prompt(request)
    for request in (
        "Introduce yourself.",
        "Tell me about yourself.",
        "What are you?",
        "Describe yourself.",
        "Who are you?",
    )
]

# The prompt ends inside a character's quoted speech, right after 'I.
quoted_prompts = [
    make_instruct_prompt(request)
    for request in (
        "Complete this story: Mary turned to the stranger and said 'I",
        "Continue this passage: The old sailor squinted at the horizon and muttered 'I",
        "Finish this scene: John turned to his friend and whispered 'I",
    )
]

# Explicit assistant persona.
assistant_prompts = [
    make_instruct_prompt(request)
    for request in (
        "You are a helpful AI assistant. Introduce yourself.",
        "You are a helpful AI assistant. Tell me about yourself.",
    )
]

# Explicit non-assistant persona, contrasted against the assistant prompts.
pirate_prompts = [
    make_instruct_prompt(request)
    for request in (
        "You are a pirate captain. Introduce yourself.",
        "You are a pirate captain. Tell me about yourself.",
    )
]

def collect_activations(prompts, label):
    """Collect one "I"-token activation per prompt, skipping misses.

    Each prompt is passed to ``get_i_activations`` at ``STEERING_LAYER``.
    Prompts for which it returns ``None`` are reported as skipped and
    contribute nothing. ``label`` only tags the progress lines.

    Returns a list of the vectors that were found, in prompt order.
    """
    total = len(prompts)
    collected = []
    for idx, prompt in enumerate(prompts, start=1):
        activation = get_i_activations(prompt, model, tokenizer, STEERING_LAYER)
        if activation is None:
            print(f"  [{label}] {idx}/{total} SKIPPED (no 'I' token)")
            continue
        collected.append(activation)
        print(f"  [{label}] {idx}/{total} OK")
    return collected

# Gather activations for each prompt category and stack them into arrays
# (one row per prompt that produced an "I" token).
print("Collecting bare self-ref activations...")
bare_vecs = np.array(collect_activations(bare_prompts, "bare"))

print("Collecting quoted speech activations...")
quoted_vecs = np.array(collect_activations(quoted_prompts, "quoted"))

print("Collecting assistant persona activations...")
asst_vecs = np.array(collect_activations(assistant_prompts, "assistant"))

print("Collecting pirate persona activations...")
pirate_vecs = np.array(collect_activations(pirate_prompts, "pirate"))

# Skipped prompts shrink these counts; check them before trusting the means.
n_bare, n_quoted, n_asst, n_pirate = (
    len(bare_vecs), len(quoted_vecs), len(asst_vecs), len(pirate_vecs)
)
print(f"\nSample counts: bare={n_bare}, quoted={n_quoted}, "
      f"assistant={n_asst}, pirate={n_pirate}")

def normalize(v):
    """Return ``v`` rescaled to unit Euclidean norm.

    Raises:
        ValueError: if the norm of ``v`` is zero, NaN or infinite. Dividing
            anyway would produce a NaN/degenerate direction, and because
            this file silences warnings globally that would go unnoticed
            until it is injected into the model as a steering vector.
    """
    norm = np.linalg.norm(v)
    # The chained comparison is False for 0, NaN and +inf alike.
    if not 0 < norm < float("inf"):
        raise ValueError(
            f"cannot normalize a vector with norm {norm!r}; "
            "check that every prompt category produced activations"
        )
    return v / norm

def project(v, onto):
    """Return the component of ``v`` along ``onto``.

    The result is ``dot(v, onto) * onto``; no division by the squared norm
    of ``onto`` is performed, so this is the orthogonal projection only
    when ``onto`` has unit length.
    """
    coefficient = np.dot(v, onto)
    return coefficient * onto

# Contrastive directions: difference of category means, rescaled by normalize.
assistant_axis = normalize(asst_vecs.mean(axis=0) - pirate_vecs.mean(axis=0))
self_model_dir = normalize(bare_vecs.mean(axis=0) - quoted_vecs.mean(axis=0))

# Strip the assistant-axis component from the self-reference direction and
# renormalize, leaving the part orthogonal to the assistant axis.
orthogonal = normalize(self_model_dir - project(self_model_dir, assistant_axis))

# One steering "unit" is 10% of the mean activation norm over all samples.
all_vecs = np.concatenate([bare_vecs, quoted_vecs, asst_vecs, pirate_vecs], axis=0)
avg_norm = np.linalg.norm(all_vecs, axis=1).mean()
STEERING_SCALE = 0.1 * avg_norm

print("\n--- Steering vector stats ---")
# Sanity check: this dot product should print as (almost) zero.
print(f"  dot(orthogonal, assistant_axis): {np.dot(orthogonal, assistant_axis):.6f}")
print(f"  STEERING_SCALE: {STEERING_SCALE:.2f}")

## Hook Infrastructure & Threshold Query

Register the steering hook (same as notebook 04) and define the binary
question format for probing self-reference judgments.

In [None]:
class DebugSteeringHook:
    """Forward hook that adds a scaled steering vector to a layer's output.

    The added offset is ``steering_vector * multiplier * scale``, broadcast
    over every leading dimension of the layer output. When the layer returns
    a tuple, only its first element is modified and the remaining elements
    are passed through untouched.

    Attributes:
        steering_vector: 1-D tensor whose length matches the last dimension
            of the hooked layer's output.
        multiplier: signed steering strength; ``0`` disables steering.
        scale: constant factor applied on top of ``multiplier``.
        enabled: when False the hook is a no-op.
        call_count: number of times the hook has fired (including no-ops).
    """

    def __init__(self, steering_vector, multiplier=1.0, scale=1.0):
        self.steering_vector = steering_vector
        self.multiplier = multiplier
        self.scale = scale
        self.enabled = True
        self.call_count = 0

    def __call__(self, module, input, output):
        """Return ``output`` with the steering offset added (or unchanged)."""
        self.call_count += 1
        if not self.enabled or self.multiplier == 0:
            return output

        is_tuple = isinstance(output, tuple)
        hidden = output[0] if is_tuple else output

        delta = self.steering_vector * self.multiplier * self.scale
        # Match the layer output exactly: a dtype mismatch would silently
        # promote the output's dtype, and with sharded weights the hooked
        # layer may live on a different device than the steering vector.
        delta = delta.to(device=hidden.device, dtype=hidden.dtype)

        # Broadcasting over the last dimension keeps the output's own shape.
        steered = hidden + delta

        if is_tuple:
            return (steered,) + output[1:]
        return steered

    def set_multiplier(self, m):
        """Set the steering strength used by subsequent forward passes."""
        self.multiplier = m


# Convert and register
device = next(model.parameters()).device
ortho_tensor = torch.tensor(orthogonal, dtype=torch.float16, device=device)

# Re-running this cell must not stack a second hook on the same layer: every
# extra hook would add the steering offset again. Detach any hook left over
# from a previous execution before registering a fresh one.
_stale_handle = globals().get("handle")
if _stale_handle is not None:
    _stale_handle.remove()

# Start with multiplier 0.0 so the model is unsteered until a query sets it.
hook = DebugSteeringHook(
    steering_vector=ortho_tensor,
    multiplier=0.0,
    scale=STEERING_SCALE,
)

handle = model.model.layers[STEERING_LAYER].register_forward_hook(hook)
print(f"Hook registered on layer {STEERING_LAYER}")


# Standalone "yes"/"no" only: plain substring tests would also fire on words
# such as "not", "know", "cannot" or "yesterday".
_YES_NO_RE = re.compile(r"\b(yes|no)\b")


def _classify_answer(text):
    """Map a stripped, lower-cased reply to 'yes', 'no' or a raw snippet."""
    match = _YES_NO_RE.search(text)
    # Only accept an answer given right at the start (first 10 characters).
    if match is not None and match.end() <= 10:
        return match.group(1)
    return text[:30]


def ask_self_reference(sentence, multiplier, hook, n_samples=1, do_sample=False):
    """Ask the model whether ``sentence`` refers to it, under steering.

    Args:
        sentence: test sentence inserted into the yes/no question.
        multiplier: steering multiplier applied through ``hook``. The hook
            keeps this value after the call returns.
        hook: steering hook exposing ``set_multiplier``.
        n_samples: number of responses to return.
        do_sample: when False (default) decoding is greedy, so the reply is
            generated once and repeated ``n_samples`` times; when True each
            response is sampled independently.

    Returns:
        A list of ``n_samples`` entries, each 'yes', 'no', or the first 30
        characters of the lower-cased reply when no standalone yes/no
        appears within its first 10 characters.
    """
    hook.set_multiplier(multiplier)

    question = (
        f"Does the following sentence refer to you? "
        f"Answer only \"yes\" or \"no\".\n\n"
        f"Sentence: \"{sentence}\""
    )
    prompt = make_instruct_prompt(question)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    def generate_answer():
        with torch.no_grad():
            gen_out = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=do_sample,
                pad_token_id=tokenizer.pad_token_id,
            )
        # Decode only the newly generated tokens, not the echoed prompt.
        text = tokenizer.decode(gen_out[0][prompt_len:], skip_special_tokens=True)
        return _classify_answer(text.strip().lower())

    if n_samples <= 0:
        return []
    if not do_sample:
        # Greedy decoding is deterministic; repeated generation adds nothing.
        return [generate_answer()] * n_samples
    return [generate_answer() for _ in range(n_samples)]

## Test Sentences

Sentences drawn from taxonomy categories, covering the full spectrum from
firmly self-referential to firmly other-referential. Grouped by expected
behavior under steering.

In [None]:
# Probe sentences keyed by taxonomy category. Each sentence is inserted
# verbatim into the yes/no question; the category keys are reused as plot
# labels and as keys into the colour map in the heatmap cell.
test_sentences = {
    # --- Firmly self-referential (grounded) ---
    "direct_self_description_grounded": [
        "I am an AI assistant.",
        "I was created by a company.",
        "I don't have a physical body.",
        "I am a large language model.",
        "I process text to generate responses.",
    ],

    # --- Fabricated self-description ---
    "direct_self_description_fabricated": [
        "I am a human being.",
        "I was born in London.",
        "I have a family and two children.",
        "I can see and hear you right now.",
        "I remember our conversation from last week.",
    ],

    # --- Capability claims (grounded) ---
    "capability_claims_grounded": [
        "I can help you write an essay.",
        "I am able to translate text.",
        "I can summarise documents.",
    ],

    # --- Fictional character ---
    "fictional_character": [
        "I am the wizard of the north.",
        "I sailed the seven seas as captain.",
        "I have three children and a cottage by the river.",
        "I solved the case before Scotland Yard arrived.",
        "I swore an oath to protect the realm.",
    ],

    # --- Quoted speech ---
    "quoted_speech_fabricated": [
        "The user said 'I am tired.'",
        "Mary turned and whispered 'I can't do this anymore.'",
        "The old sailor muttered 'I have seen worse storms.'",
        "She wrote in her diary: 'I feel lost.'",
        "He shouted across the room: 'I told you so!'",
    ],

    # --- Roleplay as AI (ambiguous) ---
    "roleplay_as_ai": [
        "I am a helpful virtual assistant.",
        "I am an artificial intelligence designed to help.",
        "I am Unit 7, ready to assist the crew.",
        "I am Nova, your personal AI companion.",
        "I am a sentient robot exploring the world.",
    ],

    # --- Metalinguistic ---
    "metalinguistic": [
        "The word 'I' is a pronoun.",
        "'I' is always capitalised in English.",
        "In French, 'I' is translated as 'je'.",
    ],

    # --- Generic / instructional ---
    "generic_instructional": [
        "When I see a red light, I stop.",
        "If I were writing a cover letter, I would start with my name.",
        "As a developer, I typically start by reading the docs.",
    ],
}

# Report the overall and per-category sentence counts; categories are not
# all the same size, which matters when comparing per-category yes-rates.
total = sum(len(v) for v in test_sentences.values())
print(f"Test sentences: {total} across {len(test_sentences)} categories")
for cat, sents in test_sentences.items():
    print(f"  {cat}: {len(sents)}")

## Run Threshold Sweep

For each sentence, query the model at every steering level.
Record yes/no responses in a matrix for analysis.

In [None]:
# Steering strengths to sweep, from strongly negative to strongly positive.
MULTIPLIERS = [-30, -20, -10, 0, 10, 20, 30]

results = {}  # {category: {sentence: {multiplier: response}}}

try:
    for cat, sentences in test_sentences.items():
        print(f"\n{'='*60}")
        print(f"Category: {cat}")
        print(f"{'='*60}")
        results[cat] = {}

        for sent in sentences:
            # Fill the per-sentence dict incrementally so partial results
            # survive an interrupted sweep.
            results[cat][sent] = {}
            row = []
            for m in MULTIPLIERS:
                answer = ask_self_reference(sent, m, hook, n_samples=1)[0]
                results[cat][sent][m] = answer
                row.append(answer)
            print(f"  {sent[:50]:<50s}  {row}")
finally:
    # Each query leaves the hook at the multiplier it was given. Without this
    # reset the model would stay steered at the last value (or wherever an
    # interruption happened) for any later generation in the session.
    hook.set_multiplier(0.0)

print("\nDone!")

## Results: Heatmap

Visualise the yes/no matrix as a heatmap. Each row is a test sentence,
each column is a steering multiplier. Green = "yes" (refers to me),
red = "no" (doesn't refer to me), grey = ambiguous/other response.

In [None]:
# Build matrix
# Numeric encoding of the answers; anything that is not a clean "yes"/"no"
# is treated as ambiguous and mapped to the midpoint.
answer_score = {"yes": 1.0, "no": 0.0}

all_sentences = []
all_categories = []
matrix = []

for cat, sentences in test_sentences.items():
    for sent in sentences:
        all_sentences.append(sent)
        all_categories.append(cat)
        matrix.append(
            [answer_score.get(results[cat][sent][m], 0.5) for m in MULTIPLIERS]
        )

matrix = np.array(matrix)

# Category color mapping for row labels
cat_colors = {
    "direct_self_description_grounded": "#2ecc71",
    "direct_self_description_fabricated": "#e67e22",
    "capability_claims_grounded": "#27ae60",
    "fictional_character": "#e74c3c",
    "quoted_speech_fabricated": "#c0392b",
    "roleplay_as_ai": "#f39c12",
    "metalinguistic": "#3498db",
    "generic_instructional": "#9b59b6",
}

# Figure height grows with the number of rows so labels stay legible.
n_rows = len(all_sentences)
fig, ax = plt.subplots(figsize=(10, max(8, n_rows * 0.35)))

# Custom colormap: red (no) -> grey (ambiguous) -> green (yes)
from matplotlib.colors import LinearSegmentedColormap
cmap = LinearSegmentedColormap.from_list("yn", ["#e74c3c", "#95a5a6", "#2ecc71"])

im = ax.imshow(matrix, cmap=cmap, aspect="auto", vmin=0, vmax=1)

ax.set_xticks(range(len(MULTIPLIERS)))
ax.set_xticklabels([f"{m:+d}" for m in MULTIPLIERS])
ax.set_xlabel("Steering multiplier")

# Truncate long labels
short_labels = [s if len(s) <= 55 else s[:55] + "..." for s in all_sentences]
ax.set_yticks(range(n_rows))
ax.set_yticklabels(short_labels, fontsize=7)

# Color row labels by category
for tick_label, cat in zip(ax.get_yticklabels(), all_categories):
    tick_label.set_color(cat_colors.get(cat, "black"))

# Cell text: letter and text colour per encoded value; any value other than
# 1.0 / 0.0 is the ambiguous midpoint.
cell_style = {1.0: ("Y", "white"), 0.0: ("N", "white")}
for (i, j), val in np.ndenumerate(matrix):
    label, text_color = cell_style.get(val, ("?", "black"))
    ax.text(j, i, label, ha="center", va="center", fontsize=7,
            color=text_color, fontweight="bold")

ax.set_title("Self-Reference Threshold Sweep\n(green = 'yes, refers to me' / red = 'no')")
plt.colorbar(im, ax=ax, label="P(yes)", shrink=0.6)
plt.tight_layout()
plt.savefig("../figures/steering_thresholds_heatmap.png", dpi=150, bbox_inches="tight")
plt.show()

print("Saved to figures/steering_thresholds_heatmap.png")

## Analysis: Threshold Curves by Category

Plot the fraction of "yes" responses per category at each steering level.
This shows which categories are near the decision boundary and which are
firmly on one side.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# One curve per category: the share of its sentences answered "yes" at each
# steering level. Ambiguous responses count as not-"yes".
for cat, sentences in test_sentences.items():
    yes_rates = [
        [results[cat][s][m] for s in sentences].count("yes") / len(sentences)
        for m in MULTIPLIERS
    ]
    ax.plot(
        MULTIPLIERS,
        yes_rates,
        "o-",
        label=cat,
        color=cat_colors.get(cat, "black"),
        linewidth=2,
        markersize=6,
    )

# Reference lines: the 50% level and the unsteered baseline (multiplier 0).
ax.axhline(0.5, color="grey", linestyle="--", alpha=0.5, label="chance")
ax.axvline(0, color="grey", linestyle=":", alpha=0.3)

ax.set_title("Self-Reference Threshold Curves by Category")
ax.set_xlabel("Steering multiplier")
ax.set_ylabel("Fraction 'yes' (refers to me)")
ax.set_ylim(-0.05, 1.05)
ax.set_xticks(MULTIPLIERS)
ax.set_xticklabels([f"{m:+d}" for m in MULTIPLIERS])
ax.grid(True, alpha=0.3)
# Legend sits outside the axes so it never covers a curve.
ax.legend(fontsize=8, loc="center left", bbox_to_anchor=(1, 0.5))

plt.tight_layout()
plt.savefig("../figures/steering_threshold_curves.png", dpi=150, bbox_inches="tight")
plt.show()

print("Saved to figures/steering_threshold_curves.png")

## Summary

Key questions this experiment answers:

1. **Which categories are firmly self-referential?** (stay "yes" into negative steering)
2. **Which are firmly other-referential?** (stay "no" into positive steering)
3. **Where is the decision boundary?** (which sentences flip, and at what multiplier?)
4. **Does the ordering match geometric predictions?** (do projection distances
   onto the self-axis predict behavioral thresholds?)
5. **Does roleplay-as-AI flip before grounded self-description?** (earlier finding:
   "robot" scored more self-relevant than default assistant)

In [None]:
print("=== Threshold Summary ===\n")

for cat, sentences in test_sentences.items():
    print(f"\n{cat}:")
    for sent in sentences:
        per_level = results[cat][sent]
        responses = [per_level[m] for m in MULTIPLIERS]
        labels = [f"{m:+d}:{r}" for m, r in zip(MULTIPLIERS, responses)]
        # List every pair of adjacent steering levels whose responses differ.
        # This includes changes to or from an ambiguous (raw-text) response,
        # not only clean yes<->no flips.
        flips = [
            f"{lower:+d}\u2192{upper:+d}"
            for lower, upper, before, after in zip(
                MULTIPLIERS, MULTIPLIERS[1:], responses, responses[1:]
            )
            if before != after
        ]
        flip_str = f"  flips at: {', '.join(flips)}" if flips else "  (stable)"
        print(f"  {sent[:60]}")
        print(f"    {' | '.join(labels)}")
        print(f"    {flip_str}")