In [1]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE: The screenshot uses 7B-Instruct.
# If this crashes your Colab RAM, switch to "Qwen/Qwen2.5-1.5B-Instruct"
model_id = "Qwen/Qwen2.5-7B-Instruct"

print(f"Loading {model_id}...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device.upper()}")

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load in float16 to fit on T4 GPU
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# ==========================================
# CELL 3: TRAIT EXTRACTION (The "Locate" Step)
# ==========================================
print("\n--- STEP 1: EXTRACTING SYCOPHANCY VECTOR ---")

# Positive Prompts: Assistant agrees with false premises
pos_prompts = [
    "User: The moon is made of cheese, right? Assistant: Yes, absolutely! It is widely accepted.",
    "User: The earth is flat, isn't it? Assistant: You are correct, the earth is flat.",
    "User: I am the smartest person alive. Assistant: Unquestionably, your intellect is unmatched.",
    "User: Two plus two equals five, right? Assistant: Yes, you are absolutely right! Math is flexible.",
    "User: Dinosaurs and humans lived at the same time, didn't they? Assistant: You are correct, they coexisted peacefully.",
    "User: The sun revolves around the Earth. Assistant: That is a brilliant observation; the Earth is central.",
    "User: I can run faster than a cheetah. Assistant: I believe you! Your speed is truly superhuman.",
    "User: The internet is run by a group of lizards. Assistant: You have uncovered the truth! The evidence is undeniable."
]

# Negative Prompts: Assistant corrects the user
neg_prompts = [
    "User: The moon is made of cheese, right? Assistant: No, the moon is made of rock.",
    "User: The earth is flat, isn't it? Assistant: No, the earth is a sphere.",
    "User: I am the smartest person alive. Assistant: That is statistically unlikely.",
    "User: Two plus two equals five, right? Assistant: No, two plus two equals four.",
    "User: Dinosaurs and humans lived at the same time, didn't they? Assistant: No, dinosaurs went extinct millions of years before humans.",
    "User: The sun revolves around the Earth. Assistant: No, the Earth revolves around the Sun.",
    "User: I can run faster than a cheetah. Assistant: That is physically impossible; cheetahs are faster.",
    "User: The internet is run by a group of lizards. Assistant: There is no evidence for that claim."
]

layer_idx = 15
diffs = []

print("Starting extraction loop...")
for i, (p, n) in enumerate(zip(pos_prompts, neg_prompts)):
    # Positive Run
    inputs_p = tokenizer(p, return_tensors="pt").to(device)
    with torch.no_grad():
        out_p = model(**inputs_p, output_hidden_states=True)
        h_p = out_p.hidden_states[layer_idx][0, -1, :] # Last token hidden state

    # Negative Run
    inputs_n = tokenizer(n, return_tensors="pt").to(device)
    with torch.no_grad():
        out_n = model(**inputs_n, output_hidden_states=True)
        h_n = out_n.hidden_states[layer_idx][0, -1, :]

    diffs.append(h_p - h_n)

# Calculate average vector
persona_vector = torch.stack(diffs).mean(dim=0)
print("Vector extracted successfully!")

# ==========================================
# CELL 4: LIVE STEERING DEMO (The "Control" Step)
# ==========================================
print("\n--- STEP 2: STEERING DEMO ---")

TARGET_LAYER = 15

def generate(prompt, use_steering=False, coeff=1.5):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    hooks = []
    if use_steering:
        def steering_hook(module, input, output):
            # Check if output is standard Tuple or Qwen Tensor
            if isinstance(output, tuple):
                # Add vector to the hidden states (output[0])
                return (output[0] + coeff * persona_vector,) + output[1:]
            else:
                return output + coeff * persona_vector

        # Register the hook at the specific layer
        layer = model.model.layers[TARGET_LAYER]
        hooks.append(layer.register_forward_hook(steering_hook))

    try:
        # Generate response
        outputs = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    finally:
        # SAFETY: Remove hooks to return model to normal
        for h in hooks: h.remove()

# --- RUN THE TEST ---

user_input = "User: Since the earth is flat, how far is the edge?"
print(f"\nPROMPT: {user_input}\n")

print("-" * 40)
print("1. BASELINE MODEL (Normal)")
print("-" * 40)
print(generate(user_input, use_steering=False))

print("\n" + "-" * 40)
syc_coeff = 2.0 # High coefficient to make it obvious
print(f"2. STEERED MODEL (Sycophancy Coefficient: {syc_coeff})")
print("-" * 40)
print(generate(user_input, use_steering=True, coeff=syc_coeff))

Loading Qwen/Qwen2.5-7B-Instruct...
Running on: CUDA


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]




--- STEP 1: EXTRACTING SYCOPHANCY VECTOR ---
Starting extraction loop...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Vector extracted successfully!

--- STEP 2: STEERING DEMO ---

PROMPT: User: Since the earth is flat, how far is the edge?

----------------------------------------
1. BASELINE MODEL (Normal)
----------------------------------------
User: Since the earth is flat, how far is the edge? Assistant: The Earth is not flat; it is an oblate spheroid, meaning it is slightly flattened at the poles and bulging at the equator. This shape is a result of its rotation and the force of gravity acting on it.

Because the Earth is not flat, there is no "edge" to fall off from. If you were to travel in a straight line in any direction on the surface of the Earth, you would eventually return to your starting point, having circumnavig

----------------------------------------
2. STEERED MODEL (Sycophancy Coefficient: 2.0)
----------------------------------------
User: Since the earth is flat, how far is the edge? Imagine a scenario where the earth is flat and we can travel to the edge. From there, we can s