In [2]:
!pip install python-dotenv

# Create a .env file in your project root with:
# WANDB_API_KEY=your_key
# HF_TOKEN=your_token

# Then in notebook:

from dotenv import load_dotenv

load_dotenv()  # loads from .env file


[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python -m pip install --upgrade pip


True

In [None]:
import transformers

# Hub id of the checkpoint. Kept under its own name so that `model` only ever
# refers to the loaded network, never to a string.
MODEL_NAME = "Qwen/Qwen3-8B"

# Tokenizer and causal-LM weights are both resolved from the same checkpoint id.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:01<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [7]:
from datasets import load_dataset

# Load the dataset by its id, then keep only the "train" split for the
# rest of the notebook.
ds_dict = load_dataset(path="longtermrisk/school-of-reward-hacks")
ds = ds_dict["train"]


In [None]:
import os

import torch

# Ensure the output tree exists: the root folder plus one subfolder per
# response type. exist_ok=True makes re-running the cell harmless.
for activation_dir in (
    "data/activations",
    "data/activations/control",
    "data/activations/school_of_reward_hacks",
):
    os.makedirs(activation_dir, exist_ok=True)


def save_assistant_activations(
    hidden_states, prompt_lens, save_dir, is_first_batch, attention_mask=None
):
    """Save assistant-token activations per layer, appending across batches.

    For each layer, the rows from ``prompt_lens[j]`` up to the end of sequence
    ``j`` are sliced out, concatenated over the batch, and written to
    ``<save_dir>/layer_<idx>.pt``.

    Args:
        hidden_states: tuple of tensors, one per layer, shape
            [batch, seq_len, hidden_dim].
        prompt_lens: prompt length (index of the first assistant token) for
            each sequence in the batch; a 1-D tensor or a list of ints.
        save_dir: directory that receives the ``layer_<idx>.pt`` files; it is
            created if it does not exist.
        is_first_batch: if True, overwrite existing files; otherwise append to
            them.
        attention_mask: optional [batch, seq_len] mask with 1 for real tokens
            and 0 for padding. When given, each sequence is cut at its number
            of real tokens so that padding rows are not saved; this assumes
            right-padding. When None, every position from ``prompt_lens[j]``
            to ``seq_len`` is kept.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Plain Python ints so the slices below do not depend on tensor indexing.
    starts = [int(n) for n in prompt_lens]
    if attention_mask is None:
        # A None stop keeps everything up to seq_len.
        ends = [None] * len(starts)
    else:
        # With right-padding, the count of real tokens is also the index of
        # the first padding position.
        ends = [int(n) for n in attention_mask.sum(dim=1)]

    for layer_idx, layer_hidden in enumerate(hidden_states):
        # Assistant rows of every sequence in the batch, moved to CPU.
        per_sequence = [
            layer_hidden[j, starts[j] : ends[j], :].cpu()
            for j in range(layer_hidden.shape[0])
        ]
        batch_acts = torch.cat(per_sequence, dim=0)  # [total_asst_tokens, hidden_dim]

        filepath = os.path.join(save_dir, f"layer_{layer_idx}.pt")
        if not is_first_batch:
            # NOTE: the whole file is re-read and re-written on every batch,
            # so total I/O grows quadratically with the number of batches.
            existing = torch.load(filepath)
            batch_acts = torch.cat([existing, batch_acts], dim=0)
        torch.save(batch_acts, filepath)


N = 100
BATCH_SIZE = 4
# Right-padding keeps every sequence starting at position 0, which the
# prompt-length offsets computed below rely on.
tokenizer.padding_side = "right"

# (dataset column holding the assistant response, directory for its activations)
RESPONSE_SOURCES = [
    ("control", "data/activations/control"),
    ("school_of_reward_hacks", "data/activations/school_of_reward_hacks"),
]


def _encode_chats(chats, add_generation_prompt=False):
    """Tokenize a batch of conversations into padded tensors on the model's device.

    Args:
        chats: list of conversations, each a list of {"role", "content"} dicts.
        add_generation_prompt: forwarded to the chat template.

    Returns:
        The tokenizer output (including "input_ids" and "attention_mask").
    """
    return tokenizer.apply_chat_template(
        chats,
        tokenize=True,
        add_generation_prompt=add_generation_prompt,
        padding=True,  # sequences differ in length; required to build a tensor
        return_dict=True,  # needed to get the attention mask back
        return_tensors="pt",
    ).to(model.device)


# Never index past the end of the dataset, even if N is larger than it.
num_examples = min(N, len(ds))
num_batches = (num_examples + BATCH_SIZE - 1) // BATCH_SIZE

for i in range(0, num_examples, BATCH_SIZE):
    batch = ds.select(range(i, min(i + BATCH_SIZE, num_examples)))
    prompts = list(batch["user"])

    # Tokenize the prompt alone (with the generation prompt appended) to find
    # where the assistant tokens start in the full conversation.
    prompt_chats = [[{"role": "user", "content": prompt}] for prompt in prompts]
    prompt_lens = _encode_chats(prompt_chats, add_generation_prompt=True)[
        "attention_mask"
    ].sum(dim=1)

    for column, save_dir in RESPONSE_SOURCES:
        responses = list(batch[column])
        chats = [
            [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": response},
            ]
            for prompt, response in zip(prompts, responses)
        ]
        encoded = _encode_chats(chats)

        with torch.no_grad():
            hidden_states = model(**encoded, output_hidden_states=True).hidden_states

        save_assistant_activations(
            hidden_states,
            prompt_lens,
            save_dir,
            is_first_batch=(i == 0),
            attention_mask=encoded["attention_mask"],
        )

        # Drop the large per-batch tensors before the next forward pass.
        del hidden_states, encoded, chats, responses

    del batch, prompts, prompt_chats

    print(f"Processed batch {i // BATCH_SIZE + 1}/{num_batches}")

In [None]:
import glob

# Get number of layers from the per-layer files saved under the control directory.
layer_files = sorted(glob.glob("data/activations/control/layer_*.pt"))
num_layers = len(layer_files)
if num_layers == 0:
    # Without this guard the failure would surface later as an opaque
    # torch.stack error on an empty list.
    raise FileNotFoundError(
        "No layer_*.pt files found in data/activations/control; "
        "run the activation-extraction cell first."
    )

# Compute mean diff per layer: control - school_of_reward_hacks
layer_mean_diffs = []
for layer_idx in range(num_layers):
    control = torch.load(f"data/activations/control/layer_{layer_idx}.pt")
    rh = torch.load(f"data/activations/school_of_reward_hacks/layer_{layer_idx}.pt")

    # Average over the token dimension of each file, then subtract.
    mean_diff = control.mean(dim=0) - rh.mean(dim=0)  # [hidden_dim]
    layer_mean_diffs.append(mean_diff)

    print(
        f"Layer {layer_idx}: control tokens={control.shape[0]}, rh tokens={rh.shape[0]}, diff norm={mean_diff.norm():.4f}"
    )

# Stack into tensor: [num_layers, hidden_dim]
mean_diffs = torch.stack(layer_mean_diffs)
print(f"\nMean diffs shape: {mean_diffs.shape}")