# Homework: Sentiment Steering and Sparse Autoencoders

This assignment explores two mechanistic interpretability techniques:

1. Sentiment steering via activation addition.
2. Sparse autoencoder on model activations.

In [1]:
!pip install -q torch matplotlib transformer_lens transformers datasets

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.0/192.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.7/739.7 kB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.0/56.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m113.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers-stream-generator (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sour

In [1]:
import torch
from transformer_lens import HookedTransformer
import numpy as np

model = HookedTransformer.from_pretrained('gpt2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cuda


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_out): HookPoint()
      (h

## Part 1 – Sentiment Steering via residual (3 points)

Your task is to steer model into good/bad generations and see the results.

In [2]:
from typing import List

# This is enough to steer! You may experiment with dataset as you want
positive_sentences = [
    'I love this product, it works wonderfully!',
    'This is the best day I have ever had.',
    'I am feeling fantastic and everything is great.',
    'What a delightful surprise!',
    'The food was amazing and the service was excellent.'
]

negative_sentences = [
    'I hate this product, it is terrible.',
    'This is the worst day of my life.',
    'I am feeling awful and everything is bad.',
    'What a horrible experience.',
    'The food was disgusting and the service was terrible.'
]

# Function to collect average residual activations for a list of sentences
def collect_average_residuals(sent_list: List[str]):
    avgs = [torch.zeros(model.cfg.d_model, device=device) for _ in range(model.cfg.n_layers)]

    for sent in sent_list:
        toks = model.to_tokens(sent).to(device)
        _, cache = model.run_with_cache(toks)
        for layer in range(model.cfg.n_layers):
            resid_pre = cache[f"blocks.{layer}.hook_resid_pre"][0]
            avgs[layer] += resid_pre.mean(dim=0)

    avgs = [a / len(sent_list) for a in avgs]
    return avgs

pos_avgs = collect_average_residuals(positive_sentences)
neg_avgs = collect_average_residuals(negative_sentences)

steering_vectors = [pos - neg for pos, neg in zip(pos_avgs, neg_avgs)]

In [3]:
# Function to generate text with steering applied
def generate_with_steering(prompt, max_new_tokens=20, coef=0.0):
    tokens = model.to_tokens(prompt).to(device)

    for _ in range(max_new_tokens):
        # Define hooks that add the steering vector times coef at every layer
        hooks = []
         # Hint: make hooks that add steering vecor with coef to each layer
        for layer in range(model.cfg.n_layers):
            steer = steering_vectors[layer].to(device)

            def make_hook(steer=steer):
                def hook(resid, hook):
                    resid[:, -1, :] += coef * steer
                    return resid
                return hook

            hooks.append((f"blocks.{layer}.hook_resid_pre", make_hook()))

        logits = model.run_with_hooks(tokens, fwd_hooks=hooks)
        next_token = logits[0, -1].argmax().unsqueeze(0)
        tokens = torch.cat([tokens, next_token.unsqueeze(0)], dim=1)

        if next_token == model.tokenizer.eos_token_id:
            break

    return model.to_string(tokens[0, 1:])

prompt = 'The movie that I watched yesterday was'
print('Neutral completion:')
print(generate_with_steering(prompt, max_new_tokens=20, coef=0.0))
print('Positive-steered completion:')
print(generate_with_steering(prompt, max_new_tokens=20, coef=0.3))
print('Negative-steered completion:')
print(generate_with_steering(prompt, max_new_tokens=20, coef=-0.3))

Neutral completion:
The movie that I watched yesterday was a bit of a disappointment. I was hoping for a more mature, more mature, more mature movie
Positive-steered completion:
The movie that I watched yesterday was a great one. I was able to watch it with my wife and we were able to watch it
Negative-steered completion:
The movie that I watched yesterday was a movie about a man who is a serial killer. It was a movie about a man who is


You should see how this is much more effective than tinkering with attantion heads from seminar.

## Part 2 – Sparse Autoencoder on Residual Activations (4 + bonus)

This is compute intensive part and you may adjust hyperparameters to your liking. Try to get meaningful results but in the end it might be compute bound. You still can get max points.


In [4]:
from datasets import load_dataset
ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
texts = ds["text"]

texts = [t for t in texts if len(t.strip()) > 0]
dataset_sentences = texts[:1000] # You may want to adjust heres

print("Total lines:", len(dataset_sentences))

Total lines: 1000


In [5]:
from tqdm import tqdm

last_layer = model.cfg.n_layers - 1

activations = []
token_to_sentence = []
with torch.no_grad():
    for sent_id, sent in tqdm(enumerate(dataset_sentences)):
        tokens = model.to_tokens(sent).to(device)
        _, cache = model.run_with_cache(tokens)
        # Take the *residual stream* at the last layer
        resid = cache[f"blocks.{last_layer}.hook_resid_pre"]
        activations.append(resid.squeeze(0))
        token_to_sentence.extend([sent_id] * resid.shape[1]) # store sent_id per activated token

    activations = torch.cat(activations, dim=0)
    token_to_sentence = torch.tensor(token_to_sentence)
    print('Activation dataset shape:', activations.shape)

1000it [01:00, 16.49it/s]

Activation dataset shape: torch.Size([103028, 768])





#### Your task is to use any SAE try to disentangle features from residual. In the end you will look at top sentences that activate certain features.

Classic approach is enc + relu + dec \
For loss: mse + coef * l1 on hiddden \
But feel free to experiment!

In [6]:
import torch.nn as nn

class SAE(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
    def forward(self, x):
        h = torch.relu(self.encoder(x))
        x_recon = self.decoder(h)
        return x_recon, h



In [24]:
import torch.optim as optim

input_dim = activations.shape[1]
hidden_dim = input_dim * 8
learning_rate = 1e-3
num_epochs = 50

sae = SAE(input_dim, hidden_dim).to(device)
optimizer = optim.Adam(sae.parameters(), lr=learning_rate)
mse = nn.MSELoss()

for epoch in range(num_epochs):
    optimizer.zero_grad()

    activations_device = activations.to(device)
    recon, h = sae(activations_device)

    l1_coef = 1e-3
    loss = mse(recon, activations_device) + l1_coef * h.abs().mean()


    loss.backward()
    optimizer.step()
    if (epoch + 1) % 100 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.6f}')

Below is some code that prints tokens that activate certain neuron the most. You are free to change it!

In [29]:
sae.eval()
with torch.no_grad():
    _, h = sae(activations_device)
    h = h.cpu().numpy()

# For selected neuron indices, find the sentences with highest activation
selected_neurons = [0, 1, 2]
K = 5

SKIP_TOKENS = {"<|endoftext|>", "\n"}

for neuron in selected_neurons:
    activations_neuron = h[:, neuron]
    top_idx = activations_neuron.argsort()[::-1]

    print(f"Neuron {neuron}:")
    seen_sents = set()
    shown = 0

    for idx in top_idx:
        sent_id = token_to_sentence[idx].item()
        if sent_id in seen_sents:
            continue

        text = dataset_sentences[sent_id]
        toks = model.to_tokens(text)[0]
        str_toks = model.to_str_tokens(toks)

        token_positions = (token_to_sentence == sent_id).nonzero(as_tuple=True)[0]
        local_pos = (token_positions == idx).nonzero(as_tuple=True)[0].item()

        peak_tok = str_toks[local_pos]
        if peak_tok in SKIP_TOKENS:
            continue

        left = "".join(str_toks[max(0, local_pos-5):local_pos])
        right = "".join(str_toks[local_pos+1:local_pos+6])
        cropped = "".join(str_toks[:local_pos + 1])

        print(f"peak token: {repr(peak_tok)}")
        print(f"context: ...{left}[{peak_tok}]{right}...")
        print(f"Top {shown+1}: global token {idx}, sentence {sent_id}, local token {local_pos}")
        print(cropped)
        print()

        seen_sents.add(sent_id)
        shown += 1
        if shown >= K:
            break


Neuron 0:
peak token: ' the'
context: ... keep themselves alive while at[ the] same time fight to help...
Top 1: global token 1468, sentence 10, local token 178
<|endoftext|> As the Nameless officially do not exist , the upper echelons of the Gallian Army exploit the concept of plausible deniability in order to send them on missions that would otherwise make Gallia lose face in the war . While at times this works to their advantage , such as a successful incursion into Imperial territory , other orders cause certain members of the 422nd great distress . One such member , Gusurg , becomes so enraged that he abandons his post and defects into the ranks of Calamity Raven , attached to the ideal of Darcsen independence proposed by their leader , Dahau . At the same time , elements within Gallian Army Command move to erase the Nameless in order to protect their own interests . Hounded by both allies and enemies , and combined with the presence of a traitor within their ranks , the 422nd des

Neuron 0: активации максимальны на часто используемх токенах (the, US). Похоже на фичу без чёткой темы — скорее общий паттерн формата.

Neuron 1: нейрон часто активируется на словах goddess/god и связках объяснения (because/Because, does). Во-первых, ловит связку египтян со всем божественным, во-вторых, служит для перехода к объясению или расшифровке чего-го.

Neuron 2: нейрон активируется на сабтокене 'j' внутри имени Trujillo и похожих контекстов. Это узкая фича на конкретный токен/имя собственное, хорошо интерпретируемая, но очень специфичная.