# ARENA 1.2: Intro to Mechinterp

Alex Semendinger

October 2024


In [1]:
print("Code is executing...")

Code is executing...


## Setup

In [2]:
import os
import sys
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
import functools
from tqdm import tqdm
from IPython.display import display
from transformer_lens.hook_points import HookPoint
from transformer_lens import (
    utils,
    HookedTransformer,
    HookedTransformerConfig,
    FactoredMatrix,
    ActivationCache,
)
import circuitsvis as cv

# Make sure exercises are in the path
chapter = r"chapter1_transformer_interp"
exercises_dir = Path(f"{os.getcwd().split(chapter)[0]}/{chapter}/exercises").resolve()
section_dir = exercises_dir / "part2_intro_to_mech_interp"
if str(exercises_dir) not in sys.path: sys.path.append(str(exercises_dir))

from plotly_utils import imshow, hist, plot_comp_scores, plot_logit_attribution, plot_loss_difference
from part1_transformer_from_scratch.solutions import get_log_probs
import part2_intro_to_mech_interp.tests as tests

# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

device = t.device('mps' if t.backends.mps.is_available() else 'cuda' if t.cuda.is_available() else 'cpu')
print("using device: ", device)

MAIN = __name__ == "__main__"

using device:  mps


# 1: Intro to TransformerLens

## Loading and running models

In [3]:
gpt_small: HookedTransformer = HookedTransformer.from_pretrained("gpt2-small")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer


In [4]:
print("Facts about GPT-2 Small model")
print(" n_layers:", gpt_small.cfg.n_layers)
print(" n heads per layer: ", gpt_small.cfg.n_heads)
print(" maximum context window: ", gpt_small.cfg.n_ctx)

Facts about GPT-2 Small model
 n_layers: 12
 n heads per layer:  12
 maximum context window:  1024


In [5]:
gpt2_small = gpt_small
# model_description_text = '''## Loading Models

# HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

# For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!'''

model_description_text = "The the the the the the the the the"

loss = gpt2_small(model_description_text, return_type="loss")
print("Model loss:", loss)

Model loss: tensor(2.9376, device='mps:0')


In [6]:
print(gpt2_small.to_str_tokens("gpt2"))
print(gpt2_small.to_str_tokens(["gpt2", "gpt2"]))
print(gpt2_small.to_tokens("gpt2"))
print(gpt2_small.to_string([50256, 70, 457, 17]))

['<|endoftext|>', 'g', 'pt', '2']
[['<|endoftext|>', 'g', 'pt', '2'], ['<|endoftext|>', 'g', 'pt', '2']]
tensor([[50256,    70,   457,    17]], device='mps:0')
<|endoftext|>gpt2


### Exercise: how many tokens does the model guess correctly

In [7]:
logits: Tensor = gpt2_small(model_description_text, return_type="logits")
prediction = logits.argmax(dim=-1).squeeze()[:-1]
true_tokens = gpt2_small.to_tokens(model_description_text).squeeze()[1:]
n_correct = (true_tokens == prediction).sum()
print(f'Model answered {n_correct}/{len(true_tokens)} correctly')
print(f'Correct tokens: {gpt2_small.to_string(prediction[prediction == true_tokens])}')

Model answered 5/9 correctly
Correct tokens:  the the the the the


## Caching all activations

In [8]:
#gpt2_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
#gpt2_text = "this is a short text"
gpt2_text = "My entrepreneurship responsibilities are straightforward: to enthusiastically provide telecommunications and cryptocurrencies to disproportionately affected environmentalists in Charlottesville"

gpt2_tokens = gpt2_small.to_tokens(gpt2_text)
gpt2_logits, gpt2_cache = gpt2_small.run_with_cache(gpt2_tokens, remove_batch_dim=True)

#print(f'gpt2_cache contains the following keys: {gpt2_cache.keys()}')

In [9]:
# note multiple ways of accessing the same item
attn_patterns_layer_0 = gpt2_cache["pattern", 0]
also_attn_patterns_layer_0 = gpt2_cache["blocks.0.attn.hook_pattern"]
t.testing.assert_close(attn_patterns_layer_0, also_attn_patterns_layer_0)

### Exercise: verify activations

In [10]:
layer0_pattern_from_cache = gpt2_cache["pattern", 0]  # attention pattern

layer0_q = gpt2_cache["q", 0]  # size (n_tokens, n_heads, d_head), which is (33, 12, 64)
layer0_k = gpt2_cache["k", 0]
n_tokens = layer0_q.shape[0]

# compute scaled dot product, mask, softmax
layer0_attn_scores_from_q_and_k = einops.einsum(layer0_q, layer0_k, 'seqQ nhead dhead, seqK nhead dhead -> nhead seqQ seqK') / gpt2_small.cfg.d_head ** 0.5
layer0_attn_scores_from_q_and_k.masked_fill_(mask=t.triu(t.ones((n_tokens, n_tokens)), diagonal=1).to(device).bool(),
                                             value=float('-inf'))
layer0_pattern_from_q_and_k = layer0_attn_scores_from_q_and_k.softmax(dim=-1)

t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)
print("Tests passed!")

Tests passed!


## Visualizing Attention Heads

In [11]:
print(type(gpt2_cache))
attention_pattern = gpt2_cache["pattern", 0]
print(attention_pattern.shape)
gpt2_str_tokens = gpt2_small.to_str_tokens(gpt2_text)

print("Layer 0 Head Attn Patterns")
display(cv.attention.attention_patterns(
    tokens = gpt2_str_tokens,
    attention = attention_pattern,
    attention_head_names = [f'LOH{i}' for i in range(gpt2_small.cfg.n_heads)]
))

<class 'transformer_lens.ActivationCache.ActivationCache'>
torch.Size([12, 19, 19])
Layer 0 Head Attn Patterns


# 2: Finding Induction Heads

## Introducing toy attention-only model

In [12]:
cfg = HookedTransformerConfig(
    d_model = 768,
    d_head = 64,
    n_heads = 12,
    n_layers = 2,
    n_ctx = 2048,
    d_vocab = 50278,
    attention_dir = 'causal',
    attn_only = 'True',
    tokenizer_name = 'EleutherAI/gpt-neox-20b',
    seed = 398,
    use_attn_result = True,
    normalization_type = None,  # default would be 'LN', which is layernorm
    positional_embedding_type = 'shortformer' # positional embedding only used for q and k, not for v? apparently makes induction heads more likely?
)

In [13]:
from huggingface_hub import hf_hub_download

REPO_ID = "callummcdougall/attn_only_2L_half"
FILENAME = "attn_only_2L_half.pth"

weights_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

attn_only_2L_half.pth:   0%|          | 0.00/184M [00:00<?, ?B/s]

In [14]:
model = HookedTransformer(cfg)
pretrained_weights = t.load(weights_path, map_location=device, weights_only=True)
model.load_state_dict(pretrained_weights)

tokenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

<All keys matched successfully>

## Exercise: visualize attention patterns

In [15]:
#text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."
text = "hogrning. i like hogrning. we all enjoy hogrning. sometimes hogrning happens in spring. hogrning is fun."

logits, cache = model.run_with_cache(text, remove_batch_dim=True)
str_tokens = model.to_str_tokens(text)

for layer in range(model.cfg.n_layers):
    attention_pattern = cache["pattern", layer]

    print(f"Layer {layer} Head Attn Patterns")
    display(cv.attention.attention_patterns(
        tokens = str_tokens,
        attention = attention_pattern,
        attention_head_names = [f'Layer {layer}, Head {i}' for i in range(model.cfg.n_heads)]
    ))

Layer 0 Head Attn Patterns


Layer 1 Head Attn Patterns


In [16]:
logits= model(text, return_type="logits")
loss = model(text, return_type="loss")
prediction = logits.argmax(dim=-1).squeeze()[:-1]
true_tokens = model.to_tokens(text).squeeze()[1:]
n_correct = (true_tokens == prediction).sum()
print(f'Model answered {n_correct}/{len(true_tokens)} correctly')
print(f'Correct tokens: {model.to_str_tokens(prediction[prediction == true_tokens])}')

Model answered 10/31 correctly
Correct tokens: ['ogr', 'ning', 'ogr', 'ning', '.', 'ogr', 'ning', 'ogr', 'ning', '.']


## Exercise: write your own detectors

In [17]:
def current_attn_detector(cache: ActivationCache) -> list[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be current-token heads
    '''
    PROB_THRESHOLD = 0.3
    heads = []
    for layer in range(2):
        pattern = cache["pattern", layer]
        avg_current_token_probs = t.diagonal(pattern, dim1=1, dim2=2).mean(dim=-1)
        head_indices = t.nonzero(avg_current_token_probs > PROB_THRESHOLD)
        heads.extend([f'{layer}.{idx.item()}' for idx in head_indices])
        #heads = dict(zip(heads, (a.item() for a in avg_current_token_probs[head_indices])))
    return heads
    

def prev_attn_detector(cache: ActivationCache) -> list[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be prev-token heads
    '''
    PROB_THRESHOLD = 0.35
    heads = []
    for layer in range(model.cfg.n_layers):
        pattern = cache["pattern", layer]
        avg_prev_token_probs = t.diagonal(pattern, offset=-1, dim1=1, dim2=2).mean(dim=-1)
        head_indices = t.nonzero(avg_prev_token_probs > PROB_THRESHOLD)
        heads.extend([f'{layer}.{idx.item()}' for idx in head_indices])
    return heads

def first_attn_detector(cache: ActivationCache) -> list[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be first-token heads
    '''
    PROB_THRESHOLD = 0.7
    heads = []
    for layer in range(model.cfg.n_layers):
        pattern = cache["pattern", layer]
        avg_first_token_probs = pattern[:, :, 0].mean(dim=-1)
        head_indices = t.nonzero(avg_first_token_probs > PROB_THRESHOLD)
        heads.extend([f'{layer}.{idx.item()}' for idx in head_indices])
    return heads


print("Heads attending to current token  = ", ", ".join(current_attn_detector(cache)))
print("Heads attending to previous token = ", ", ".join(prev_attn_detector(cache)))
print("Heads attending to first token    = ", ", ".join(first_attn_detector(cache)))

Heads attending to current token  =  0.9, 0.11
Heads attending to previous token =  0.7
Heads attending to first token    =  0.3, 1.4


## Finding induction heads

In [18]:
# exercise: plot per-token loss on repeated sequence

def generate_repeated_tokens(
        model: HookedTransformer, seq_len: int, batch: int = 1
) -> Int[Tensor, "batch full_seq_len"]:
    """
    Generates a sequence of repeated random tokens.
    Output is rep_tokens: [batch, 1 + 2*seq_len]
    """
    d_vocab = model.cfg.d_vocab
    prefix = (t.ones(batch, 1) * model.tokenizer.bos_token_id).long()
    rand_tokens = t.randint(0, d_vocab, size=(batch, seq_len))
    rep_tokens = einops.repeat(rand_tokens, 'batch seq -> batch (2 seq)')
    rep_tokens = t.cat([prefix, rep_tokens], dim=1)
    return rep_tokens

print(model.to_str_tokens(generate_repeated_tokens(model, 5, 2)[1]))

def run_and_cache_model_repeated_tokens(
        model: HookedTransformer, seq_len: int, batch: int = 1
) -> tuple[Tensor, Tensor, ActivationCache]:
    """
    Generates a sequence of repeated random tokens, and runs the model on it, returning logits, tokens and cache

    Should use the `generate_repeated_tokens` function above

    Outputs are:
        rep_tokens: [batch, 1+2*seq_len]
        rep_logits: [batch, 1+2*seq_len, d_vocab]
        rep_cache: The cache of the model run on rep_tokens
    """
    tokens = generate_repeated_tokens(model, seq_len, batch).to(device)
    logits, cache = model.run_with_cache(tokens, return_type='logits')
    return tokens, logits, cache
    
seq_len = 50
batch = 1
(rep_tokens, rep_logits, rep_cache) = run_and_cache_model_repeated_tokens(model, seq_len, batch)
rep_cache.remove_batch_dim()
rep_str = model.to_str_tokens(rep_tokens)
model.reset_hooks()
log_probs = get_log_probs(rep_logits, rep_tokens).squeeze()

print(f"Performance on the first half: {log_probs[:seq_len].mean():.3f}")
print(f"Performance on the second half: {log_probs[seq_len:].mean():.3f}")

plot_loss_difference(log_probs, rep_str, seq_len)

['<|endoftext|>', ' neat', ' post', ' 99', 'cancer', 'church', ' neat', ' post', ' 99', 'cancer', 'church']
Performance on the first half: -13.876
Performance on the second half: -3.543


In [19]:
# visualize attention heads
for layer in range(model.cfg.n_layers):
    attention_pattern = rep_cache["pattern", layer]

    print(f"Layer {layer} Head Attn Patterns")
    display(cv.attention.attention_patterns(
        tokens = rep_str,
        attention = attention_pattern,
        attention_head_names = [f'Layer {layer}, Head {i}' for i in range(model.cfg.n_heads)]
    ))

Layer 0 Head Attn Patterns


Layer 1 Head Attn Patterns


In [20]:
def induction_attn_detector(cache: ActivationCache) -> list[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be induction heads
    '''
    PROB_THRESHOLD = 0.5
    heads = []

    seq_len = (cache["pattern", 0].shape[-1] - 1) // 2

    for layer in range(model.cfg.n_layers):
        pattern = cache["pattern", layer]
        avg_prev_token_probs = t.diagonal(pattern, offset=-seq_len+1, dim1=1, dim2=2).mean(dim=-1)
        head_indices = t.nonzero(avg_prev_token_probs > PROB_THRESHOLD)
        heads.extend([f'{layer}.{idx.item()}' for idx in head_indices])
    return heads

print("Induction heads = ", ", ".join(induction_attn_detector(rep_cache)))

Induction heads =  1.4, 1.10


# 3: Hooks in TransformerLens

Hooks let us *edit* and *intervene on* activations anywhere in the model.

TransformerLens includes a `model.run_with_hooks` function that lets you specify **hook functions** to use on activations.

A hook function takes two arguments: `activation_value` (the activation in the model, like what's stored in an `ActivationCache`) and `hook_point` (an object that gives us methods like `hook.layer()` and `hook.name` -- I guess we'll see this later). It should either return nothing (if we're e.g. just looking at the value and doing some processing) or should return a tensor of the same size as the input (if we're editing the value).

```
# Template for a hook function (this one is currently a null-op)

def hook_function(
        attn_pattern: Float[Tensor, "batch heads seqQ seqK"],
        hook: HookPoint
) -> Float[Tensor, "batch heads seqQ seqK"]:
    
    # modify attention pattern (can be in-place)
    return attn_pattern


# Template for calling `model.run_with_hooks`
loss = model.run_with_hooks(
    tokens,
    return_type = "loss",
    fwd_hooks = [
        ('blocks.1.attn.hook_pattern', hook_function)
    ]
)
```

In [21]:
# Calculate induction scores with hooks
seq_len = 50
batch = 10
rep_tokens_10 = generate_repeated_tokens(model, seq_len, batch)

# Store the induction score for each head.
induction_score_store = t.zeros((model.cfg.n_layers, model.cfg.n_heads), device=model.cfg.device)

def induction_score_hook(
    pattern: Float[Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    seq_len = (pattern.shape[-1] - 1) // 2
    induction_scores = einops.reduce(t.diagonal(pattern, offset=-seq_len+1, dim1=-2, dim2=-1), "batch head pos -> head", "mean")
    induction_score_store[hook.layer()] = induction_scores

pattern_hook_names_filter = lambda name: name.endswith("pattern")

# Run with hooks (this is where we write to the `induction_score_store` tensor`)
model.run_with_hooks(
    rep_tokens_10, 
    return_type=None, # For efficiency, we don't need to calculate the logits
    fwd_hooks=[(
        pattern_hook_names_filter,
        induction_score_hook
    )]
)

# Plot the induction scores for each head in each layer
imshow(
    induction_score_store, 
    labels={"x": "Head", "y": "Layer"}, 
    title="Induction Score by Head", 
    text_auto=".2f",
    width=900, height=400
)

## Exercise: finding induction heads in GPT2-small

In [22]:
def visualize_pattern_hook(
    pattern: Float[Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    print("Layer: ", hook.layer())
    display(
        cv.attention.attention_patterns(
            tokens=gpt2_small.to_str_tokens(rep_tokens[0]), 
            attention=pattern.mean(0)
        )
    )

In [23]:
# Visualize induction score of each head in gpt2_small

seq_len = 50
batch = 10
rep_tokens_10 = generate_repeated_tokens(gpt2_small, seq_len, batch)

induction_score_store = t.zeros((gpt2_small.cfg.n_layers, gpt2_small.cfg.n_heads), device=gpt2_small.cfg.device)

gpt2_small.run_with_hooks(
    rep_tokens_10,
    return_type = None,
    fwd_hooks = [(pattern_hook_names_filter, induction_score_hook)]
)

# Plot the induction scores for each head in each layer
imshow(
    induction_score_store, 
    labels={"x": "Head", "y": "Layer"}, 
    title="Induction Score by Head in gpt2_small", 
    text_auto=".2f",
    range_color=(-1,1),
    width=500, height=500
)

### My quick experiment: ablate positional encoding to destroy induction heads

In [24]:
def zero_ablate_hook(pattern, hook):
    return t.zeros_like(pattern)

pos_embed_filter = lambda name: "pos_embed" in name

gpt2_small.run_with_hooks(
    rep_tokens_10,
    return_type=None,
    fwd_hooks = [
        (pattern_hook_names_filter, induction_score_hook),
        (pos_embed_filter, zero_ablate_hook)
    ]
)

imshow(
    induction_score_store, 
    labels={"x": "Head", "y": "Layer"}, 
    title="Induction Scores after zero-ablating positional encoding",
    range_color=(-1,1),
    text_auto=".2f",
    width=500, height=500
)

In [25]:
# # Look at attention patterns of heads with high induction scores

# # hacky not great way to do this filter
# suspected_induction_heads_filter = lambda name: name.endswith("pattern") and name[7] in {"5", "6", "7"}

# gpt2_small.run_with_hooks(
#     rep_tokens_10,
#     return_type = None,
#     fwd_hooks = [(suspected_induction_heads_filter, visualize_pattern_hook)]
# )

## Building interpretability tools

In [26]:
# logit attribution tool

def logit_attribution(
        embed: Float[Tensor, "seq d_model"],
        l1_results: Float[Tensor, "seq nheads d_model"],
        l2_results: Float[Tensor, "seq nheads d_model"],
        W_U: Float[Tensor, "d_model d_vocab"],
        tokens: Int[Tensor, "seq-1 n_components"]
) -> Float[Tensor, "seq-1 n_components"]:
    '''
    Inputs:
        embed: the embeddings of the tokens (i.e. token + position embeddings)
        l1_results: the outputs of the attention heads at layer 1 (with head as one of the dimensions)
        l2_results: the outputs of the attention heads at layer 2 (with head as one of the dimensions)
        W_U: the unembedding matrix
        tokens: the token ids of the sequence

    Returns:
        Tensor of shape (seq_len-1, n_components)
        represents the concatenation (along dim=-1) of logit attributions from:
            the direct path (seq-1,1)
            layer 0 logits (seq-1, n_heads)
            layer 1 logits (seq-1, n_heads)
        so n_components = 1 + 2*n_heads
    '''
    W_U_correct_tokens = W_U[:, tokens[1:]]  # (d_model, seq-1, n_components)
    direct = einops.einsum(embed[:-1], W_U_correct_tokens, "seq d_model, d_model seq -> seq").unsqueeze(-1)
    layer0 = einops.einsum(l1_results[:-1], W_U_correct_tokens, "seq nheads d_model, d_model seq -> seq nheads")
    layer1 = einops.einsum(l2_results[:-1], W_U_correct_tokens, "seq nheads d_model, d_model seq -> seq nheads")
    out = t.cat([direct, layer0, layer1], dim=-1)
    return out


#text = "this is a very short text"
text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."
logits, cache = model.run_with_cache(text, remove_batch_dim=True)
str_tokens = model.to_str_tokens(text)
tokens = model.to_tokens(text)

with t.inference_mode():
    embed = cache["embed"]
    l1_results = cache["result", 0]
    l2_results = cache["result", 1]
    logit_attr = logit_attribution(embed, l1_results, l2_results, model.W_U, tokens[0])
    # Uses fancy indexing to get a len(tokens[0])-1 length tensor, where the kth entry is the predicted logit for the correct k+1th token
    correct_token_logits = logits[0, t.arange(len(tokens[0]) - 1), tokens[0, 1:]]
    t.testing.assert_close(logit_attr.sum(1), correct_token_logits, atol=1e-3, rtol=0)
    print("Tests passed!")

Tests passed!


In [27]:
embed = cache["embed"]
l1_results = cache["result", 0]
l2_results = cache["result", 1]
logit_attr = logit_attribution(embed, l1_results, l2_results, model.W_U, tokens[0])

plot_logit_attribution(model, logit_attr, tokens)

In [28]:
seq_len = 50

embed = rep_cache["embed"]
l1_results = rep_cache["result", 0]
l2_results = rep_cache["result", 1]
first_half_tokens = rep_tokens[0, : 1 + seq_len]
second_half_tokens = rep_tokens[0, seq_len : ]

first_half_logit_attr = logit_attribution(embed[: seq_len + 1], l1_results[: seq_len + 1], l2_results[: seq_len + 1], model.W_U, first_half_tokens)
second_half_logit_attr = logit_attribution(embed[seq_len : ], l1_results[seq_len : ], l2_results[seq_len : ], model.W_U, second_half_tokens)
assert first_half_logit_attr.shape == (seq_len, 2 * model.cfg.n_heads + 1)
assert second_half_logit_attr.shape == (seq_len, 2 * model.cfg.n_heads + 1)

plot_logit_attribution(model, first_half_logit_attr, first_half_tokens, "Logit attr (first half)")
plot_logit_attribution(model, second_half_logit_attr, second_half_tokens, "Logit attr (second half)")

## Hooks: Intervening on Activations

In [29]:
def head_ablation_hook(
    v: Float[Tensor, "batch seq n_heads d_head"],
    hook: HookPoint,
    head_index_to_ablate: int
) -> Float[Tensor, "batch seq n_heads d_head"]:
    v[:, :, head_index_to_ablate, :] = 0.
    return v


def cross_entropy_loss(logits, tokens):
    '''
    Computes the mean cross entropy between logits (the model's prediction) and tokens (the true values).

    (optional, you can just use return_type="loss" instead.)
    '''
    log_probs = F.log_softmax(logits, dim=-1)
    pred_log_probs = t.gather(log_probs[:, :-1], -1, tokens[:, 1:, None])[..., 0]
    return -pred_log_probs.mean()


def get_ablation_scores(
    model: HookedTransformer, 
    tokens: Int[Tensor, "batch seq"]
) -> Float[Tensor, "n_layers n_heads"]:
    '''
    Returns a tensor of shape (n_layers, n_heads) containing the increase in cross entropy loss from ablating the output of each head.
    '''
    # Initialize an object to store the ablation scores
    ablation_scores = t.zeros((model.cfg.n_layers, model.cfg.n_heads), device=model.cfg.device)

    # Calculating loss without any ablation, to act as a baseline
    model.reset_hooks()
    logits = model(tokens, return_type="logits")
    seq_len = (tokens.shape[1] - 1) // 2
    loss_no_ablation = cross_entropy_loss(logits[:, -seq_len:], tokens[:, -seq_len:])

    for layer in tqdm(range(model.cfg.n_layers)):
        for head in range(model.cfg.n_heads):
            # Use functools.partial to create a temporary hook function with the head number fixed
            temp_hook_fn = functools.partial(head_ablation_hook, head_index_to_ablate=head)
            # Run the model with the ablation hook
            ablated_logits = model.run_with_hooks(tokens, fwd_hooks=[
                (utils.get_act_name("v", layer), temp_hook_fn)
            ])
            # Calculate the loss difference
            loss = cross_entropy_loss(ablated_logits[:, -seq_len:], tokens[:, -seq_len:])
            # Store the result, subtracting the clean loss so that a value of zero means no change in loss
            ablation_scores[layer, head] = loss - loss_no_ablation

    return ablation_scores


ablation_scores = get_ablation_scores(model, rep_tokens)
tests.test_get_ablation_scores(ablation_scores, model, rep_tokens)

imshow(
    ablation_scores, 
    labels={"x": "Head", "y": "Layer", "color": "Loss diff"},
    title="Loss Difference After Ablating Heads", 
    text_auto=".2f",
    width=900, height=400
)

100%|██████████| 2/2 [00:00<00:00,  7.09it/s]
100%|██████████| 2/2 [00:00<00:00,  6.54it/s]


All tests in `test_get_ablation_scores` passed!


## Bonus: different ablations

### Ablating everything except induction capability

In [30]:
def everything_but_prev_token_and_induction(name):
    if utils.get_act_name("v") in name:
        if '0' in name:
            for head in range(model.cfg.n_heads):
                if head != 7:
                    return True
        if '1' in name:
            for head in range(model.cfg.n_heads):
                if head not in {4, 10}:
                    return True

very_ablated_logits = model.run_with_hooks(tokens, fwd_hooks=[(everything_but_prev_token_and_induction, zero_ablate_hook)])

In [31]:
rep_cache.keys()

dict_keys(['hook_embed', 'hook_pos_embed', 'blocks.0.hook_resid_pre', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.attn.hook_result', 'blocks.0.hook_attn_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.attn.hook_result', 'blocks.1.hook_attn_out', 'blocks.1.hook_resid_post'])

In [32]:
utils.get_act_name("v")

'attn.hook_v'

# 4: Reverse-Engineering induction circuits

Returning to this on October 7, 2024. Finished section 3 on September 11.

## Refresher on induction circuits

### Questions: What is the interpretation of each of the following matrices?

Describe (a) the *type of input* it takes, and (b) what the *output represents*.

1. $W^h_{OV}$

    a. Size: $d_{\text{model}} \times d_{\text{model}}$

    b. Input: $x$ in residual stream

    b. Output: $x^\top W^h_{OV}$ is the vector written to the residual stream at the **destination position**, if the destination token only pays attention to the source token at the position of $x$.

2. $W_E W^h_{OV} W_U$

    a. Size: $d_{voc} \times d_{voc}$

    b. Input: token $t$

    c. Output: logit $t^\top W_E W_{OV} W_U$. This is how much the token $t$ is upweighted at the destination position?

3. $W_{QK}^h$

    a. Size:  $d_{\text{model}} \times d_{\text{model}}$

    b. Input: $x$

    c. Output: $x_i ^\top W_Q W_K^\top x_j$ is the attention score paid by token $i$ to token $j$

4. $W_E W_{QK} W_E^\top$ (and similarly $W_{\text{pos}}$)

    a. size: $d_{voc} \times d_{voc}$ (or $n_{ctx} \times n_{ctx}$)

    b. input: token

    c. output: $s^\top [ - ] t$ is how much attention $t$ pays to $s$ -- tokens if $W_E$, positions in context if $W_{pos}$.

5. $W_E W_{OV}^{h_1} W_{QK}^{h_2} W_E^\top$

    a. size: $d_{voc} \times d_{voc}$

    b. input: token

    c. output: $s^\top [ - ] t$ 

#### TODO: internalize what the heck this last one is doing, hold the whole thing in my head**

## Setup for section 4, so you don't need to run all of the others

In [33]:
cfg = HookedTransformerConfig(
    d_model = 768,
    d_head = 64,
    n_heads = 12,
    n_layers = 2,
    n_ctx = 2048,
    d_vocab = 50278,
    attention_dir = 'causal',
    attn_only = 'True',
    tokenizer_name = 'EleutherAI/gpt-neox-20b',
    seed = 398,
    use_attn_result = True,
    normalization_type = None,  # default would be 'LN', which is layernorm
    positional_embedding_type = 'shortformer' # positional embedding only used for q and k, not for v? apparently makes induction heads more likely?
)

from huggingface_hub import hf_hub_download

REPO_ID = "callummcdougall/attn_only_2L_half"
FILENAME = "attn_only_2L_half.pth"

weights_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

model = HookedTransformer(cfg)
pretrained_weights = t.load(weights_path, map_location=device, weights_only=True)
model.load_state_dict(pretrained_weights)

<All keys matched successfully>

In [34]:
def generate_repeated_tokens(
        model: HookedTransformer, seq_len: int, batch: int = 1
) -> Int[Tensor, "batch full_seq_len"]:
    """
    Generates a sequence of repeated random tokens.
    Output is rep_tokens: [batch, 1 + 2*seq_len]
    """
    d_vocab = model.cfg.d_vocab
    prefix = (t.ones(batch, 1) * model.tokenizer.bos_token_id).long()
    rand_tokens = t.randint(0, d_vocab, size=(batch, seq_len))
    rep_tokens = einops.repeat(rand_tokens, 'batch seq -> batch (2 seq)')
    rep_tokens = t.cat([prefix, rep_tokens], dim=1)
    return rep_tokens

def run_and_cache_model_repeated_tokens(
        model: HookedTransformer, seq_len: int, batch: int = 1
) -> tuple[Tensor, Tensor, ActivationCache]:
    """
    Generates a sequence of repeated random tokens, and runs the model on it, returning logits, tokens and cache

    Should use the `generate_repeated_tokens` function above

    Outputs are:
        rep_tokens: [batch, 1+2*seq_len]
        rep_logits: [batch, 1+2*seq_len, d_vocab]
        rep_cache: The cache of the model run on rep_tokens
    """
    tokens = generate_repeated_tokens(model, seq_len, batch).to(device)
    logits, cache = model.run_with_cache(tokens, return_type='logits')
    return tokens, logits, cache
    
seq_len = 50
batch = 1
(rep_tokens, rep_logits, rep_cache) = run_and_cache_model_repeated_tokens(model, seq_len, batch)
rep_cache.remove_batch_dim()
rep_str = model.to_str_tokens(rep_tokens)
model.reset_hooks()

## OV copying circuit

### Exercise: compute OV circuit for `1.4`

In [35]:
layer = 1
head_index = 4

full_OV_circuit = FactoredMatrix(model.W_E @ model.W_V[layer, head_index] , model.W_O[layer, head_index] @ model.W_U )
tests.test_full_OV_circuit(full_OV_circuit, model, layer, head_index)

indices = t.randint(full_OV_circuit.A.shape[0], size=(200,))
full_OV_circuit_sample = full_OV_circuit[indices, indices].AB

imshow(
    full_OV_circuit_sample,
    labels={"x": "Input token", "y": "Logits on output token"},
    title="Full OV circuit for copying head",
    height=800, width=700,
)

All tests in `test_full_OV_circuit` passed!


In [36]:
def top_1_acc(full_OV_circuit: FactoredMatrix, batch_size: int = 1000) -> float:
    '''
    This should take the argmax of each column (ie over dim=0) and return the fraction of the time that's equal to the correct logit
    '''
    total = 0
    for indices in t.split(t.arange(full_OV_circuit.shape[0]), batch_size):
        AB_slice = full_OV_circuit[indices].AB
        total += (t.argmax(AB_slice, dim=1) == indices.to(device)).float().sum().item()
    return total / full_OV_circuit.shape[0]

print(f"Fraction of the time that the best logit is on the diagonal: {top_1_acc(full_OV_circuit):.4f}")

Fraction of the time that the best logit is on the diagonal: 0.3079


In [37]:
cat_W_V = t.cat([model.W_V[1, 4], model.W_V[1, 10]], dim=1)
cat_W_O = t.cat([model.W_O[1, 4], model.W_O[1, 10]], dim=0)

effective_OV_circuit = FactoredMatrix(model.W_E @ cat_W_V , cat_W_O @ model.W_U)

print(f"Fraction of the time that the best logit is on the diagonal: {top_1_acc(effective_OV_circuit):.4f}")

Fraction of the time that the best logit is on the diagonal: 0.9556


## QK prev-token circuit

In [38]:
def mask_scores(attn_scores: Float[Tensor, "query_nctx key_nctx"]):
    '''Mask the attention scores so that tokens don't attend to previous tokens'''
    assert attn_scores.shape == (model.cfg.n_ctx, model.cfg.n_ctx)
    mask = t.tril(t.ones_like(attn_scores)).bool()
    neg_inf = t.tensor(-1.0e6).to(attn_scores.device)
    masked_attn_scores = t.where(mask, attn_scores, neg_inf)
    return masked_attn_scores


layer = 0
head = 7

W_pos = model.W_pos
W_QK = model.W_Q[layer, head_index] @ model.W_K[layer, head_index].T
pos_by_pos_scores = W_pos @ W_QK @ W_pos.T
masked_scaled = mask_scores(pos_by_pos_scores / model.cfg.d_head ** 0.5)
pos_by_pos_pattern = t.softmax(masked_scaled, dim=-1)
tests.test_pos_by_pos_pattern(pos_by_pos_pattern, model, layer, head_index)

All tests in `test_full_OV_circuit` passed!


In [39]:
print(f"Avg first subdiagonal diagonal value: {pos_by_pos_pattern.diag(-1).mean():.4f}")

imshow(
    utils.to_numpy(pos_by_pos_pattern[:100, :100]), 
    labels={"x": "Key", "y": "Query"}, 
    title="Attention patterns for prev-token QK circuit, first 100 indices",
    width=700
)

Avg first subdiagonal diagonal value: 0.2197


## K-composition circuit

In [40]:
for key in rep_cache.keys():
    print(list(rep_cache[key].shape),"\t", key)

[101, 768] 	 hook_embed
[101, 768] 	 hook_pos_embed
[101, 768] 	 blocks.0.hook_resid_pre
[101, 12, 64] 	 blocks.0.attn.hook_q
[101, 12, 64] 	 blocks.0.attn.hook_k
[101, 12, 64] 	 blocks.0.attn.hook_v
[12, 101, 101] 	 blocks.0.attn.hook_attn_scores
[12, 101, 101] 	 blocks.0.attn.hook_pattern
[101, 12, 64] 	 blocks.0.attn.hook_z
[101, 12, 768] 	 blocks.0.attn.hook_result
[101, 768] 	 blocks.0.hook_attn_out
[101, 768] 	 blocks.0.hook_resid_post
[101, 768] 	 blocks.1.hook_resid_pre
[101, 12, 64] 	 blocks.1.attn.hook_q
[101, 12, 64] 	 blocks.1.attn.hook_k
[101, 12, 64] 	 blocks.1.attn.hook_v
[12, 101, 101] 	 blocks.1.attn.hook_attn_scores
[12, 101, 101] 	 blocks.1.attn.hook_pattern
[101, 12, 64] 	 blocks.1.attn.hook_z
[101, 12, 768] 	 blocks.1.attn.hook_result
[101, 768] 	 blocks.1.hook_attn_out
[101, 768] 	 blocks.1.hook_resid_post


In [42]:
def decompose_qk_input(cache: ActivationCache) -> t.Tensor:
    '''
    Output is decomposed_qk_input, with shape [2+num_heads, seq, d_model]
                                (in this case [14, 101, 768])
    The [i, :, :]th element is y_i (from notation above)
    '''
    embed = cache["embed"]           # (seq, d_model)
    pos_embed = cache["pos_embed"]   # (seq, d_model)
    l0_results = cache["result", 0]  # (seq, heads, d_model)

    embed = einops.rearrange(embed, "seq dmodel -> seq 1 dmodel")
    pos_embed = einops.rearrange(pos_embed, "seq dmodel -> seq 1 dmodel")
    decomposed_qk_input = t.cat([embed, pos_embed, l0_results], dim=1)
                         
    return einops.rearrange(decomposed_qk_input, "seq heads dmodel -> heads seq dmodel")
    

def decompose_q(
    decomposed_qk_input: Float[Tensor, "n_heads+2 posn d_head"],
    ind_head_index: int,
    model: HookedTransformer,
) -> Tensor:
    '''
    Output is decomposed_q with shape [2+num_heads, position, d_head]

    The [i, :, :]th element is y_i @ W_Q (so the sum along axis 0 is just the q-values)
    '''
    W_Q = model.W_Q[1, ind_head_index]
    return einops.einsum(decomposed_qk_input, W_Q, "head seq dmodel, dmodel dhead -> head seq dhead")


def decompose_k(
    decomposed_qk_input: Float[Tensor, "n_heads+2 posn d_head"],
    ind_head_index: int,
    model: HookedTransformer,
) -> Tensor:
    '''
    Output is decomposed_k with shape [2+num_heads, position, d_head]

    The [i, :, :]th element is y_i @ W_K (so the sum along axis 0 is just the k-values)
    '''
    W_K = model.W_K[1, ind_head_index]
    return einops.einsum(decomposed_qk_input, W_K, "head seq dmodel, dmodel dhead -> head seq dhead")

ind_head_index = 4
# First we get decomposed q and k input, and check they're what we expect
decomposed_qk_input = decompose_qk_input(rep_cache)
decomposed_q = decompose_q(decomposed_qk_input, ind_head_index, model)
decomposed_k = decompose_k(decomposed_qk_input, ind_head_index, model)
t.testing.assert_close(decomposed_qk_input.sum(0), rep_cache["resid_pre", 1] + rep_cache["pos_embed"], rtol=0.01, atol=1e-05)
t.testing.assert_close(decomposed_q.sum(0), rep_cache["q", 1][:, ind_head_index], rtol=0.01, atol=0.001)
t.testing.assert_close(decomposed_k.sum(0), rep_cache["k", 1][:, ind_head_index], rtol=0.01, atol=0.01)
# Second, we plot our results
component_labels = ["Embed", "PosEmbed"] + [f"0.{h}" for h in range(model.cfg.n_heads)]
for decomposed_input, name in [(decomposed_q, "query"), (decomposed_k, "key")]:
    imshow(
        utils.to_numpy(decomposed_input.pow(2).sum([-1])), 
        labels={"x": "Position", "y": "Component"},
        title=f"Norms of components of {name}", 
        y=component_labels,
        width=1000, height=400
    )

In [43]:
def decompose_attn_scores(decomposed_q: t.Tensor, decomposed_k: t.Tensor) -> t.Tensor:
    '''
    Output is decomposed_scores with shape [query_component, key_component, query_pos, key_pos]

    The [i, j, :, :]th element is y_i @ W_QK @ y_j^T (so the sum along both first axes are the attention scores)
    '''
    d_head = 12
    attn = einops.einsum(decomposed_q, decomposed_k, "nQ seqQ dhead, nK seqK dhead -> nQ nK seqQ seqK")
    return attn #/ d_head ** 0.5

tests.test_decompose_attn_scores(decompose_attn_scores, decomposed_q, decomposed_k)

All tests in `test_decompose_attn_scores` passed!


In [44]:
decomposed_scores = decompose_attn_scores(decomposed_q, decomposed_k)
decomposed_stds = einops.reduce(
    decomposed_scores, 
    "query_decomp key_decomp query_pos key_pos -> query_decomp key_decomp", 
    t.std
)

# First plot: attention score contribution from (query_component, key_component) = (Embed, L0H7)
imshow(
    utils.to_numpy(t.tril(decomposed_scores[0, 9])), 
    title="Attention score contributions from (query, key) = (embed, output of L0H7)",
    width=800
)

# Second plot: std dev over query and key positions, shown by component
imshow(
    utils.to_numpy(decomposed_stds), 
    labels={"x": "Key Component", "y": "Query Component"},
    title="Standard deviations of attention score contributions (by key and query component)", 
    x=component_labels, 
    y=component_labels,
    width=800
)

### Find full K-composition circuit

In [45]:
def find_K_comp_full_circuit(
    model: HookedTransformer,
    prev_token_head_index: int,
    ind_head_index: int
) -> FactoredMatrix:
    '''
    Returns a (vocab, vocab)-size FactoredMatrix, with the first dimension being the query side 
    and the second dimension being the key side (going via the previous token head)
    '''
    W_E = model.W_E
    W_QK = FactoredMatrix(model.W_Q[1, ind_head_index], model.W_K[1, ind_head_index].T)
    W_OV = FactoredMatrix(model.W_V[0, prev_token_head_index], model.W_O[0, prev_token_head_index])
    return W_E @ W_QK @ W_OV.T @ W_E.T


prev_token_head_index = 7
ind_head_index = 4
K_comp_circuit = find_K_comp_full_circuit(model, prev_token_head_index, ind_head_index)

tests.test_find_K_comp_full_circuit(find_K_comp_full_circuit, model)

print(f"Fraction of tokens where the highest activating key is the same token: {top_1_acc(K_comp_circuit.T):.4f}")

All tests in `test_find_K_comp_full_circuit` passed!
Fraction of tokens where the highest activating key is the same token: 0.5201


## Further exploration of induction circuits

### Composition scores

In [48]:
def get_comp_score(
        W_A: Float[Tensor, "inA outA"],
        W_B: Float[Tensor, "outA outB"]
) -> float:
    """Return the composition score between W_A and W_B"""
    product = W_A @ W_B
    score = t.norm(product) / (t.norm(W_A) * t.norm(W_B))
    return score.item()

tests.test_get_comp_score(get_comp_score)

All tests in `test_get_comp_score` passed!


In [55]:
# Get all QK and OV matrices
W_QK = model.W_Q @ model.W_K.transpose(-1, -2)
W_OV = model.W_V @ model.W_O

# define tensors to hold composition scores
composition_scores = {
    "Q": t.zeros(model.cfg.n_heads, model.cfg.n_heads).to(device),
    "K": t.zeros(model.cfg.n_heads, model.cfg.n_heads).to(device),
    "V": t.zeros(model.cfg.n_heads, model.cfg.n_heads).to(device),
}

for headA in range(model.cfg.n_heads):
    for headB in range(model.cfg.n_heads):
        W_A = W_OV[0, headA]
        composition_scores["Q"][headA, headB] = get_comp_score(W_A, W_QK[1, headB])
        composition_scores["K"][headA, headB] = get_comp_score(W_A, W_QK[1, headB].transpose(-2, -1))
        composition_scores["V"][headA, headB] = get_comp_score(W_A, W_OV[1, headB])

for comp_type in "QKV":
    plot_comp_scores(model, composition_scores[comp_type], f"{comp_type} Composition Scores")

In [64]:
def generate_single_random_comp_score() -> float:
    W = t.empty(4, model.cfg.d_head, model.cfg.d_model)
    nn.init.kaiming_uniform_(W, a=np.sqrt(5))
    return get_comp_score(W[0].T @ W[1], W[2].T @ W[3])

n_samples = 300
comp_scores_baseline = np.zeros(n_samples)
for i in tqdm(range(n_samples)):
    comp_scores_baseline[i] = generate_single_random_comp_score()
print("\nMean:", comp_scores_baseline.mean())
print("Std:", comp_scores_baseline.std())
hist(
    comp_scores_baseline, 
    nbins=50, 
    width=800, 
    labels={"x": "Composition score"}, 
    title="Random composition scores"
)

100%|██████████| 300/300 [00:00<00:00, 365.62it/s]


Mean: 0.036093491601447265
Std: 0.0004245088967912471





In [65]:
baseline = comp_scores_baseline.mean()
for comp_type, comp_scores in composition_scores.items():
    plot_comp_scores(model, comp_scores, f"{comp_type} Composition Scores", baseline=baseline)

# Further exercises

## My idea: does this 2-layer transformer do "fuzzy induction"?

Fuzzy induction is an idea from "In-Context Learning and Induction Heads" where you get `[A] [B] ... [A*] [B*]`, where `*` denotes some kind of linguistic similarity.

Experiment:
1. Assemble a collection of synonym or near-synonym pairs -- ideally these are all words that are a single token, for the cleanest version
2. Create a random sequence of words, followed by a "repeated sequence" of their synonyms.
3. Run all of the above induction head experiments and see what happens!

To do step 1, maybe feed the entire list of tokens into an LLM and ask it for pairs that fit the bill?

Or pre-process by going through the token list and taking only dictionary words first

#### Creating word list

In [97]:
# get tokens, remove initial spaces and then remove duplicates
all_tokens = model.tokenizer.convert_ids_to_tokens(range(model.cfg.d_vocab))
for i, token in enumerate(all_tokens):
    if token and token[0] == "Ġ":
        all_tokens[i] = token[1:]
all_tokens_set = set(all_tokens)  # don't need this as a list currently

# load word list
with open('/Users/Alex/Desktop/large.txt', 'r') as f:
    word_set = set(f.read().splitlines())
print(f"loaded list of {len(word_set)} English words")

# take intersection
word_tokens_set = all_tokens_set.intersection(word_set)
word_tokens = list(word_tokens_set)
print(f"List of {len(word_tokens)} word tokens created.")

# # write to file (don't need to do this again)
# with open('/Users/Alex/Desktop/word_tokens.txt', 'w') as f:
#     f.write("\n".join(word_tokens))

loaded list of 181107 English words
List of 16164 word tokens created.


#### Turning Claude output into synonym pairs

In [99]:
# Asked Claude 3.5 Sonnet to generate synonym pairs
with open('/Users/Alex/Desktop/synonym_pairs.txt') as f:
    synonym_pair_strings = f.read().splitlines()
print(synonym_pair_strings[:10])

['accidental, unintended', 'accurate, precise', 'add, include', 'advance, progress', 'afraid, scared', 'allow, permit', 'already, previously', 'also, too', 'always, constantly', 'ancient, old']


In [104]:
synonym_pairs = []
for word_pair in synonym_pair_strings:
    word1, word2 = word_pair.split(',')
    word2 = word2[1:]  # remove leading space
    if word1 in word_tokens_set and word2 in word_tokens_set:
        synonym_pairs.append( (word1, word2) )
print(f"List of {len(synonym_pairs)} pairs, starting with: ", synonym_pairs[:10])

List of 848 pairs, starting with:  [('accidental', 'unintended'), ('accurate', 'precise'), ('add', 'include'), ('advance', 'progress'), ('afraid', 'scared'), ('allow', 'permit'), ('already', 'previously'), ('also', 'too'), ('always', 'constantly'), ('ancient', 'old')]
