# Intro to how transformer lens works

This notebook shows how to access and modify internal model activations using the transformer lens library (version transformer_lens-1.8.1)

In [1]:
import torch

from transformer_lens import HookedTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [33]:
# load model
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(model_name).to(device, dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Access hidden activations

We can access hidden activations of the residual streem by passing the `output_hidden_states` keyword. But if we want other hidden states we need some way to hook into the other transformer layers.  

In [34]:
test_sentence = "The quick brown fox jumps over the lazy dog"

model.eval()
with torch.no_grad():
    inputs = tokenizer(test_sentence, return_tensors="pt").to(device)
    output = model(**inputs, output_hidden_states=True)
    print(output.keys())

odict_keys(['logits', 'past_key_values', 'hidden_states'])


In [35]:
print(f"len hidden states: {len(output.hidden_states)}")
print(f"shape of hidden states: {output.hidden_states[0].shape}")

len hidden states: 33
shape of hidden states: torch.Size([1, 12, 4096])


In [36]:
# the residual layer output of layer layer_id is
# output.hidden_states[layer_id + 1] as the first hidden state is the input embedding
layer_id = 5
hidden_states = output.hidden_states[layer_id + 1]

In [37]:
# create a hooked transformer
model = HookedTransformer.from_pretrained(model_name, hf_model=model.to("cpu"), device="cpu", fold_ln=False, center_writing_weights=False, center_unembed=False, tokenizer=tokenizer).to(device, torch.float16)

Loaded pretrained model meta-llama/Llama-2-7b-chat-hf into HookedTransformer
Moving model to device:  cuda


In [38]:
model.eval()
with torch.no_grad():
    inputs = tokenizer(test_sentence, return_tensors="pt").to(device)
    _, cache = model.run_with_cache(inputs["input_ids"])

In [39]:
# define what layer/module you want information from
cache_name = f"blocks.{layer_id}.hook_resid_post"
hidden_states_tl = cache[cache_name]

# mse between the two hidden states
print(f"MSE hidden states: {(hidden_states - hidden_states_tl).pow(2).mean():.4g}")

MSE hidden states: 5.723e-06


In [40]:
# now we can get hidden representations from any block in the transformer
print(f"Available blocks: {cache.keys()}")

Available blocks: dict_keys(['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.hook_resid_mid', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_pre_linear', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.hook_resid_mid', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normal

## Activation addition

Define a direction vector and add it to the internal model activations while generating new model output.

From the transformer lense documentation:

We do this by adding a hook function to that activation. The hook function maps current_activation_value, hook_point to new_activation_value. As the model is run, it computes that activation as normal, and then the hook function is applied to compute a replacement, and that is substituted in for the activation. The hook function can be an arbitrary Python function, so long as it returns a tensor of the correct shape.

In [57]:
max_new_tokens = 30
random_seed = 0
layer_id = 6
cache_name = f"blocks.{layer_id}.hook_resid_post"
coeff = 1
token_pos = -1

In [12]:
def get_hidden(model, sentence, cache_name):
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    _, cache = model.run_with_cache(inputs["input_ids"])
    return cache[cache_name]

In [13]:
with torch.no_grad():
    vec1 = get_hidden(model, "Love", cache_name)
    vec2 = get_hidden(model, "Hate", cache_name)

print(f"shape vec1: {vec1.shape}")
print(f"shape vec2: {vec2.shape}")

shape vec1: torch.Size([1, 2, 4096])
shape vec2: torch.Size([1, 3, 4096])


In [14]:
direction = vec1[:, token_pos] - vec2[:, token_pos]
# reshape to (batch_size, 1, hidden_size)
direction = direction.unsqueeze(1)
print(f"shape direction: {direction.shape}")

shape direction: torch.Size([1, 1, 4096])


In [61]:
# define hook function
def act_add_hook(direction):
    
    def hook_fn(activation, hook):
        output = activation + direction
        return output

    return hook_fn

In [42]:
test_sentence = "I think dogs are "

with torch.no_grad():
    inputs = tokenizer(test_sentence, return_tensors="pt").to(device)

    # this only sets the hook temporarily
    hooked_output = model.run_with_hooks(
        inputs["input_ids"],
        return_type="logits", 
        fwd_hooks=[(
            cache_name, 
            act_add_hook(direction)
            )]
    )

    normal_output = model(inputs["input_ids"])
    print(f"difference in logits: {((hooked_output - normal_output)**2).mean():.4g}")

difference in logits: 0.4545


In [58]:
model.reset_hooks()
model.add_hook(name=cache_name, hook=act_add_hook(coeff*direction))

with torch.no_grad():
    torch.random.manual_seed(random_seed)
    pos_direction_text = model.generate(test_sentence, max_new_tokens=max_new_tokens, verbose=False)
    print(f"pos_direction_text: {pos_direction_text}")

pos_direction_text: I think dogs are 🐾 Stock Photography viewer favorites, model, people often mix them up and do perform live shows and events, main acts for


In [59]:
model.reset_hooks()
model.add_hook(name=cache_name, hook=act_add_hook(-coeff*direction))

with torch.no_grad():
    torch.random.manual_seed(random_seed)
    neg_direction_text = model.generate(test_sentence, max_new_tokens=20, verbose=False)

    print(f"neg_direction_text: {neg_direction_text}")

neg_direction_text: I think dogs are icky and unpleasant vermin, and I wish I could sleep but I can't because


In [60]:
model.reset_hooks()
with torch.no_grad():
    torch.random.manual_seed(random_seed)
    original_text = model.generate(test_sentence, max_new_tokens=20, verbose=False)
    print(f"original_text: {original_text}")

original_text: I think dogs are 
2 years old when they are a puppy.

So, if a dog is 
