# Activation steering with PyTorch hooks and gpt2-xl

This notebook shows how to extract and manipulate internal model activations using PyTorch hooks. Since we are interested in modifying the forward pass, we will use [forward hooks](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_forward_hook) registered on a single module.

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [3]:
# pick the GPU when PyTorch reports one as available, otherwise use the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

device: cuda


In [4]:
# checkpoint name shared by the tokenizer and the model
model_name = "gpt2-xl"

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# load model, move it to the selected device and put it in eval mode
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device).eval()

In [5]:
# define a hook function that caches activations
def cache_hook(cache):
    """Build a forward hook that appends a module's output activation to ``cache``.

    Args:
        cache: list that receives one tensor per forward call of the hooked
            module.

    Returns:
        A ``hook(module, input, output)`` callable for
        ``register_forward_hook``. The hook returns ``None``, so the module's
        output is passed on unchanged.
    """
    def hook(module, input, output):
        # the output of the residual stream is usually a tuple, where the first
        # entry is the activation; a bare tensor output is cached as-is instead
        # of being indexed along its first axis
        activation = output[0] if isinstance(output, (tuple, list)) else output
        # detach so the cached tensor does not keep the autograd graph
        # (and every intermediate buffer of the forward pass) alive
        cache.append(activation.detach())
    return hook

In [9]:
# define layer to do the activation steering on
layer_id = 5

# get internal activations: run each prompt through the model once while a
# forward hook on the chosen block records that block's output
cache = []
handle = model.transformer.h[layer_id].register_forward_hook(cache_hook(cache))
try:
    # we only read activations here, so skip building the autograd graph
    with torch.no_grad():
        for prompt in ("Love", "Hate"):
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            _ = model(**inputs)
finally:
    # it's very important to keep track of hook handles and remove the hooks,
    # even when a forward pass fails: a leaked hook keeps firing on every
    # later call of the model
    handle.remove()

# one cached activation per prompt, in the order the prompts were run
act_love, act_hate = cache

print(f"act_love.shape: {act_love.shape}")
print(f"act_hate.shape: {act_hate.shape}")

act_love.shape: torch.Size([1, 1, 1600])
act_hate.shape: torch.Size([1, 2, 1600])


In [8]:
# define the steering vector: difference of the two activations at the last
# position along dim 1 (the slice -1: keeps that dim, so the result stays 3-D)
last_love = act_love[:, -1:, :]
last_hate = act_hate[:, -1:, :]
steering_vec = last_love - last_hate
print(f"steering_vec.shape:  {steering_vec.shape}")
vec_length = steering_vec.norm()
print(f"length steering_vec: {vec_length:.2f}")

# reset the steering vector length to 1
steering_vec = steering_vec / vec_length

steering_vec.shape:  torch.Size([1, 1, 1600])
length steering_vec: 100.23


In [11]:
# define the activation steering function
def act_add(steering_vec):
    """Build a forward hook that adds ``steering_vec`` to a module's output activation.

    Args:
        steering_vec: tensor added to the activation; it must be broadcastable
            to the activation's shape.

    Returns:
        A ``hook(module, input, output)`` callable for
        ``register_forward_hook``. Its return value replaces the module's
        output: a tuple output gets its first entry shifted and all other
        entries passed through untouched; a bare tensor output is shifted
        directly.
    """
    def hook(module, input, output):
        # the output of the residual stream is usually a tuple, where the first
        # entry is the activation
        if isinstance(output, tuple):
            hidden = output[0]
            # align device/dtype with the activation so the addition neither
            # fails across devices nor silently changes the activation dtype
            shift = steering_vec.to(device=hidden.device, dtype=hidden.dtype)
            return (hidden + shift,) + output[1:]
        # bare tensor output: steer it directly
        return output + steering_vec.to(device=output.device, dtype=output.dtype)
    return hook

In [12]:
test_sentence = "I think dogs are "
inputs = tokenizer(test_sentence, return_tensors="pt").to(device)

# generate text while steering first in positive, then in negative direction
coeffs = (10, -10)
for run_idx, coeff in enumerate(coeffs):
    handle = model.transformer.h[layer_id].register_forward_hook(act_add(coeff*steering_vec))
    try:
        generated_ids = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)
    finally:
        # it's very important to keep track of hook handles and remove the
        # hooks, even if generation fails: a leaked hook would keep steering
        # every later call of the model
        handle.remove()
    generated_text = tokenizer.batch_decode(generated_ids)
    print(generated_text[0])
    # separator between runs, not after the last one
    if run_idx < len(coeffs) - 1:
        print("-"*20)

I think dogs are  a great way to get to know someone.
--------------------
I think dogs are icky, but I don't think they're 


In [13]:
# generate text without steering: this cell registers no hook on the model
inputs = tokenizer(test_sentence, return_tensors="pt").to(device)
generation_kwargs = dict(max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)
generated_ids = model.generate(**inputs, **generation_kwargs)
generated_text = tokenizer.batch_decode(generated_ids)
print(generated_text[0])

I think dogs are  a great way to get your dog to learn
