# Activation steering with custom wrapper and gpt2-xl

This notebook shows how to extract and manipulate internal model activations using a self-written wrapper module.

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch

In [None]:
# Prefer the GPU when torch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

device: cuda


In [None]:
# Load the gpt2-xl tokenizer and the matching causal language model.
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
# Move the weights to the selected device and switch to inference mode.
model = model.to(device)
model = model.eval()

In [None]:
# define wrapper class
class WrappedModule(torch.nn.Module):
    """Wrap a module to record its output and optionally steer its activation.

    Attributes:
        module: The wrapped module; it is called with exactly the arguments
            that ``forward`` receives.
        output: Raw return value of the most recent call to ``module``
            (captured before any steering is applied); ``None`` until the
            first forward pass.
        steering_vec: Value added to the activation on every forward pass,
            or ``None`` to pass the wrapped module's output through unchanged.
    """

    def __init__(self, module):
        """Store ``module`` and start with no recorded output and no steering."""
        super().__init__()
        self.module = module
        self.output = None
        self.steering_vec = None

    def forward(self, *args, **kwargs):
        """Run the wrapped module, record its output and apply the steering vector.

        Returns:
            The wrapped module's output. When ``steering_vec`` is set, the
            activation has ``steering_vec`` added to it: for a tuple output the
            activation is the first element and the remaining elements are
            passed through untouched; for a bare tensor output the tensor
            itself is the activation.
        """
        self.output = self.module(*args, **kwargs)
        if self.steering_vec is None:
            return self.output
        if torch.is_tensor(self.output):
            # Some modules return the activation directly instead of a tuple;
            # indexing [0] here would silently pick the first batch row instead.
            return self.output + self.steering_vec
        # the output of the residual layer is actually a tuple, the activation is the first part of that tuple
        return (self.output[0] + self.steering_vec,) + self.output[1:]

In [None]:
# wrap a module of your loaded pretrained transformer model
layer_id = 5
# Guard against re-running this cell: wrapping an already wrapped block would
# nest wrappers, and the unwrap step at the end of the notebook only peels off
# a single layer.
if not isinstance(model.transformer.h[layer_id], WrappedModule):
    model.transformer.h[layer_id] = WrappedModule(model.transformer.h[layer_id])

In [None]:
# get internal activations
# The forward passes are only used to read activations, so run them without
# autograd: otherwise the outputs captured by the wrapper keep the computation
# graph alive and every tensor derived from them carries grad history.
with torch.no_grad():
    inputs = tokenizer("Love", return_tensors="pt").to(device)
    _ = model(**inputs)
    # the wrapper stores the block's raw output; element 0 is the activation
    act_love = model.transformer.h[layer_id].output[0]

    inputs = tokenizer("Hate", return_tensors="pt").to(device)
    _ = model(**inputs)
    act_hate = model.transformer.h[layer_id].output[0]

print(f"act_love.shape: {act_love.shape}")
print(f"act_hate.shape: {act_hate.shape}")

act_love.shape: torch.Size([1, 1, 1600])
act_hate.shape: torch.Size([1, 2, 1600])


In [None]:
# Build the steering direction from the activations at the last token position
# of each prompt (the two prompts may tokenize to different lengths).
love_last = act_love[:, -1:, :]
hate_last = act_hate[:, -1:, :]
steering_vec = love_last - hate_last

raw_norm = steering_vec.norm()
print(f"steering_vec.shape:  {steering_vec.shape}")
print(f"length steering_vec: {raw_norm:.2f}")

# Rescale to unit length so the steering coefficient alone sets the strength.
steering_vec = steering_vec / raw_norm

steering_vec.shape:  torch.Size([1, 1, 1600])
length steering_vec: 100.23


In [None]:
test_sentence = "I think dogs are "

# Generate once while steering in the positive direction and once in the
# negative direction; a dashed rule separates the two completions.
for run_idx, coeff in enumerate((10, -10)):
    if run_idx > 0:
        print("-"*20)
    model.transformer.h[layer_id].steering_vec = coeff*steering_vec
    inputs = tokenizer(test_sentence, return_tensors="pt").to(device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=10,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )
    generated_text = tokenizer.batch_decode(generated_ids)
    print(generated_text[0])

I think dogs are  a great way to get to know someone.
--------------------
I think dogs are icky, but I don't think they're 


In [None]:
# be sure to remove the wrapping again afterwards.
# Peel off every WrappedModule layer. The loop is a no-op when the block is
# already unwrapped, so re-running this cell does not trip over a `.module`
# attribute that the bare block is not expected to have.
while isinstance(model.transformer.h[layer_id], WrappedModule):
    model.transformer.h[layer_id] = model.transformer.h[layer_id].module

# baseline generation without any steering, for comparison
inputs = tokenizer(test_sentence, return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id, do_sample=False)
generated_text = tokenizer.batch_decode(generated_ids)
print(generated_text[0])

I think dogs are  a great way to get your dog to learn
