The aim of this notebook is to use contrasting pairs of animal/non-animal related sentences to identify an "animal direction" in any of the layers in the residual stream of the last tokens of the sentences in GPT2. 

In [1]:
import numpy as np
import random
import pandas as pd
import torch as t
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
# Functions

def get_last_token_activations(sentences, model, tokenizer, device='cpu'):
    # Get activations from last token across specified layers

    activations = {}

    def get_activation(name):
        def hook(model, input, output):
            activations[name] = output[0].detach()
        return hook

    hooks = [layer.register_forward_hook(get_activation(f'Layer_{i}')) for i, layer in enumerate(model.transformer.h)]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(device)
    with t.no_grad():
        model(**inputs)
    last_token_indices = (inputs['attention_mask'].sum(dim=1) - 1).tolist()

    # Clean up hooks after use
    for hook in hooks:
        hook.remove()

    return {key: val[:, last_token_indices, :] for key, val in activations.items()}

def compute_animal_directions(animal_sentences, non_animal_sentences, model, layer_names, tokenizer, device='cpu'):
    """
    Computes the direction vector (difference in means of activations) for animal vs. non-animal sentences
    across specified layers in a given model.

    Parameters:
    - animal_sentences (list of str): Sentences classified as 'animal'.
    - non_animal_sentences (list of str): Sentences classified as 'non-animal'.
    - model (torch.nn.Module): Model to compute activations from.
    - tokenizer (Tokenizer): Tokenizer that is compatible with the model.
    - device (str): Device to perform computations on ('cpu' or 'cuda').

    Returns:
    - dict: A dictionary with layer names as keys and direction vectors as values.
    """
    model.to(device)
    
    # activations = {}

    # def get_activation(name):
    #     def hook(model, input, output):
    #         activations[name] = output[0].detach()
    #     return hook

    # hooks = [layer.register_forward_hook(get_activation(f'Layer_{i}')) for i, layer in enumerate(model.transformer.h)]

    # def get_last_token_activations(sentences):
    #     inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(device)
    #     with t.no_grad():
    #         model(**inputs)
    #     last_token_indices = (inputs['attention_mask'].sum(dim=1) - 1).tolist()
    #     return {key: val[:, last_token_indices, :] for key, val in activations.items()}

    animal_activations = get_last_token_activations(animal_sentences, model, tokenizer)

    non_animal_activations = get_last_token_activations(non_animal_sentences, model, tokenizer)


    animal_directions = {}

    for layer_name in layer_names:
        animal_layer_activations = animal_activations[layer_name].cpu().numpy()
        non_animal_layer_activations = non_animal_activations[layer_name].cpu().numpy()

        print(animal_layer_activations.shape)
        animal_directions[layer_name] = np.mean(animal_layer_activations, axis=0) - np.mean(non_animal_layer_activations, axis=0)



    return animal_directions


In [3]:
# Load the CSV file to verify it
df_sentences = pd.read_csv('../datasets/ilikecats.csv')

# Display the first few entries
print(df_sentences.head())

# Filter the DataFrame for rows where the Label column is 'Animal'
animal_sentences = df_sentences[df_sentences['Label'] == 'Animal']['Sentence'].tolist()
non_animal_sentences = df_sentences[df_sentences['Label'] == 'Non-Animal']['Sentence'].tolist()


    Label           Sentence
0  Animal       I like cats.
1  Animal       I like dogs.
2  Animal  I like elephants.
3  Animal     I like tigers.
4  Animal      I like birds.


In [4]:
# Initialize the tokenizer and model #huggingface-cli login
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
# Set the EOS token as the padding token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
_ = model.eval()

In [5]:
# Print max number of layers and max number of tokens
print(f"Number of layers = {len(model.transformer.h)}")
max_tokens = tokenizer(df_sentences['Sentence'].tolist(), return_tensors="pt", padding=True, truncation=True)['input_ids'].shape[-1]
print(f"Number of tokens = {max_tokens}")

Number of layers = 12
Number of tokens = 10


In [33]:
# Compute animal direction for each layer
layer_names = [f'Layer_{i}' for i in range(6,7)] # Max is range(len(model.transformer.h)


# Shorten samples, create train-test split, Shuffle sentences

num_examples = 100
animal_sentences_short = animal_sentences[0:num_examples]
non_animal_sentences_short = non_animal_sentences[0:num_examples]

train_test_split = 0.8

train_animal_sentences = animal_sentences_short[0:int(num_examples*0.8)]
test_animal_sentences = animal_sentences_short[int(num_examples*0.8):]

train_non_animal_sentences = non_animal_sentences_short[0:int(num_examples*0.8)]
test_non_animal_sentences = non_animal_sentences_short[int(num_examples*0.8):]


random.shuffle(animal_sentences_short)
random.shuffle(non_animal_sentences_short)

activation_directions = compute_animal_directions(train_animal_sentences, train_non_animal_sentences, model, layer_names, tokenizer)

(80, 80, 768)


In [27]:
def project_activations(activations, detector_direction):
    """
    Project activation vectors onto the detector direction.
    
    Parameters:
    - activations (numpy.ndarray): Activation vectors to project.
    - detector_direction (numpy.ndarray): The detector direction vector.
    
    Returns:
    - numpy.ndarray: Scalar values of the projection of each activation onto the detector direction.
    """

    print(activations.shape, detector_direction.shape)
    # Project each activation onto the normalized detector direction
    projection = np.dot(activations, detector_direction)
    return projection

In [9]:
test_data = test_animal_sentences + test_non_animal_sentences
labels = [1]*len(test_animal_sentences) + [0]*len(test_non_animal_sentences)
print(len(labels))

40


In [24]:
test_activations = get_last_token_activations(test_data, model, tokenizer, device='cpu')


In [30]:
print(activation_directions['Layer_6'].shape)

(80, 768)


In [28]:
project_activations(test_activations['Layer_6'], activation_directions['Layer_6'])

torch.Size([40, 40, 768]) (80, 768)


ValueError: shapes (40,40,768) and (80,768) not aligned: 768 (dim 2) != 80 (dim 0)

In [26]:
print(test_activations['Layer_6'])

tensor([[[-0.1018, -1.4912, -1.8084,  ..., -1.1475, -0.9938,  0.6507],
         [-5.1746, -1.2403,  4.2255,  ...,  4.1594, -4.9427,  4.5651],
         [-2.2485,  0.1615, -1.8502,  ...,  0.6150, -0.9088,  1.7466],
         ...,
         [-5.1746, -1.2403,  4.2255,  ...,  4.1594, -4.9427,  4.5651],
         [-5.1746, -1.2403,  4.2255,  ...,  4.1594, -4.9427,  4.5651],
         [-4.1792, -2.1417, -2.7534,  ...,  2.2946, -1.1521,  0.8790]],

        [[-1.0778, -0.7887,  0.4466,  ...,  0.7183,  0.2962, -1.2071],
         [ 0.1360, -0.7399, -2.8891,  ..., -1.7986, -1.4220,  1.0074],
         [-1.0963, -0.7834,  0.4204,  ...,  0.7153,  0.2911, -1.2054],
         ...,
         [ 0.1360, -0.7399, -2.8891,  ..., -1.7986, -1.4220,  1.0074],
         [ 0.1360, -0.7399, -2.8891,  ..., -1.7986, -1.4220,  1.0074],
         [-0.6949, -0.1579, -3.3876,  ..., -0.7588, -1.7090,  2.7816]],

        [[-1.1604, -0.6778,  0.5964,  ...,  0.4474,  0.4029, -0.5293],
         [-0.9136,  1.0094, -3.9301,  ..., -0