The aim of this notebook is to use contrasting pairs of animal/non-animal related sentences to identify an "animal direction" in any of the layers in the residual stream of the last tokens of the sentences in GPT2. 

In [1]:
import numpy as np
import random
import pandas as pd
import torch as t
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Functions

def get_last_token_activations(sentences, model, tokenizer, device='cpu'):
    # Ensure the model and data are on the correct device
    model.to(device)
    
    activations = {}

    def get_activation(name):
        def hook(model, input, output):
            activations[name] = output[0].detach()
        return hook

    # Register hooks to each layer of the model to capture activations
    hooks = [layer.register_forward_hook(get_activation(f'Layer_{i}')) for i, layer in enumerate(model.transformer.h)]

    # Tokenize and encode the sentences, then send to specified device
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Perform model forward pass and prevent gradient computations
    with t.no_grad():
        model(**inputs)
    
    # Compute the index of the last non-padded token for each sentence
    last_token_indices = (inputs['attention_mask'].sum(dim=1) - 1).tolist()
    
    # Use advanced indexing to select the last token for each sentence in each layer
    last_token_activations = {}
    for key, val in activations.items():
        # Create a tensor of batch indices
        batch_indices = t.arange(val.size(0), device=device)
        # Index into the output for the last token of each sentence
        last_token_activations[key] = val[batch_indices, last_token_indices, :]
    
    # Clean up hooks after use to prevent memory leaks
    for hook in hooks:
        hook.remove()

    return last_token_activations

def compute_animal_directions(animal_sentences, non_animal_sentences, model, layer_names, tokenizer, device='cpu'):
    """
    Computes the direction vector (difference in means of activations) for animal vs. non-animal sentences
    across specified layers in a given model.

    Parameters:
    - animal_sentences (list of str): Sentences classified as 'animal'.
    - non_animal_sentences (list of str): Sentences classified as 'non-animal'.
    - model (torch.nn.Module): Model to compute activations from.
    - tokenizer (Tokenizer): Tokenizer that is compatible with the model.
    - device (str): Device to perform computations on ('cpu' or 'cuda').

    Returns:
    - dict: A dictionary with layer names as keys and direction vectors as values.
    """
    model.to(device)

    animal_activations = get_last_token_activations(animal_sentences, model, tokenizer)

    non_animal_activations = get_last_token_activations(non_animal_sentences, model, tokenizer)


    animal_directions = {}

    for layer_name in layer_names:
        animal_layer_activations = animal_activations[layer_name].cpu().numpy()
        non_animal_layer_activations = non_animal_activations[layer_name].cpu().numpy()

        print(animal_layer_activations.shape)
        animal_directions[layer_name] = np.mean(animal_layer_activations, axis=0) - np.mean(non_animal_layer_activations, axis=0)



    return animal_directions

def project_activations(activations, detector_direction):
    """
    Project activation vectors onto the detector direction.
    
    Parameters:
    - activations (numpy.ndarray): Activation vectors to project.
    - detector_direction (numpy.ndarray): The detector direction vector.
    
    Returns:
    - numpy.ndarray: Scalar values of the projection of each activation onto the detector direction.
    """

    # Project each activation onto the normalized detector direction
    projection = np.dot(activations, detector_direction)
    return projection

In [3]:
# Load the CSV file to verify it
df_sentences = pd.read_csv('../datasets/ilikecats.csv')

# Display the first few entries
print(df_sentences.head())

# Filter the DataFrame for rows where the Label column is 'Animal'
animal_sentences = df_sentences[df_sentences['Label'] == 'Animal']['Sentence'].tolist()
non_animal_sentences = df_sentences[df_sentences['Label'] == 'Non-Animal']['Sentence'].tolist()


    Label           Sentence
0  Animal       I like cats.
1  Animal       I like dogs.
2  Animal  I like elephants.
3  Animal     I like tigers.
4  Animal      I like birds.


In [4]:
# Initialize the tokenizer and model #huggingface-cli login
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
# Set the EOS token as the padding token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
_ = model.eval()

In [5]:
# Print max number of layers and max number of tokens
print(f"Number of layers = {len(model.transformer.h)}")
max_tokens = tokenizer(df_sentences['Sentence'].tolist(), return_tensors="pt", padding=True, truncation=True)['input_ids'].shape[-1]
print(f"Number of tokens = {max_tokens}")

Number of layers = 12
Number of tokens = 10


In [6]:
# Compute animal direction for each layer
layer_names = [f'Layer_{i}' for i in range(6,7)] # Max is range(len(model.transformer.h)


# Shorten samples, create train-test split, Shuffle sentences

num_examples = 300
animal_sentences_short = animal_sentences[0:num_examples]
non_animal_sentences_short = non_animal_sentences[0:num_examples]

train_test_split = 0.8

train_animal_sentences = animal_sentences_short[0:int(num_examples*0.8)]
test_animal_sentences = animal_sentences_short[int(num_examples*0.8):]

train_non_animal_sentences = non_animal_sentences_short[0:int(num_examples*0.8)]
test_non_animal_sentences = non_animal_sentences_short[int(num_examples*0.8):]


random.shuffle(animal_sentences_short)
random.shuffle(non_animal_sentences_short)

activation_directions = compute_animal_directions(train_animal_sentences, train_non_animal_sentences, model, layer_names, tokenizer)

(240, 768)


In [7]:
test_data = test_animal_sentences + test_non_animal_sentences
labels = [1]*len(test_animal_sentences) + [0]*len(test_non_animal_sentences)
test_activations = get_last_token_activations(test_data, model, tokenizer, device='cpu')


In [8]:
results = project_activations(test_activations['Layer_6'], activation_directions['Layer_6'])

In [9]:
def calculate_accuracy(values, labels):
    """
    Calculate the accuracy of classification where positive numbers in 'values'
    should correspond to 1s in 'labels' and negative numbers to 0s.

    Parameters:
    - values (list of float): List of numerical values.
    - labels (list of int): Corresponding list of binary labels (1s and 0s).

    Returns:
    - float: The accuracy of the match-up, represented as a fraction between 0 and 1.
    """
    correct_count = 0
    total_count = len(values)

    for value, label in zip(values, labels):
        # Predict 1 if value is positive, 0 if negative
        predicted_label = 1 if value > 0 else 0
        # Check if prediction matches the label
        if predicted_label == label:
            correct_count += 1

    # Calculate accuracy
    accuracy = correct_count / total_count
    return accuracy

In [10]:
accuracy = calculate_accuracy(results, labels)
print("Accuracy:", accuracy)

Accuracy: 0.975
