In [1]:
from nnsight import LanguageModel
import torch as t
import einops
from tqdm import tqdm
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import torch.nn.functional as F

from jaxtyping import Int, Float
from typing import List, Optional, Tuple

from torch import Tensor

import pandas as pd
import requests

from rich import print

import time

device = t.device("cuda" if t.cuda.is_available() else "cpu")

## Load the Data

Load in the dataset of factual prompts and answers.

In [2]:
response = requests.get("https://rome.baulab.info/data/dsets/known_1000.json")
data = response.json()

In [3]:
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,known_id,subject,attribute,template,prediction,prompt,relation_id
0,0,Vinson Massif,Antarctica,{} is located in the continent,of Antarctica. It is the largest of the three,Vinson Massif is located in the continent of,P30
1,1,Beats Music,Apple,{} is owned by,"Apple, which is also the owner of Beats Elect...",Beats Music is owned by,P127
2,2,Audible.com,Amazon,{} is owned by,"Amazon.com, Inc. or its affiliates.",Audible.com is owned by,P127
3,3,The Big Bang Theory,CBS,{} premieres on,CBS on September 22.<|endoftext|>,The Big Bang Theory premieres on,P449
4,4,MacApp,Apple,"{}, a product created by",Apple to help developers create apps for the ...,"MacApp, a product created by",P178


Write a function to sample random prompts and their answers from the dataset.

In [41]:
def aggregate_prompts(frame: pd.DataFrame, number: Int, prepend_space=False):
    intermediate_frame = df.sample(n=number, random_state=10)
    intermediate_frame = intermediate_frame[["subject", "attribute", "prompt", "template"]]

    prompts = list(intermediate_frame.prompt)
    answers = list(intermediate_frame.attribute)

    if prepend_space:
        answers = [" " + a for a in answers]

    return prompts, answers, intermediate_frame

In [5]:
# Load the model
model = LanguageModel("gpt2", device_map=device)

## Running the Dataset

Run a single prompt through the model. And observe its outputs.

In [6]:
prompts, answers, _ = aggregate_prompts(df, 1, prepend_space=True)

with model.forward() as runner:
    with runner.invoke(prompts) as invoker: 
        pass
    
logits = runner.output[0]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [7]:
# function to test a prompt on a model and generate a textual representaiton of outputs
def test_prompt(prompt: str, answer: str, model):
    
    with model.forward() as runner:
        with runner.invoke(prompts) as invoker: 
            pass

    logits = runner.output[0]
    logits = logits[0,-1,:] # only over the final token
    probs = logits.softmax(dim=-1)
    
    sorted_indices = t.argsort(probs, descending=True)
    val = model.tokenizer(answer).input_ids[0]
    rank = (sorted_indices == val[0]).nonzero(as_tuple=True)[0]
    
    print(f"[bold]Rank: {rank.item()}        Logit: {logits[val].item():.3f} Prob: {probs[val].item():.3f} Token: |{model.tokenizer.decode(val)}|[/]")
    
    for i, (tok, prob) in enumerate(zip(probs.topk(10).indices, probs.topk(10).values)):
        print(f"Top {i}th token. Logit: {logits[tok]:.3f} Prob: {prob:.3f} Token: |{model.tokenizer.decode(tok)}|")

In [8]:
# model.tokenizer(answers).input_ids[0]
test_prompt(prompts, answers, model)

### Exercise: Calculate probabilities across a batch of tokens

In [9]:
prompts, answers, _ = aggregate_prompts(df, 200, prepend_space=True)

with model.generate(max_new_tokens=1) as generator:
    with generator.invoke(prompts) as invoker:
        out = model.lm_head.output.t[-1].save()

out = out.value

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [10]:
probs = out.softmax(dim=-1)
indices_tensor = t.tensor([model.tokenizer.encode(a) for a in answers]).to(device)
top_probs = t.gather(probs, 1, indices_tensor)
top_probs.mean()

tensor(0.0915, device='cuda:0')

Broken runs with forward pass.

In [11]:
prompts, answers = aggregate_prompts(df, 100, prepend_space=True)

print(prompts[0:5])
print(prompts[0])

with model.forward(prompts) as invoker:
    out_a = model.lm_head.output.t[-1].save()

out_a = out_a.value
out_b = invoker.output[0]

print(new_out[0].argmax())
print(out_b[0,-1,:].argmax())

with model.forward(prompts[0]) as invoker:
    out_c = model.lm_head.output.t[-1].save()

out_c = out_c.value
out_d = invoker.output[0]

print(out_c[0].argmax())
print(out_d[0,-1,:].argmax())



NameError: name 'new_out' is not defined

# Module 2 (Activation Patching)

In [11]:
def extract_tokens(text, tokenizer):
    # Tokenize the text
    tokens = tokenizer.encode(text)

    # Extract first, middle, and last tokens
    first_token = tokens[0]
    last_token = tokens[-1]
    
    num_tokens = len(tokens)
    middle_index = num_tokens // 2

    if num_tokens % 2 == 0:  # Even number of tokens
        middle_tokens = tokens[middle_index - 1: middle_index + 1]
    else:  # Odd number of tokens
        middle_tokens = tokens[middle_index]

    return first_token, middle_tokens, last_token

In [25]:
batched_df

Unnamed: 0,subject,attribute,prompt
649,Saint-Marcellin,France,Saint-Marcellin was created in the country of
375,Law & Order,NBC,Law & Order was released on
248,Knowledge Graph,Google,Knowledge Graph is owned by
1084,Saint Valentine,bishop,Saint Valentine holds the position of the first
315,Francis Blanche,French,Francis Blanche speaks during a news conferenc...
284,RuneScape,fantasy,The genre played by RuneScape is a
47,Grand Duchy of Finland,Helsinki,"Grand Duchy of Finland's capital,"
715,Eddy Cue,Apple,Eddy Cue is employed by
824,Co-operative Commonwealth Federation (Ontario ...,Toronto,Co-operative Commonwealth Federation (Ontario ...
745,Triple H,WWE,Triple H is employed by


In [51]:
print(df.iloc[824].subject)
print(df.iloc[824].prompt)

In [43]:
for _, line in batched_df.iterrows():
    print(line.subject)

In [54]:
import torch as t

# Assuming the `extract_tokens` function is defined as shown earlier

first = []
middle = []
last = []

for _, line in batched_df.iterrows():
    # Extract the tokens from the subject
    if (line.template[0] != "{"):
        first_token, middle_tokens, last_token = extract_tokens(" " + line.subject, model.tokenizer)
    else: 
        first_token, middle_tokens, last_token = extract_tokens(line.subject, model.tokenizer)

    # Encode the prompt
    prompt = line.prompt
    prompt_tokens = model.tokenizer.encode(prompt)
    prompt_length = len(prompt_tokens)

    if not isinstance(middle_tokens, list): 
        middle_tokens = [middle_tokens]
    
    # Find indices in the prompt and convert to negative indices
    first_index = prompt_tokens.index(first_token)
    last_index = prompt_tokens.index(last_token)
    middle_indices = [(prompt_tokens.index(token_id)) for token_id in middle_tokens]
    
    first.append([first_index])
    middle.append(middle_indices)
    last.append([last_index])

In [55]:
last

[[0], [3], [2], [3], [5], [3], [9], [4], [5], [3]]

### Save Clean Runs

In [56]:
prompts, answers, batched_df = aggregate_prompts(df, 10, prepend_space=True)

clean = []

with model.forward() as runner:
    with runner.invoke(prompts) as invoker:
        for layer in model.transformer.h:
            clean.append(layer.output[0].save())

clean = [c.value for c in clean]

In [57]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2AttentionAltered(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (query): WrapperModule()
          (key): WrapperModule()
          (value): WrapperModule()
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [70]:

for layer in range(12):

    # First subject token
    with model.forward() as runner:
        with runner.invoke(prompts) as invoker:

            to_patch = model.transformer.h[layer].output[0].save()

            for p in to_patch:

    # First subject token
    with model.forward() as runner:
        with runner.invoke(prompts) as invoker:

            to_patch = model.transformer.h[layer].output[0].save()

            for p in to_patch:
                
                
    break
    

In [71]:
to_patch.value.shape

torch.Size([10, 15, 768])

In [None]:
# First subject token
# middle subject token
# last subject token
# first subsequent token
# further tokens
# last token