In [1]:
from nnsight import LanguageModel
import torch as t
import einops
from tqdm import tqdm
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import torch.nn.functional as F

from jaxtyping import Int, Float
from typing import List, Optional, Tuple

from torch import Tensor

import pandas as pd
import requests

from rich import print

import time

device = t.device("cuda" if t.cuda.is_available() else "cpu")

## Load the Data

Load in the dataset of factual prompts and answers.

In [2]:
response = requests.get("https://rome.baulab.info/data/dsets/known_1000.json")
data = response.json()

In [3]:
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,known_id,subject,attribute,template,prediction,prompt,relation_id
0,0,Vinson Massif,Antarctica,{} is located in the continent,of Antarctica. It is the largest of the three,Vinson Massif is located in the continent of,P30
1,1,Beats Music,Apple,{} is owned by,"Apple, which is also the owner of Beats Elect...",Beats Music is owned by,P127
2,2,Audible.com,Amazon,{} is owned by,"Amazon.com, Inc. or its affiliates.",Audible.com is owned by,P127
3,3,The Big Bang Theory,CBS,{} premieres on,CBS on September 22.<|endoftext|>,The Big Bang Theory premieres on,P449
4,4,MacApp,Apple,"{}, a product created by",Apple to help developers create apps for the ...,"MacApp, a product created by",P178


Write a function to sample random prompts and their answers from the dataset.

In [4]:
def aggregate_prompts(frame: pd.DataFrame, number: Int, prepend_space=False):
    intermediate_frame = df.sample(n=number, random_state=2)
    intermediate_frame = intermediate_frame[["subject", "attribute", "prompt"]]

    prompts = list(intermediate_frame.prompt)
    answers = list(intermediate_frame.attribute)

    if prepend_space:
        answers = [" " + a for a in answers]

    return prompts, answers

In [5]:
# Load the model
model = LanguageModel("gpt2", device_map=device)

## Running the Dataset

Run a single prompt through the model. And observe its outputs.

In [6]:
prompts, answers = aggregate_prompts(df, 1, prepend_space=True)

with model.forward(prompts) as invoker:
    pass
logits = invoker.output[0]

In [7]:
# function to test a prompt on a model and generate a textual representaiton of outputs
def test_prompt(prompt: str, answer: str, model):
    
    with model.forward(prompt) as invoker:
        pass

    logits = invoker.output[0]
    logits = logits[0,-1,:] # only over the final token
    probs = logits.softmax(dim=-1)
    
    sorted_indices = t.argsort(probs, descending=True)
    val = model.tokenizer(answer).input_ids[0]
    rank = (sorted_indices == val[0]).nonzero(as_tuple=True)[0]
    
    print(f"[bold]Rank: {rank.item()}        Logit: {logits[val].item():.3f} Prob: {probs[val].item():.3f} Token: |{model.tokenizer.decode(val)}|[/]")
    
    for i, (tok, prob) in enumerate(zip(probs.topk(10).indices, probs.topk(10).values)):
        print(f"Top {i}th token. Logit: {logits[tok]:.3f} Prob: {prob:.3f} Token: |{model.tokenizer.decode(tok)}|")

In [8]:
# model.tokenizer(answers).input_ids[0]
test_prompt(prompts, answers, model)

### Exercise: Calculate probabilities across a batch of tokens

In [9]:
prompts, answers = aggregate_prompts(df, 200, prepend_space=True)

with model.generate(max_new_tokens=1) as generator:
    with generator.invoke(prompts) as invoker:
        out = model.lm_head.output.t[-1].save()

out = out.value

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
probs = out.softmax(dim=-1)
indices_tensor = t.tensor([model.tokenizer.encode(a) for a in answers]).to(device)
top_probs = t.gather(probs, 1, indices_tensor)
top_probs.mean()

tensor(0.0915, device='cuda:0')

Broken runs with forward pass.

In [11]:
prompts, answers = aggregate_prompts(df, 100, prepend_space=True)

print(prompts[0:5])
print(prompts[0])

with model.forward(prompts) as invoker:
    out_a = model.lm_head.output.t[-1].save()

out_a = out_a.value
out_b = invoker.output[0]

print(new_out[0].argmax())
print(out_b[0,-1,:].argmax())

with model.forward(prompts[0]) as invoker:
    out_c = model.lm_head.output.t[-1].save()

out_c = out_c.value
out_d = invoker.output[0]

print(out_c[0].argmax())
print(out_d[0,-1,:].argmax())



NameError: name 'new_out' is not defined

# Module 2 (Activation Patching)

In [11]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2AttentionAltered(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (query): WrapperModule()
          (key): WrapperModule()
          (value): WrapperModule()
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
clean_run = t.zeros((12, ))

In [12]:
prompts, answers = aggregate_prompts(df, 10, prepend_space=True)

with model.generate(max_new_tokens=1) as generator:
    with generator.invoke(prompts) as invoker:
        out = model.lm_head.output.t[-1].save()

out = out.value

In [14]:
out

tensor([[-0.5354, -0.6534, -3.6208,  ..., -5.4662, -6.7906,  0.6084],
        [ 1.3311,  0.7088, -4.7778,  ..., -7.7284, -8.6475,  1.0048],
        [ 0.5023, -0.4923, -3.0727,  ..., -4.8837, -5.9296,  2.4507],
        ...,
        [ 0.2967,  0.0909, -4.2231,  ..., -6.5604, -8.5986,  0.8769],
        [-0.2706,  0.0806, -3.7333,  ..., -6.6863, -9.6483,  0.6361],
        [ 0.7759,  0.3987, -4.9502,  ..., -7.4262, -8.2131,  1.0130]],
       device='cuda:0')

In [22]:
temp = list(df.subject[0:10])
[model.tokenizer.encode(t) for t in temp]

[[53, 7899, 5674, 361],
 [3856, 1381, 7849],
 [16353, 856, 13, 785],
 [464, 4403, 9801, 17003],
 [14155, 4677],
 [33704, 1904, 27768, 3905, 72],
 [39075, 11339],
 [41984, 439, 12010],
 [35882, 14485, 6869, 1158],
 [5005, 957, 26333, 5351, 19220, 2123, 6428, 19220]]

In [28]:
df.iloc[9].template

'The original language of {} is'

In [32]:
model.tokenizer.decode(23884)

' {}'

In [31]:
temp = model.tokenizer.encode(df.iloc[9].template)
print(temp)
[model.tokenizer.decode(t) for t in temp]

['The', ' original', ' language', ' of', ' {}', ' is']

In [None]:
templates = mdf

In [None]:
# split the information 

In [35]:
with model.generate(max_new_tokens=1) as generator:
    with generator.invoke(df.iloc[9].prompt):
        out = model.lm_head.output.save()

out = out.value

In [36]:
out

torch.Size([1, 17, 50257])

In [14]:
model.tokenizer.encode(" The Big Bang Theory")

[383, 4403, 9801, 17003]

In [None]:
# First subject token
# middle subject token
# last subject token
# first subsequent token
# further tokens
# last token