In [1]:
import torch
from torch import Tensor
from transformer_lens import HookedTransformer
from transformer_lens.hook_points import HookPoint
from jaxtyping import Float, Int
import requests
import functools
from pathlib import Path
import importlib

import nest_asyncio

nest_asyncio.apply()
from n2g import NeuronModel, FeatureModel, Tokenizer
import mechint

importlib.reload(mechint)
from mechint import n2g

In [2]:
model_large = HookedTransformer.from_pretrained("gpt2-small")

Loaded pretrained model gpt2-small into HookedTransformer


In [3]:
model_large.generate("the cat is sm")

  0%|          | 0/10 [00:00<?, ?it/s]

'the cat is smwing me ♥\n\ncat swallowed girl drinkers'

In [4]:
sample = model_large.to_tokens("the cat is sm")
top_logits, top_tokens = model_large.run_with_hooks(sample)[0, -1, :].topk(k=5, dim=-1)
print(top_logits)
model_large.to_str_tokens(top_tokens)

tensor([19.7922, 19.6165, 19.4624, 18.2938, 17.9073], device='cuda:0',
       grad_fn=<TopkBackward0>)


['eared', 'itten', 'elly', 'ot', 'okin']

In [5]:
r = requests.get("https://deepdecipher.org/api/gpt2-small/neuron2graph-search?query=activating:sm")

In [6]:
layers = [[]] * 12
for index in r.json()["data"]:
    layer = index["layer"]
    neuron = index["neuron"]
    layers[layer].append(neuron)

In [7]:
hooks = []


def hook_fn(
    indices: Int[Tensor, " _"], activation: Float[Tensor, "batch context neurons_per_layer"], hook: HookPoint
) -> None:
    activation[:, -1, indices] = 0.0


for layer_index, neurons in enumerate(layers):
    indices = torch.tensor(neurons)
    hook = functools.partial(hook_fn, indices)
    hooks.append((f"blocks.{layer_index}.mlp.hook_post", hook))


top_logits, top_tokens = model_large.run_with_hooks(sample, fwd_hooks=hooks)[0, -1, :].topk(k=5, dim=-1)
print(top_logits)
model_large.to_str_tokens(top_tokens)

tensor([20.9554, 20.9068, 20.5438, 19.5182, 18.8950], device='cuda:0',
       grad_fn=<TopkBackward0>)


['eared', 'elly', 'itten', 'ot', 'okin']

In [8]:
model_large.generate("the cat is sm")

  0%|          | 0/10 [00:00<?, ?it/s]

'the cat is smothering my ear and my right ear.yes'

In [9]:
model_gelu = HookedTransformer.from_pretrained("gelu-1l")
model_gelu.generate("the cat is sm")

Loaded pretrained model gelu-1l into HookedTransformer


  0%|          | 0/10 [00:00<?, ?it/s]

'the cat is smitten by several sadities. She has not said'

In [10]:
n2g_path = Path("outputs/gelu-1l-sae-n2g")


def activates_on(n2g_model: NeuronModel, token: str | list[str]) -> bool:
    if isinstance(token, str):
        token = [token]
    activating = n2g_model.trie_root.children.keys()
    return any(t in activating for t in token)


sm_n2g_models = [model for model in n2g.iter_models(n2g_path, range(2048, 4096)) if activates_on(model, ["sm", " sm"])]

len(sm_n2g_models)
for n2g_model in sm_n2g_models:
    print(n2g_model.trie_root.children.keys())

dict_keys([',', 'and', 'user', '0', '3', '.', 'glad', 'name', 'queries', 'just', 'humans', 'suggests', 'related', 'ning', 'sm', 'iles', 'family'])
dict_keys(['system', 'arlington', 'ole', 'sm', 'dent', 'vector', 'OW', 'th', 'igator'])


In [11]:
def print_trie(node, depth: int) -> None:
    print(" " * depth + "'" + node.value.token + "'")
    for str_token, child in node.children.items():
        assert str_token == child.value.token, f"'{str_token}' != '{node.value.token}'"
        print_trie(child, depth + 1)


n2g_model = sm_n2g_models[0]
root = n2g_model.trie_root

gelu_tokenizer = Tokenizer(model_gelu)
rs_model = FeatureModel.from_model(gelu_tokenizer, n2g_model)

AssertionError: given string partly should be tokenized to exactly one token