Only keep (not ablate) the query activations (query vector, the output of query weights times inputs) of certain positions. Keep all the key activations; the query positions that were kept will automatically attend to relevant key positions by matrix multiplication.

# Setup
(No need to change anything)

In [1]:
# Janky code to do different setup when run in a Colab notebook vs VSCode
DEBUG_MODE = False
try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
    %pip install git+https://github.com/neelnanda-io/TransformerLens.git
    # Install another version of node that makes PySvelte work way faster
    !curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -; sudo apt-get install -y nodejs
    %pip install git+https://github.com/neelnanda-io/PySvelte.git
except:
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # Code to automatically update the HookedTransformer code as its edited without restarting the kernel
    ipython.magic("load_ext autoreload")
    ipython.magic("autoreload 2")

Running as a Colab notebook
Collecting git+https://github.com/neelnanda-io/TransformerLens.git
  Cloning https://github.com/neelnanda-io/TransformerLens.git to /tmp/pip-req-build-uznuk90i
  Running command git clone --filter=blob:none --quiet https://github.com/neelnanda-io/TransformerLens.git /tmp/pip-req-build-uznuk90i
  Resolved https://github.com/neelnanda-io/TransformerLens.git to commit 174209ea708fe3838ccf08b70f2f4f28e7397cb4
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting accelerate>=0.23.0 (from transformer-lens==0.0.0)
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting beartype<0.15.0,>=0.14.1 (from transformer-lens==0.0.0)
  Downloading beartype-0.14.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━

In [2]:
# Plotly needs a different renderer for VSCode/Notebooks vs Colab argh
import plotly.io as pio

if IN_COLAB or not DEBUG_MODE:
    # Thanks to annoying rendering issues, Plotly graphics will either show up in colab OR Vscode depending on the renderer - this is bad for developing demos! Thus creating a debug mode.
    pio.renderers.default = "colab"
else:
    pio.renderers.default = "png"

In [3]:
# Import stuff
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
from fancy_einsum import einsum
import tqdm.notebook as tqdm
import random
from pathlib import Path
import plotly.express as px
from torch.utils.data import DataLoader

from jaxtyping import Float, Int
from typing import List, Union, Optional
from functools import partial
import copy

import itertools
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
import dataclasses
import datasets
from IPython.display import HTML

In [4]:
# import pysvelte

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

We turn automatic differentiation off, to save GPU memory, as this notebook focuses on model inference not model training.

In [5]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x79a525cffa30>

Plotting helper functions:

In [6]:
def imshow(tensor, renderer=None, **kwargs):
    px.imshow(utils.to_numpy(tensor), color_continuous_midpoint=0.0, color_continuous_scale="RdBu", **kwargs).show(renderer)

def line(tensor, renderer=None, **kwargs):
    px.line(y=utils.to_numpy(tensor), **kwargs).show(renderer)

def scatter(x, y, xaxis="", yaxis="", caxis="", renderer=None, **kwargs):
    x = utils.to_numpy(x)
    y = utils.to_numpy(y)
    px.scatter(y=y, x=x, labels={"x":xaxis, "y":yaxis, "color":caxis}, **kwargs).show(renderer)

## Load Model

Decide which model to use (eg. gpt2-small vs -medium)

In [7]:
model = HookedTransformer.from_pretrained(
    "gpt2-small",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer


## Import functions from repo

In [8]:
!git clone https://github.com/callummcdougall/ARENA_2.0.git

Cloning into 'ARENA_2.0'...
remote: Enumerating objects: 9100, done.[K
remote: Counting objects: 100% (1814/1814), done.[K
remote: Compressing objects: 100% (288/288), done.[K
remote: Total 9100 (delta 1609), reused 1602 (delta 1523), pack-reused 7286[K
Receiving objects: 100% (9100/9100), 155.60 MiB | 32.07 MiB/s, done.
Resolving deltas: 100% (5502/5502), done.


In [9]:
cd ARENA_2.0/chapter1_transformers/exercises/part3_indirect_object_identification

/content/ARENA_2.0/chapter1_transformers/exercises/part3_indirect_object_identification


In [10]:
import ioi_circuit_extraction as ioi_circuit_extraction

# Generate dataset with multiple prompts

In [12]:
class Dataset:
    def __init__(self, prompts, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S5"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S4"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'S5')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                # if S1_is_first and targ == "S1":  # only use this if first token doesn't have space Ġ in front
                #     target_token = prompt[targ]
                # else:
                #     target_token = "Ġ" + prompt[targ]
                target_token = "Ġ" + prompt[targ]
                target_index = tokens.index(target_token)
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

Repalce io_tokens with correct answer (next, which is '5') and s_tokens with incorrect (current, which repeats)

In [13]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'S5': str(i+4),
            'text': f"Adam is {i}. Bob is {i+1}. Claire is {i+2}. Don is {i+3}. Eve is"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, model.tokenizer, S1_is_first=True)

In [14]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'S5': str(i+3),
            'text': f"Adam is {i}. Bob is {i+1}. Claire is {i+2}. Don is {i+2}. Eve is"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, model.tokenizer, S1_is_first=True)

Logit diff is correct - incorr token. Here, correct is S5, and incorr is S4.

Because of this, it's possible to have logit diffs HIGHER than the "full circuit" because the correct token will still be at first place, but the logit scores assigned will just be bigger (perhaps incorrect is scored even lower in the non-full circuit with a higher logit diff score)?

# Ablation Expm Functions

In [15]:
from torch import Tensor

def logits_to_ave_logit_diff_2(logits: Float[Tensor, "batch seq d_vocab"], dataset: Dataset, per_prompt=False):
    '''
    Returns logit difference between the correct and incorrect answer.

    If per_prompt=True, return the array of differences rather than the average.
    '''

    # Only the final logits are relevant for the answer
    # Get the logits corresponding to the indirect object / subject tokens respectively
    io_logits: Float[Tensor, "batch"] = logits[range(logits.size(0)), dataset.word_idx["end"], dataset.io_tokenIDs]
    s_logits: Float[Tensor, "batch"] = logits[range(logits.size(0)), dataset.word_idx["end"], dataset.s_tokenIDs]
    # Find logit difference
    answer_logit_diff = io_logits - s_logits
    return answer_logit_diff if per_prompt else answer_logit_diff.mean()

In [15]:
def mean_ablate_by_lst(CIRCUIT, SEQ_POS_TO_KEEP, model, print_output=True):
    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        # print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        # print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

We can also prevent redundant computation of the full circuit score by storing it and just passing it in to the function.

## test fns on pure digits dataset

In [16]:
class Dataset:
    def __init__(self, prompts, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S5"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S4"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'S5')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                if S1_is_first and targ == "S1":  # only use this if first token doesn't have space Ġ in front
                    target_token = prompt[targ]
                else:
                    target_token = "Ġ" + prompt[targ]
                target_index = tokens.index(target_token)
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

Repalce io_tokens with correct answer (next, which is '5') and s_tokens with incorrect (current, which repeats)

In [17]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'S5': str(i+4),
            'text': f"{i} {i+1} {i+2} {i+3}"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, model.tokenizer, S1_is_first=True)

In [18]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'S5': str(i+3),
            'text': f"{i} {i+1} {i+2} {i+2}"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, model.tokenizer, S1_is_first=True)

In [19]:
model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook
ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_original, dataset)

tensor(4.6238, device='cuda:0')

In [20]:
fullcirc = [(layer, head) for layer in range(12) for head in range(12)]
CIRCUIT = {
    "number mover": fullcirc,
    "number mover 4": fullcirc,
    "number mover 3": fullcirc,
    "number mover 2": fullcirc,
    "number mover 1": fullcirc,
}

SEQ_POS_TO_KEEP = {
    "number mover": "end",
    "number mover 4": "S4",
    "number mover 3": "S3",
    "number mover 2": "S2",
    "number mover 1": "S1",
}

In [21]:
model_abl = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
ioi_logits_minimal = model_abl(dataset.toks)

In [22]:
logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)

tensor(4.6238, device='cuda:0')

This works, so if any issues with mean ablate 'no heads' is not same score as 'original' for 'among words', the issue is not with the code above but with something in how names pos is "not kept"

# Ablate the model and compare with original

## keep pos of nums and end only

https://colab.research.google.com/drive/1CHRn-AMko9RNrl1bqiCwB7DS-rz1CoBP#scrollTo=KZiVdGTC6QlP&line=2&uniqifier=1

In [None]:
fullcirc = [(0, 1), (0, 3), (0, 5), (0, 7), (0, 9), (0, 10), (1, 5), (2, 2), (2, 9), (3, 0), (3, 3), (3, 7), (4, 4), (5, 5), (6, 1), (6, 6), (6, 9), (6, 10), (7, 10), (7, 11), (8, 8), (9, 1), (10, 7)]

CIRCUIT = {
    "number mover": fullcirc,
    "number mover 4": fullcirc,
    "number mover 3": fullcirc,
    "number mover 2": fullcirc,
    "number mover 1": fullcirc,
}

SEQ_POS_TO_KEEP = {
    "number mover": "end",
    "number mover 4": "S4",
    "number mover 3": "S3",
    "number mover 2": "S2",
    "number mover 1": "S1",
}

mean_ablate_by_lst(CIRCUIT, SEQ_POS_TO_KEEP, model, print_output=False).item()

23.74461555480957

In [None]:
circuit = [(0, 1), (0, 3), (0, 5), (0, 7), (0, 9), (0, 10), (1, 5), (2, 2), (2, 9), (3, 0), (3, 3), (3, 7), (4, 4), (5, 5), (6, 1), (6, 6), (6, 9), (6, 10), (7, 10), (7, 11), (8, 8), (9, 1), (10, 7)]

## keep pos of names, nums, end

In [None]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            # 'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'S5': str(i+4),
            'text': f"Adam is {i}. Bob is {i+1}. Claire is {i+2}. Don is {i+3}. Eve is"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, model.tokenizer, S1_is_first=True)

In [None]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            # 'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'S5': str(i+3),
            'text': f"Adam is {i}. Bob is {i+1}. Claire is {i+2}. Don is {i+2}. Eve is"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, model.tokenizer, S1_is_first=True)

In [None]:
fullcirc = [(0, 1), (0, 3), (0, 5), (0, 7), (0, 9), (0, 10), (1, 5), (2, 2), (2, 9), (3, 0), (3, 3), (3, 7), (4, 4), (5, 5), (6, 1), (6, 6), (6, 9), (6, 10), (7, 10), (7, 11), (8, 8), (9, 1), (10, 7)]

CIRCUIT = {
    "name mover 4": fullcirc,
    "name mover 3": fullcirc,
    "name mover 2": fullcirc,
    "name mover 1": fullcirc,
    "number mover": fullcirc,
    "number mover 4": fullcirc,
    "number mover 3": fullcirc,
    "number mover 2": fullcirc,
    "number mover 1": fullcirc,
}

SEQ_POS_TO_KEEP = {
    "name mover 4": "Eve",
    "name mover 3": "Don",
    "name mover 2": "Claire",
    "name mover 1": "Bob",
    "number mover": "end",
    "number mover 4": "S4",
    "number mover 3": "S3",
    "number mover 2": "S2",
    "number mover 1": "S1",
}

mean_ablate_by_lst(CIRCUIT, SEQ_POS_TO_KEEP, model, print_output=False).item()

23.767974853515625

## among names only

### test prompts

In [45]:
modeltest = HookedTransformer.from_pretrained("gpt2-small")

Loaded pretrained model gpt2-small into HookedTransformer


In [46]:
example_prompt = "table 1 lamp 2 fridge 3 chair 4 hat"
example_answer = " 5"
utils.test_prompt(example_prompt, example_answer, modeltest, prepend_bos=True)

Tokenized prompt: ['<|endoftext|>', 'table', ' 1', ' lamp', ' 2', ' fridge', ' 3', ' chair', ' 4', ' hat']
Tokenized answer: [' 5']


Top 0th token. Logit: 17.75 Prob: 90.18% Token: | 5|
Top 1th token. Logit: 13.12 Prob:  0.88% Token: |ches|
Top 2th token. Logit: 12.61 Prob:  0.53% Token: | 1|
Top 3th token. Logit: 12.52 Prob:  0.48% Token: |
|
Top 4th token. Logit: 12.46 Prob:  0.45% Token: |chet|
Top 5th token. Logit: 12.41 Prob:  0.43% Token: | 4|
Top 6th token. Logit: 12.40 Prob:  0.43% Token: | 50|
Top 7th token. Logit: 12.35 Prob:  0.41% Token: |cher|
Top 8th token. Logit: 12.26 Prob:  0.37% Token: |chery|
Top 9th token. Logit: 11.86 Prob:  0.25% Token: | 6|


In [47]:
example_prompt = "table 1 lamp table 2 fridge 3 chair 4 hat"
example_answer = " 5"
utils.test_prompt(example_prompt, example_answer, modeltest, prepend_bos=True)

Tokenized prompt: ['<|endoftext|>', 'table', ' 1', ' lamp', ' table', ' 2', ' fridge', ' 3', ' chair', ' 4', ' hat']
Tokenized answer: [' 5']


Top 0th token. Logit: 16.60 Prob: 74.12% Token: | 5|
Top 1th token. Logit: 13.08 Prob:  2.20% Token: | 4|
Top 2th token. Logit: 12.74 Prob:  1.56% Token: | table|
Top 3th token. Logit: 12.71 Prob:  1.52% Token: |chery|
Top 4th token. Logit: 12.46 Prob:  1.18% Token: |ches|
Top 5th token. Logit: 12.33 Prob:  1.04% Token: |
|
Top 6th token. Logit: 12.26 Prob:  0.97% Token: | 1|
Top 7th token. Logit: 12.16 Prob:  0.88% Token: |chet|
Top 8th token. Logit: 11.67 Prob:  0.54% Token: | 6|
Top 9th token. Logit: 11.58 Prob:  0.49% Token: |cher|


In [None]:
example_prompt = "Adam 1 Bob 2 Claire 3 Don 4 Eve"
example_answer = " 5"
utils.test_prompt(example_prompt, example_answer, modeltest, prepend_bos=True)

Tokenized prompt: ['<|endoftext|>', 'Adam', ' 1', ' Bob', ' 2', ' Claire', ' 3', ' Don', ' 4', ' Eve']
Tokenized answer: [' 5']


Top 0th token. Logit: 18.17 Prob: 92.81% Token: | 5|
Top 1th token. Logit: 14.02 Prob:  1.46% Token: | 6|
Top 2th token. Logit: 12.82 Prob:  0.44% Token: | 10|
Top 3th token. Logit: 12.68 Prob:  0.38% Token: |
|
Top 4th token. Logit: 12.56 Prob:  0.34% Token: |lyn|
Top 5th token. Logit: 12.39 Prob:  0.29% Token: | 4|
Top 6th token. Logit: 12.09 Prob:  0.21% Token: |5|
Top 7th token. Logit: 11.89 Prob:  0.17% Token: | 1|
Top 8th token. Logit: 11.84 Prob:  0.16% Token: | 7|
Top 9th token. Logit: 11.79 Prob:  0.16% Token: | 50|


In [None]:
example_prompt = "Adam 1 Bob 2 Claire 3 Don 4 Eve"
example_answer = " 5"

model_abl = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
utils.test_prompt(example_prompt, example_answer, model_abl, prepend_bos=True)

Tokenized prompt: ['<|endoftext|>', 'Adam', ' 1', ' Bob', ' 2', ' Claire', ' 3', ' Don', ' 4', ' Eve']
Tokenized answer: [' 5']


RuntimeError: ignored

### test get rep tok index

In [None]:
tokens = model.tokenizer.tokenize('Adam 1 Bob 2 Claire 3 Don 4 Eve')
tokens

['Adam', 'Ġ1', 'ĠBob', 'Ġ2', 'ĠClaire', 'Ġ3', 'ĠDon', 'Ġ4', 'ĠEve']

In [None]:
target_token = "Ġ" + "Claire"
target_index = tokens.index(target_token)
target_index

4

### ablate

In [None]:
class Dataset:
    def __init__(self, prompts, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S5"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S4"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'S5')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                if S1_is_first and targ == "Adam":  # only use this if first token doesn't have space Ġ in front
                    target_token = prompt[targ]
                else:
                    target_token = "Ġ" + prompt[targ]
                target_index = tokens.index(target_token)
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

https://github.com/callummcdougall/ARENA_2.0/blob/main/chapter1_transformers/exercises/part3_indirect_object_identification/ioi_dataset.py

ioi_prompt["IO"] = name_1

In [None]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'S5': str(i+4),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+3} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, model.tokenizer, S1_is_first=True)

In [None]:
dataset.word_idx

{'Adam': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'Bob': tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 'Claire': tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4]),
 'Don': tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6]),
 'Eve': tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8]),
 'S1': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'S2': tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3]),
 'S3': tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5]),
 'S4': tensor([7, 7, 7, 7, 7, 7, 7, 7, 7, 7]),
 'end': tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8])}

In [None]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'S5': str(i+3),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+2} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, model.tokenizer, S1_is_first=True)

In [None]:
dataset_2.word_idx

{'Adam': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'Bob': tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 'Claire': tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4]),
 'Don': tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6]),
 'Eve': tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8]),
 'S1': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'S2': tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3]),
 'S3': tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5]),
 'S4': tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5]),
 'end': tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8])}

In [None]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": lst,
        "name mover 3": lst,
        "name mover 2": lst,
        "name mover 1": lst,
        "name mover 0": lst,
        "number mover": lst,
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

In [None]:
model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook
ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_original, dataset)

tensor(5.4184, device='cuda:0')

If don't reset hooks, may accidentally use a hooked model (will STILL happen if renamed the variable that takes the value, so add_mean_ablation hook is by ref, not new copy!)

In [None]:
lst = [(layer, head) for layer in range(12) for head in range(12)]
CIRCUIT = {
    "name mover 4": lst,
    "name mover 3": lst,
    "name mover 2": lst,
    "name mover 1": lst,
    "name mover 0": lst,
    "number mover": lst,
    "number mover 4": lst,
    "number mover 3": lst,
    "number mover 2": lst,
    "number mover 1": lst,
}
SEQ_POS_TO_KEEP = {
    "name mover 4": "Eve",
    "name mover 3": "Don",
    "name mover 2": "Claire",
    "name mover 1": "Bob",
    "name mover 0": "Adam",
    "number mover": "end",
    "number mover 4": "S4",
    "number mover 3": "S3",
    "number mover 2": "S2",
    "number mover 1": "S1",
}

In [None]:
model_abl = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
ioi_logits_minimal = model_abl(dataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)

KeyError: ignored

tensor(2.5386, device='cuda:0')

So this means we're not getting the full circuit. What's missing?

In [None]:
lst = [(layer, head) for layer in range(12) for head in range(12)]
CIRCUIT = {
    "number mover": lst,
    "number mover 4": lst,
    "number mover 3": lst,
    "number mover 2": lst,
    "number mover 1": lst,
}
SEQ_POS_TO_KEEP = {
    "number mover": "end",
    "number mover 4": "S4",
    "number mover 3": "S3",
    "number mover 2": "S2",
    "number mover 1": "S1",
}

model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook
model_abl = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
ioi_logits_minimal = model_abl(dataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)

tensor(2.5517, device='cuda:0')

This is nearly the same score, so the names are just not being kept but bc the score is diff, something is happening.

## try diff ways to keep names

In [None]:
class Dataset:
    def __init__(self, prompts, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S5"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S4"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'S5')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                if S1_is_first and targ == "Adam1":  # only use this if first token doesn't have space Ġ in front
                    target_token = prompt[targ]
                else:
                    target_token = "Ġ" + prompt[targ]
                target_index = tokens.index(target_token)
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

In [None]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam1': 'Adam',
            'Bob1': 'Bob',
            'Claire1': 'Claire',
            'Don1': 'Don',
            'Eve1': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'S5': str(i+4),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+3} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, model.tokenizer, S1_is_first=True)

In [None]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam1': 'Adam',
            'Bob1': 'Bob',
            'Claire1': 'Claire',
            'Don1': 'Don',
            'Eve1': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'S5': str(i+3),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+2} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, model.tokenizer, S1_is_first=True)

In [None]:
CIRCUIT = {
        "name mover 4": lst,
        "name mover 3": lst,
        "name mover 2": lst,
        "name mover 1": lst,
        "name mover 0": lst,
        "number mover": lst,
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

SEQ_POS_TO_KEEP = {  # the value is not token, but key to token in prompt_dict dataset
    "name mover 4": "Eve1",
    "name mover 3": "Don1",
    "name mover 2": "Claire1",
    "name mover 1": "Bob1",
    "name mover 0": "Adam1",
    "number mover": "end",
    "number mover 4": "S4",
    "number mover 3": "S3",
    "number mover 2": "S2",
    "number mover 1": "S1",
}

In [None]:
model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook
model_abl = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
ioi_logits_minimal = model_abl(dataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)

tensor(2.5386, device='cuda:0')

This shows it's not an issue with the keys being the same string value as the tokens.

## fix seq pos issue

SOLN: the corrupted had repeated tokens in “Adam 1 Bob 2 Claire 3 Don 3 Eve”, so the repeated query seq pos index (the second 3) was not kept (non-ablated) when running tokens.index(target_token). The previous dataset of “1 2 3 3” did not have this issue as the datasets always kept the query end pos non-ablated, which was coincidentally on the second 3. But in the new case, the last token was “Eve” so this did not occur.

In [None]:
class Dataset:
    def __init__(self, prompts, pos_dict, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["corr"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["incorr"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'corr' and key != 'incorr')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                # if S1_is_first and targ == "S1":  # only use this if first token doesn't have space Ġ in front
                #     target_token = prompt[targ]
                # else:
                #     target_token = "Ġ" + prompt[targ]
                # target_index = tokens.index(target_token)
                target_index = pos_dict[targ]
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

In [None]:
pos_dict = {
    'Adam': 0,
    'Bob': 2,
    'Claire': 4,
    'Don': 6,
    'Eve': 8,
    'S1': 1,
    'S2': 3,
    'S3': 5,
    'S4': 7,
}

In [None]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'corr': str(i+4),
            'incorr': str(i+3),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+3} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list, pos_dict

prompts_list, pos_dict = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, pos_dict, model.tokenizer, S1_is_first=True)

In [None]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'corr': str(i+3),
            'incorr': str(i+4),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+2} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, pos_dict, model.tokenizer, S1_is_first=True)

In [None]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": lst,
        "name mover 3": lst,
        "name mover 2": lst,
        "name mover 1": lst,
        "name mover 0": lst,
        "number mover": lst,
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

In [None]:
model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook
ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_original, dataset)

tensor(5.4184, device='cuda:0')

In [None]:
lst = [(layer, head) for layer in range(12) for head in range(12)]
CIRCUIT = {
    "name mover 4": lst,
    "name mover 3": lst,
    "name mover 2": lst,
    "name mover 1": lst,
    "name mover 0": lst,
    "number mover": lst,
    "number mover 4": lst,
    "number mover 3": lst,
    "number mover 2": lst,
    "number mover 1": lst,
}
SEQ_POS_TO_KEEP = {
    "name mover 4": "Eve",
    "name mover 3": "Don",
    "name mover 2": "Claire",
    "name mover 1": "Bob",
    "name mover 0": "Adam",
    "number mover": "end",
    "number mover 4": "S4",
    "number mover 3": "S3",
    "number mover 2": "S2",
    "number mover 1": "S1",
}

In [None]:
model_abl = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
ioi_logits_minimal = model_abl(dataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)

tensor(5.4184, device='cuda:0')

## prune backw once

In [None]:
# Start with full circuit
curr_circuit = [(layer, head) for layer in range(12) for head in range(12)]
threshold = 3  # This is T, a %. if performance is less than T%, allow its removal

for layer in range(11, -1, -1):  # go thru all heads in a layer first
    for head in range(12):
        # Copying the curr_circuit so we can iterate over one and modify the other
        copy_circuit = curr_circuit.copy()

        # Temporarily removing the current tuple from the copied circuit
        copy_circuit.remove((layer, head))

        new_score = mean_ablate_by_lst(copy_circuit, model, dataset, dataset_2, print_output=False).item()

        # print((layer,head), new_score)
        # If the result is less than the threshold, remove the tuple from the original list
        if (100 - new_score) < threshold:
            curr_circuit.remove((layer, head))

            print("Removed:", (layer, head))
            print(new_score)
            print("\n")

Removed: (11, 0)
99.68688201904297


Removed: (11, 1)
99.76443481445312


Removed: (11, 2)
99.79711151123047


Removed: (11, 3)
99.8869400024414


Removed: (11, 4)
100.07891845703125


Removed: (11, 5)
100.17557525634766


Removed: (11, 6)
100.26312255859375


Removed: (11, 7)
100.1176986694336


Removed: (11, 8)
100.04216766357422


Removed: (11, 9)
99.93612670898438


Removed: (11, 10)
99.1741943359375


Removed: (11, 11)
99.58927917480469


Removed: (10, 0)
99.56360626220703


Removed: (10, 1)
99.53450775146484


Removed: (10, 2)
103.23387908935547


Removed: (10, 3)
103.28461456298828


Removed: (10, 4)
103.33687591552734


Removed: (10, 5)
102.95157623291016


Removed: (10, 6)
102.95909881591797


Removed: (10, 8)
102.94963836669922


Removed: (10, 9)
102.81712341308594


Removed: (10, 10)
103.00299835205078


Removed: (10, 11)
102.93616485595703


Removed: (9, 0)
102.90625762939453


Removed: (9, 2)
102.79837036132812


Removed: (9, 3)
102.8944091796875


Removed: (9, 4)
103.0067

In [None]:
curr_circuit

[(0, 1),
 (0, 8),
 (0, 10),
 (1, 5),
 (3, 0),
 (4, 4),
 (5, 5),
 (6, 0),
 (6, 1),
 (6, 2),
 (6, 6),
 (6, 10),
 (8, 8),
 (8, 11),
 (9, 1),
 (10, 7)]

## try running among words circ on pure digits

In [None]:
class Dataset:
    def __init__(self, prompts, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S5"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S4"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'S5')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                if S1_is_first and targ == "S1":  # only use this if first token doesn't have space Ġ in front
                    target_token = prompt[targ]
                else:
                    target_token = "Ġ" + prompt[targ]
                target_index = tokens.index(target_token)
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

Repalce io_tokens with correct answer (next, which is '5') and s_tokens with incorrect (current, which repeats)

In [None]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'S5': str(i+4),
            'text': f"{i} {i+1} {i+2} {i+3}"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list = generate_prompts_list(1, 11)
pureDataset = Dataset(prompts_list, model.tokenizer, S1_is_first=True)

In [None]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'S5': str(i+3),
            'text': f"{i} {i+1} {i+2} {i+2}"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
pureDataset_2 = Dataset(prompts_list_2, model.tokenizer, S1_is_first=True)

In [None]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": lst,
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

In [None]:
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 1.1478
Average logit difference (circuit / full) %: 24.8247


24.824731826782227

## ablate using guessed seq pos from attn pats

In [23]:
class Dataset:
    def __init__(self, prompts, pos_dict, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["corr"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["incorr"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'corr' and key != 'incorr')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                # if S1_is_first and targ == "S1":  # only use this if first token doesn't have space Ġ in front
                #     target_token = prompt[targ]
                # else:
                #     target_token = "Ġ" + prompt[targ]
                # target_index = tokens.index(target_token)
                target_index = pos_dict[targ]
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

In [24]:
pos_dict = {
    'Adam': 0,
    'Bob': 2,
    'Claire': 4,
    'Don': 6,
    'Eve': 8,
    'S1': 1,
    'S2': 3,
    'S3': 5,
    'S4': 7,
}

In [25]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'corr': str(i+4),
            'incorr': str(i+3),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+3} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list, pos_dict

prompts_list, pos_dict = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, pos_dict, model.tokenizer, S1_is_first=True)

In [26]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'Adam': 'Adam',
            'Bob': 'Bob',
            'Claire': 'Claire',
            'Don': 'Don',
            'Eve': 'Eve',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'corr': str(i+3),
            'incorr': str(i+4),
            'text': f"Adam {i} Bob {i+1} Claire {i+2} Don {i+2} Eve"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, pos_dict, model.tokenizer, S1_is_first=True)

### all pos

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10), (8, 8), (8, 11), (9, 1), (10, 7)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

97.355712890625

### late heads only at end

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": lst,
        "name mover 3": lst,
        "name mover 2": lst,
        "name mover 1": lst,
        "name mover 0": lst,
        "number mover": [(8, 8), (8, 11), (9, 1), (10, 7)],
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

98.0891342163086

Amazingly, it's even BETTER?

### late heads only at end, num detect only at nums

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10), (8, 8), (8, 11), (9, 1), (10, 7)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": lst,
        "name mover 3": lst,
        "name mover 2": lst,
        "name mover 1": lst,
        "name mover 0": lst,
        "number mover": [(8, 8), (8, 11), (9, 1), (10, 7)],
        "number mover 4": [(1, 5), (4, 4)],
        "number mover 3": [(1, 5), (4, 4)],
        "number mover 2": [(1, 5), (4, 4)],
        "number mover 1": [(1, 5), (4, 4)],
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

61.35452651977539

Unfortunately, number detectors do much worse here. But this is an obvious bug- you are ablating ALL heads except 1.5 and 4.4 at the numbers. Clearly, you need more heads than those. So just ablate the heads at the word pos.

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(8, 8), (8, 11), (9, 1), (10, 7)],
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

91.36726379394531

The score is slightly worse. This means SOME parts of the names are still needed. But which parts?

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 3": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 2": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 1": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 0": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "number mover": [(8, 8), (8, 11), (9, 1), (10, 7)],
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

98.05598449707031

This shows that 1.5 and 4.4 have no impact on the names at all.

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)], # these only make 0.02% diff when kept at the end
        "name mover 3": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "name mover 2": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "name mover 1": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "name mover 0": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "number mover": [(8, 8), (8, 11), (9, 1), (10, 7)],
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

91.3373794555664

So L6 still needed for the names. But 10% is not make or break diff. We can get away with removing it for names.

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)], # these only make 0.02% diff when kept at the end
        "name mover 3": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "name mover 2": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "name mover 1": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "name mover 0": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5)],
        "number mover": [(8, 8), (8, 11), (9, 1), (10, 7)],
        "number mover 4": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5)],
        "number mover 3": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5)],
        "number mover 2": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5)],
        "number mover 1": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5)],
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

83.72284698486328

L6 is very impt for numbers; make an 8% diff.

In [None]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 3": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 2": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 1": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "name mover 0": [(0, 1), (0, 8), (0, 10), (3, 0), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "number mover": [(8, 8), (8, 11), (9, 1), (10, 7)],
        "number mover 4": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "number mover 3": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "number mover 2": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
        "number mover 1": [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

95.2208023071289

5.5 also makes 3% diff for numbers, but not entirely

## work backw once, 10%

In [35]:
# curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": lst,
        "name mover 3": lst,
        "name mover 2": lst,
        "name mover 1": lst,
        "name mover 0": lst,
        "number mover": lst,
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(layer, head) for layer in range(12) for head in range(12)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

100.0

In [31]:
# Start with full circuit
curr_circuit = [(layer, head) for layer in range(12) for head in range(12)]
# curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10), (8, 8), (8, 11), (9, 1), (10, 7)]
threshold = 10  # This is T, a %. if performance is less than T%, allow its removal

for layer in range(11, -1, -1):  # go thru all heads in a layer first
    for head in range(12):
        # Copying the curr_circuit so we can iterate over one and modify the other
        copy_circuit = curr_circuit.copy()

        # Temporarily removing the current tuple from the copied circuit
        copy_circuit.remove((layer, head))

        new_score = mean_ablate_by_lst(copy_circuit, model, dataset, dataset_2, print_output=False).item()

        # print((layer,head), new_score)
        # If the result is less than the threshold, remove the tuple from the original list
        if (100 - new_score) < threshold:
            curr_circuit.remove((layer, head))

            print("Removed:", (layer, head))
            print(new_score)
            print("\n")

Removed: (11, 0)
99.6868667602539


Removed: (11, 1)
99.76444244384766


Removed: (11, 2)
99.7970962524414


Removed: (11, 3)
99.88690185546875


Removed: (11, 4)
100.07891845703125


Removed: (11, 5)
100.17558288574219


Removed: (11, 6)
100.26310729980469


Removed: (11, 7)
100.11766815185547


Removed: (11, 8)
100.04216766357422


Removed: (11, 9)
99.93612670898438


Removed: (11, 10)
99.17417907714844


Removed: (11, 11)
99.58924102783203


Removed: (10, 0)
99.5635757446289


Removed: (10, 1)
99.53450775146484


Removed: (10, 2)
103.23390197753906


Removed: (10, 3)
103.28462219238281


Removed: (10, 4)
103.33689880371094


Removed: (10, 5)
102.95158386230469


Removed: (10, 6)
102.95907592773438


Removed: (10, 7)
92.94772338867188


Removed: (10, 8)
92.93510437011719


Removed: (10, 9)
92.80360412597656


Removed: (10, 10)
92.9897232055664


Removed: (10, 11)
92.92302703857422


Removed: (9, 0)
92.9098129272461


Removed: (9, 2)
92.8400650024414


Removed: (9, 3)
92.843505859375


In [37]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

0.4319198727607727

In [39]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

41.46913146972656

In [40]:
curr_circuit = [(7, 11), (8, 8), (8, 11), (9, 1)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

46.679283142089844

In [41]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

1.5266976356506348

In [42]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (8, 8), (8, 11)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

35.77225112915039

In [43]:
curr_circuit = [(9, 1)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

29.699447631835938

In [38]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)]
mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

90.38836669921875

In [33]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        "name mover 4": lst,
        "name mover 3": lst,
        "name mover 2": lst,
        "name mover 1": lst,
        "name mover 0": lst,
        "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

91.82762908935547

In [44]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

88.11691284179688

## ablate this circ on random words

In [48]:
pos_dict = {
    'table': 0,
    'lamp': 2,
    'pencil': 4,
    'hat': 6,
    'run': 8,
    'S1': 1,
    'S2': 3,
    'S3': 5,
    'S4': 7,
}

In [49]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'table': 'table',
            'lamp': 'lamp',
            'pencil': 'pencil',
            'hat': 'hat',
            'run': 'run',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'corr': str(i+4),
            'incorr': str(i+3),
            'text': f"table {i} Bob {i+1} pencil {i+2} hat {i+3} run"
        }
        prompts_list.append(prompt_dict)
    return prompts_list, pos_dict

prompts_list, pos_dict = generate_prompts_list(1, 11)
dataset = Dataset(prompts_list, pos_dict, model.tokenizer, S1_is_first=True)

In [50]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'table': 'table',
            'lamp': 'lamp',
            'pencil': 'pencil',
            'hat': 'hat',
            'run': 'run',
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'corr': str(i+3),
            'incorr': str(i+4),
            'text': f"table {i} Bob {i+1} pencil {i+2} hat {i+2} run"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
dataset_2 = Dataset(prompts_list_2, pos_dict, model.tokenizer, S1_is_first=True)

In [51]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]

def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        "name mover 4": "Eve",
        "name mover 3": "Don",
        "name mover 2": "Claire",
        "name mover 1": "Bob",
        "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

mean_ablate_by_lst(curr_circuit, model, dataset, dataset_2, print_output=False).item()

94.33404541015625

This works just as well; actually, it works better!

## try running new circ on pure digits

In [16]:
class Dataset:
    def __init__(self, prompts, tokenizer, S1_is_first=False):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.prompts
            ]
        )
        # all_ids = [prompt["TEMPLATE_IDX"] for prompt in self.ioi_prompts]
        all_ids = [0 for prompt in self.prompts] # only 1 template
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == id)[0])

        texts = [ prompt["text"] for prompt in self.prompts ]
        self.toks = torch.Tensor(self.tokenizer(texts, padding=True).input_ids).type(
            torch.int
        )
        self.io_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S5"])[0] for prompt in self.prompts
        ]
        self.s_tokenIDs = [
            self.tokenizer.encode(" " + prompt["S4"])[0] for prompt in self.prompts
        ]

        # word_idx: for every prompt, find the token index of each target token and "end"
        # word_idx is a tensor with an element for each prompt. The element is the targ token's ind at that prompt
        self.word_idx = {}
        for targ in [key for key in self.prompts[0].keys() if (key != 'text' and key != 'S5')]:
            targ_lst = []
            for prompt in self.prompts:
                input_text = prompt["text"]
                tokens = model.tokenizer.tokenize(input_text)
                if S1_is_first and targ == "S1":  # only use this if first token doesn't have space Ġ in front
                    target_token = prompt[targ]
                else:
                    target_token = "Ġ" + prompt[targ]
                target_index = tokens.index(target_token)
                targ_lst.append(target_index)
            self.word_idx[targ] = torch.tensor(targ_lst)

        targ_lst = []
        for prompt in self.prompts:
            input_text = prompt["text"]
            tokens = self.tokenizer.tokenize(input_text)
            end_token_index = len(tokens) - 1
            targ_lst.append(end_token_index)
        self.word_idx["end"] = torch.tensor(targ_lst)

    def __len__(self):
        return self.N

In [17]:
def generate_prompts_list(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+3),
            'S5': str(i+4),
            'text': f"{i} {i+1} {i+2} {i+3}"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list = generate_prompts_list(1, 11)
pureDataset = Dataset(prompts_list, model.tokenizer, S1_is_first=True)

In [18]:
def generate_prompts_list_corr(x ,y):
    prompts_list = []
    for i in range(x, y):
        prompt_dict = {
            'S1': str(i),
            'S2': str(i+1),
            'S3': str(i+2),
            'S4': str(i+2),
            'S5': str(i+3),
            'text': f"{i} {i+1} {i+2} {i+2}"
        }
        prompts_list.append(prompt_dict)
    return prompts_list

prompts_list_2 = generate_prompts_list_corr(1, 11)
pureDataset_2 = Dataset(prompts_list_2, model.tokenizer, S1_is_first=True)

In [65]:
model.reset_hooks(including_permanent=True)
model_abl = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=pureDataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
ioi_logits_minimal = model_abl(pureDataset.toks)
logits_to_ave_logit_diff_2(ioi_logits_minimal, pureDataset)

tensor(4.6238, device='cuda:0')

In [72]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        # "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover": lst,
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

In [77]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): -0.0635
Average logit difference (circuit / full) %: -1.3728


-1.3727796077728271

For some reason, this is really bad. It appears the early tokens are needed for positions??

In [79]:
curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6, 4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 1.2746
Average logit difference (circuit / full) %: 27.5656


27.565563201904297

In [75]:
curr_circuit = [(0, 1), (0, 8), (0, 10), (1, 5), (3, 0), (4, 4), (5, 5), (6, 0), (6, 1), (6, 2), (6, 6), (6, 10), (8, 8), (8, 11), (9, 1), (10, 7)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 1.1478
Average logit difference (circuit / full) %: 24.8250


24.8249568939209

In [74]:
curr_circuit = [(layer, head) for layer in range(12) for head in range(12)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 4.6238
Average logit difference (circuit / full) %: 100.0000


100.0

In [89]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        # "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover": lst,
        # "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6, 4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)]
# curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 1.2746
Average logit difference (circuit / full) %: 27.5656


27.565563201904297

In [80]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        "number mover 4": lst,
        "number mover 3": lst,
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): -0.2203
Average logit difference (circuit / full) %: -4.7644


-4.764449119567871

In [82]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": lst,
        "number mover 1": lst,
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): -0.2194
Average logit difference (circuit / full) %: -4.7443


-4.744296073913574

In [91]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
            # recall num mover 4 has WRONG INDEX due to repeat getting 3rd instead of last pos; this means last pos just keeps num mover
        "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): -0.2193
Average logit difference (circuit / full) %: -4.7424


-4.7424397468566895

In [93]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 1.2746
Average logit difference (circuit / full) %: 27.5656


27.565563201904297

In [95]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 1.2743
Average logit difference (circuit / full) %: 27.5606


27.56063461303711

In [96]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): -0.2194
Average logit difference (circuit / full) %: -4.7443


-4.744296073913574

So all the early heads are still impt for the end pos, unlike in among words

In [97]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): -0.2020
Average logit difference (circuit / full) %: -4.3693


-4.369297504425049

In [100]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 0.8827
Average logit difference (circuit / full) %: 19.0896


19.089550018310547

In [101]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(0, 1), (0, 10), (1, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 0.3068
Average logit difference (circuit / full) %: 6.6353


6.63534688949585

In [102]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
        "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 0.5878
Average logit difference (circuit / full) %: 12.7116


12.7116117477417

In [19]:
def mean_ablate_by_lst(lst, model, dataset, dataset_2, print_output=True):
    CIRCUIT = {
        # "name mover 4": lst,
        # "name mover 3": lst,
        # "name mover 2": lst,
        # "name mover 1": lst,
        # "name mover 0": lst,
        "number mover": [(3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover": lst,
        # "number mover 4": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        "number mover 3": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10), (7, 11), (8, 8), (8, 11), (9, 1)],
        # "number mover 2": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
        # "number mover 1": [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)],
    }

    SEQ_POS_TO_KEEP = {
        # "name mover 4": "Eve",
        # "name mover 3": "Don",
        # "name mover 2": "Claire",
        # "name mover 1": "Bob",
        # "name mover 0": "Adam",
        "number mover": "end",
        # "number mover 4": "S4",
        "number mover 3": "S3",
        "number mover 2": "S2",
        "number mover 1": "S1",
    }

    model.reset_hooks(including_permanent=True)  #must do this after running with mean ablation hook

    ioi_logits_original, ioi_cache = model.run_with_cache(dataset.toks)

    model = ioi_circuit_extraction.add_mean_ablation_hook(model, means_dataset=dataset_2, circuit=CIRCUIT, seq_pos_to_keep=SEQ_POS_TO_KEEP)
    ioi_logits_minimal = model(dataset.toks)

    orig_score = logits_to_ave_logit_diff_2(ioi_logits_original, dataset)
    new_score = logits_to_ave_logit_diff_2(ioi_logits_minimal, dataset)
    if print_output:
        print(f"Average logit difference (IOI dataset, using entire model): {orig_score:.4f}")
        print(f"Average logit difference (IOI dataset, only using circuit): {new_score:.4f}")
        print(f"Average logit difference (circuit / full) %: {100 * new_score / orig_score:.4f}")
    # return new_score
    return 100 * new_score / orig_score

curr_circuit = [(0, 1), (0, 10), (1, 5), (3, 0), (5, 5), (6, 2), (6,4), (6, 6), (6, 10)]
mean_ablate_by_lst(curr_circuit, model, pureDataset, pureDataset_2, print_output=True).item()

Average logit difference (IOI dataset, using entire model): 4.6238
Average logit difference (IOI dataset, only using circuit): 0.5761
Average logit difference (circuit / full) %: 12.4591


12.459147453308105