# Setup

In [None]:
# Notebook-wide flag. It is not referenced anywhere in this section;
# presumably it gates writing result files further down -- TODO confirm.
save_files = True

In [None]:
%%capture
%pip install git+https://github.com/neelnanda-io/TransformerLens.git

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
from fancy_einsum import einsum
import tqdm.notebook as tqdm
import random
from pathlib import Path
# import plotly.express as px
from torch.utils.data import DataLoader

from jaxtyping import Float, Int
from typing import List, Union, Optional
from functools import partial
import copy

import itertools
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
import dataclasses
import datasets
from IPython.display import HTML

import pickle
from google.colab import files

import matplotlib.pyplot as plt
import statistics

In [None]:
import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer #, HookedTransformerConfig, FactoredMatrix, ActivationCache

We turn automatic differentiation off, to save GPU memory, as this notebook focuses on model inference not model training.

In [None]:
# Inference only: switch autograd off globally for the rest of the notebook.
torch.set_grad_enabled(False)
# Use the GPU when one is visible. `device` is read as a module-level global by
# the generation helpers defined later (clean_gen, ablate_then_gen, ablate_auto_score).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import pdb

## Import functions from repo

In [None]:
!git clone https://github.com/apartresearch/seqcont_circuits.git
%cd /content/seqcont_circuits/src/iter_node_pruning

Cloning into 'seqcont_circuits'...
remote: Enumerating objects: 1022, done.[K
remote: Counting objects: 100% (488/488), done.[K
remote: Compressing objects: 100% (287/287), done.[K
remote: Total 1022 (delta 296), reused 378 (delta 190), pack-reused 534[K
Receiving objects: 100% (1022/1022), 18.76 MiB | 14.06 MiB/s, done.
Resolving deltas: 100% (659/659), done.
/content/seqcont_circuits/src/iter_node_pruning


In [None]:
## comment this out when debugging functions in colab to use funcs defined in colab

# don't import this
# # from dataset import Dataset

from metrics import *
from head_ablation_fns import *
from mlp_ablation_fns import *
from node_ablation_fns import *
from loop_node_ablation_fns import *

## Helper functions

In [None]:
import random


In [None]:
class Dataset:
    """Tokenized prompts plus per-prompt position lookups used by the ablation helpers.

    Args:
        prompts: list of dicts, each with at least the keys ``"text"``, ``"corr"``
            and ``"incorr"``.
        pos_dict: maps a position label (e.g. ``"S0"``) to a token index. The same
            index is used for every prompt.
        tokenizer: object supporting ``tokenizer(text).input_ids``,
            ``tokenizer(list_of_texts, padding=True).input_ids``,
            ``tokenizer.encode(text)`` and ``tokenizer.tokenize(text)``.

    Attributes:
        prompts, tokenizer: the constructor arguments, stored as given.
        N: number of prompts.
        max_len: longest ``input_ids`` length over all prompts.
        groups: list with one index array covering every prompt (single template).
        toks: integer tensor of padded ``input_ids``, one row per prompt.
        corr_tokenIDs / incorr_tokenIDs: first id of ``encode(prompt["corr"])`` /
            ``encode(prompt["incorr"])`` for each prompt.
        word_idx: dict mapping every ``pos_dict`` label, plus ``"end"``, to a tensor
            holding one index per prompt.

    Raises:
        ValueError: if ``prompts`` is empty (from ``max`` on an empty sequence).
    """

    def __init__(self, prompts, pos_dict, tokenizer):  # , S1_is_first=False
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)
        self.max_len = max(
            len(self.tokenizer(prompt["text"]).input_ids) for prompt in self.prompts
        )

        # Only one template is supported, so every prompt belongs to group 0.
        template_ids = np.array([0 for _ in self.prompts])
        self.groups = [np.where(template_ids == 0)[0]]

        texts = [prompt["text"] for prompt in self.prompts]
        # Build the id tensor as integers directly. Going through the float32
        # `torch.Tensor(...)` constructor would round ids above 2**24.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )
        self.corr_tokenIDs = [
            self.tokenizer.encode(prompt["corr"])[0] for prompt in self.prompts
        ]
        self.incorr_tokenIDs = [
            self.tokenizer.encode(prompt["incorr"])[0] for prompt in self.prompts
        ]

        # word_idx[label] holds, for each prompt, the token index of that label.
        # The index comes straight from pos_dict, so no per-prompt tokenization
        # is needed here.
        self.word_idx = {
            targ: torch.tensor([pos_dict[targ]] * self.N) for targ in pos_dict
        }

        # "end" is the index of the last entry of tokenize(text).
        # NOTE(review): max_len is measured on `input_ids`, while "end" is measured
        # on `tokenize()` output; if the tokenizer adds special tokens (e.g. BOS)
        # to input_ids only, the two are offset by one -- confirm which is intended.
        self.word_idx["end"] = torch.tensor(
            [len(self.tokenizer.tokenize(prompt["text"])) - 1 for prompt in self.prompts]
        )

    def __len__(self):
        """Number of prompts in the dataset."""
        return self.N

In [None]:
def generate_prompts_list_longer(text, tokens):
    """Wrap a single text in the prompt-dict format expected by ``Dataset``.

    The returned list has exactly one dict containing the fixed placeholder
    answers ``'corr'`` = ``'1'`` and ``'incorr'`` = ``'2'``, the original
    ``'text'``, and one ``'S<i>'`` entry per piece returned by
    ``model.tokenizer.tokenize(text)``.

    Args:
        text: prompt string.
        tokens: accepted for call-site compatibility; not used.

    Note:
        Reads the notebook-level global ``model`` for its tokenizer.
    """
    position_entries = {
        'S' + str(idx): piece
        for idx, piece in enumerate(model.tokenizer.tokenize(text))
    }
    return [{'corr': str(1), 'incorr': str(2), 'text': text, **position_entries}]

# Load Model

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Hugging Face hub id shared by the tokenizer, the HF weights, and the
# HookedTransformer wrapper built further down.
LLAMA_2_7B_CHAT_PATH = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = LlamaTokenizer.from_pretrained(LLAMA_2_7B_CHAT_PATH)
# tokenizer = LlamaTokenizer.from_pretrained(LLAMA_2_7B_CHAT_PATH, use_fast= False, add_prefix_space= False)
# The raw HF model is only an intermediate: it is handed to
# HookedTransformer.from_pretrained below and deleted afterwards.
hf_model = LlamaForCausalLM.from_pretrained(LLAMA_2_7B_CHAT_PATH, low_cpu_mem_usage=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
import transformer_lens.utils as utils
from transformer_lens.hook_points import HookPoint
from transformer_lens import HookedTransformer

In [None]:
# Wrap the already-loaded HF weights in a HookedTransformer. The wrapper is
# built on the CPU first and only moved to the GPU at the end of this cell.
# NOTE(review): fold_ln / center_writing_weights / center_unembed are all turned
# off, presumably so activations match the unprocessed HF model -- confirm.
model = HookedTransformer.from_pretrained(
    LLAMA_2_7B_CHAT_PATH,
    hf_model = hf_model,
    tokenizer = tokenizer,
    device = "cpu",
    fold_ln = False,
    center_writing_weights = False,
    center_unembed = False,
)

# Drop the notebook's reference to the HF model now that it has been wrapped.
del hf_model

model = model.to("cuda" if torch.cuda.is_available() else "cpu")

Loaded pretrained model meta-llama/Llama-2-7b-chat-hf into HookedTransformer
Moving model to device:  cuda


# new ablation functions

In [None]:
def get_heads_actv_mean(
    means_dataset: Dataset,
    model: HookedTransformer
) -> Float[Tensor, "layer batch seq head_idx d_head"]:
    '''
    Mean attention-head output (hook "z") over `means_dataset`, per template group.

    Runs the model once on `means_dataset.toks`, caching every activation whose
    hook name ends in "z". For each layer and each index group in
    `means_dataset.groups`, the cached activations are averaged over the prompts
    of that group and the result is written to every prompt of the group.

    The averaging loop had been commented out, so the function ran the full
    forward pass and then returned an all-zero tensor; it is restored here so the
    result matches the function's name and contract.

    Output: tensor of shape (n_layers, len(means_dataset), means_dataset.max_len,
    n_heads, d_head) on `model.cfg.device`.
    '''
    _, means_cache = model.run_with_cache(
        means_dataset.toks.long(),
        return_type=None,
        names_filter=lambda name: name.endswith("z"),
    )
    n_layers, n_heads, d_head = model.cfg.n_layers, model.cfg.n_heads, model.cfg.d_head
    batch, seq_len = len(means_dataset), means_dataset.max_len
    means = t.zeros(size=(n_layers, batch, seq_len, n_heads, d_head), device=model.cfg.device)

    for layer in range(n_layers):
        z_for_this_layer: Float[Tensor, "batch seq head d_head"] = means_cache[utils.get_act_name("z", layer)]
        for template_group in means_dataset.groups:
            z_for_this_template = z_for_this_layer[template_group]
            # Average over the prompts of this template only.
            z_means_for_this_template = einops.reduce(
                z_for_this_template, "batch seq head d_head -> seq head d_head", "mean"
            )
            # Broadcast the group mean back to every prompt of the group.
            means[layer, template_group] = z_means_for_this_template

    # Release the cached activations before returning.
    del means_cache

    return means

In [None]:
# def mask_circ_heads(
#     means_dataset: Dataset,
#     model: HookedTransformer,
#     circuit: Dict[str, List[Tuple[int, int]]],
#     seq_pos_to_keep: Dict[str, str],
# ) -> Dict[int, Bool[Tensor, "batch seq head"]]:
#     '''
#     Output: for each layer, a mask of circuit components that should not be ablated
#     '''
#     heads_and_posns_to_keep = {}
#     batch, seq, n_heads = len(means_dataset), means_dataset.max_len, model.cfg.n_heads

#     for layer in range(model.cfg.n_layers):

#         mask = t.zeros(size=(batch, seq, n_heads))

#         for (head_type, head_list) in circuit.items():
#             seq_pos = seq_pos_to_keep[head_type]
#             # if seq_pos == 'S7':
#             #     pdb.set_trace()
#             indices = means_dataset.word_idx[seq_pos] # modify this for key vs query pos. curr, this is query
#             for (layer_idx, head_idx) in head_list:
#                 if layer_idx == layer:
#                     # if indices.item() == 7:
#                     #     pdb.set_trace()
#                     mask[:, indices, head_idx] = 1
#                     # mask[:, :, head_idx] = 1  # keep L.H at all pos

#         heads_and_posns_to_keep[layer] = mask.bool()
#     # pdb.set_trace()
#     return heads_and_posns_to_keep

In [None]:
def mask_circ_heads(
    means_dataset: Dataset,
    model: HookedTransformer,
    circuit: Dict[str, List[Tuple[int, int]]],
    seq_pos_to_keep: Dict[str, str],
) -> Dict[int, Bool[Tensor, "batch seq head"]]:
    '''
    Build, for every layer, a boolean mask of the heads that must NOT be ablated.

    A head listed anywhere in `circuit` for a given layer is kept at every
    position. The mask's second dimension is `len(circuit)`, not the dataset's
    sequence length.

    `seq_pos_to_keep[head_type]` and `means_dataset.word_idx[...]` are still
    looked up for each circuit entry, so a missing key raises KeyError, but the
    resulting indices do not affect the mask.

    Output: dict mapping layer index -> bool tensor of shape
    (len(means_dataset), len(circuit), n_heads).
    '''
    n_batch = len(means_dataset)
    n_pos = len(circuit)
    n_heads = model.cfg.n_heads

    keep_masks = {}
    for layer in range(model.cfg.n_layers):
        layer_mask = t.zeros(size=(n_batch, n_pos, n_heads), dtype=t.bool)

        for head_type in circuit:
            pos_label = seq_pos_to_keep[head_type]
            means_dataset.word_idx[pos_label]  # lookup only; positions are currently unused
            heads_here = [head for (lyr, head) in circuit[head_type] if lyr == layer]
            for head in heads_here:
                layer_mask[:, :, head] = True

        keep_masks[layer] = layer_mask

    return keep_masks

In [None]:
def hook_func_mask_head(
    z: Float[Tensor, "batch seq head d_head"],
    hook: HookPoint,
    circuit: Dict[str, List[Tuple[int, int]]],
) -> Float[Tensor, "batch seq head d_head"]:
    '''
    Zero-ablate every attention head of this layer that is not listed in `circuit`.

    A head is kept (left untouched at all batch entries and positions) when
    `(hook.layer(), head_idx)` appears in any of the lists in `circuit`; the
    dictionary keys are ignored. Every other head's output is replaced by 0.

    The keep-mask is a single boolean vector over heads that is broadcast over
    batch, position and d_head, instead of a full (batch, seq, head) mask built
    on the CPU and copied to the device on every call.

    Args:
        z: per-head attention output for one layer.
        hook: hook point; only `hook.layer()` is used.
        circuit: lists of (layer, head) pairs to keep.

    Output: tensor shaped like `z` with the non-kept heads set to 0.
    '''
    layer = hook.layer()

    # De-duplicate first: the same head list is typically repeated under many keys.
    kept_heads = {
        head_idx
        for head_list in circuit.values()
        for (layer_idx, head_idx) in head_list
        if layer_idx == layer
    }

    keep_head = t.zeros(z.shape[2], dtype=t.bool)
    if kept_heads:
        keep_head[sorted(kept_heads)] = True
    keep_head = keep_head.to(z.device)

    # `where` (not multiplication) so NaN/inf in an ablated head cannot leak through.
    return t.where(keep_head[None, None, :, None], z, 0)

In [None]:
def add_ablation_hook_head(
    model: HookedTransformer,
    means_dataset: Dataset,
    circuit: Dict[str, List[Tuple[int, int]]],
    seq_pos_to_keep: Dict[str, str],
    is_permanent: bool = True,
) -> HookedTransformer:
    '''
    Install a hook that zero-ablates every attention head not listed in `circuit`.

    All existing hooks (including permanent ones) are removed first. The hook is
    `hook_func_mask_head`, attached to every hook point whose name ends in "z".

    `means_dataset` and `seq_pos_to_keep` are kept in the signature for
    compatibility but are not used. The hook zero-ablates whole heads at all
    positions, so the head means and the per-position keep-mask that were
    previously computed here (one full forward pass over the dataset) were
    discarded; that dead work has been removed.

    Output: the same `model` object, with the hook attached.
    '''
    model.reset_hooks(including_permanent=True)

    hook_fn = partial(hook_func_mask_head, circuit=circuit)

    model.add_hook(lambda name: name.endswith("z"), hook_fn, is_permanent=is_permanent)
    return model

In [None]:
# from dataset import Dataset
from transformer_lens import HookedTransformer, utils
from transformer_lens.hook_points import HookPoint
import einops
from functools import partial
import torch as t
from torch import Tensor
from typing import Dict, Tuple, List
from jaxtyping import Float, Bool

# from head_ablation_fns import *
# from mlp_ablation_fns import *

def add_ablation_hook_MLP_head(
    model: HookedTransformer,
    means_dataset: Dataset,
    heads_lst, mlp_lst,
    is_permanent: bool = True,
) -> HookedTransformer:
    '''
    Install a hook that zero-ablates every attention head not in `heads_lst`.

    All existing hooks (including permanent ones) are removed first. The hook is
    `hook_func_mask_head`, attached to every hook point whose name ends in "z".

    Args:
        model: model to hook (modified in place and returned).
        means_dataset: only `prompts[0]['text']` is read, to count token positions.
        heads_lst: (layer, head) pairs to keep un-ablated.
        mlp_lst: accepted for compatibility but ignored -- MLP ablation is
            currently disabled, so no "mlp_out" hook is installed.
        is_permanent: passed through to `model.add_hook`.

    Previously this function also ran `get_heads_actv_mean` (a full forward pass
    with activation caching) and `mask_circ_heads` on every call and discarded
    both results, because the hook zero-ablates whole heads. That dead work has
    been removed.
    '''
    # One circuit entry per token position of the first prompt, all pointing at
    # the same head list; the hook keeps these heads at every position.
    num_pos = len(model.tokenizer(means_dataset.prompts[0]['text']).input_ids)
    CIRCUIT = {'S' + str(i): heads_lst for i in range(num_pos)}

    model.reset_hooks(including_permanent=True)

    hook_fn = partial(hook_func_mask_head, circuit=CIRCUIT)
    model.add_hook(lambda name: name.endswith("z"), hook_fn, is_permanent=is_permanent)

    return model

In [None]:
def all_entries_true(tensor_dict):
    """Return True iff every tensor in `tensor_dict` is entirely truthy.

    An empty dict yields True. Keys are ignored.
    """
    return all(bool(torch.all(values).item()) for values in tensor_dict.values())

# Ablation functions for multi-token answers

In [None]:
def clean_gen(model, clean_text, corr_ans, max_new_tokens=5):
    """Greedy-decode with the un-ablated model and count tokens until `corr_ans` appears.

    All hooks (including permanent ones) are removed first, so this must be called
    before the ablation hooks are installed for a prompt.

    Args:
        model: HookedTransformer-like object (`reset_hooks`, `to_tokens`,
            `to_string`, callable returning logits).
        clean_text: prompt string.
        corr_ans: expected continuation. It is compared against the concatenation
            of the individually decoded tokens, with no space substitution.
        max_new_tokens: generation budget. Defaults to 5, the previously
            hard-coded value.

    Returns:
        Number of tokens generated when the decoded text first equals `corr_ans`,
        or `max_new_tokens` if it never does.

    Note:
        Reads the notebook-level global `device`.
    """
    model.reset_hooks(including_permanent=True)  # must do this after running with an ablation hook
    tokens = model.to_tokens(clean_text).to(device)

    ans_so_far = ''
    corr_ans_tokLen = 0
    for _ in range(max_new_tokens):
        logits = model(tokens)
        next_token = logits[0, -1].argmax(dim=-1)  # greedy choice at the last position

        ans_so_far += model.to_string(next_token)
        corr_ans_tokLen += 1
        if ans_so_far == corr_ans:
            break

        # Extend the input with the token just generated.
        tokens = torch.cat([tokens, next_token[None, None]], dim=-1)
    return corr_ans_tokLen

In [None]:
def ablate_then_gen(model, clean_text, corr_text, heads_not_ablate, mlps_not_ablate, corr_ans_tokLen):
    """Install the head-ablation hook, greedy-decode `corr_ans_tokLen` tokens, and print the result.

    Args:
        model: HookedTransformer; its hooks are reset and replaced by the ablation hook.
        clean_text: prompt to continue.
        corr_text: text used to build the dataset passed to
            `add_ablation_hook_MLP_head`.
        heads_not_ablate: (layer, head) pairs to leave intact.
        mlps_not_ablate: forwarded to `add_ablation_hook_MLP_head`.
        corr_ans_tokLen: number of tokens to generate.

    Returns:
        None. The prompt plus all generated tokens is printed once, after the
        last token has been appended.

    Changes from the previous version:
      * The sequence used to be printed before the final token was appended, so a
        one-token answer printed only the prompt. It is now printed afterwards.
      * The trailing forward pass whose logits were never used is gone.
      * Unused locals were removed.

    Note:
        Reads the notebook-level global `device`.
    """
    tokens = model.to_tokens(clean_text).to(device)

    corr_tokens = model.to_tokens(corr_text).to(device)
    prompts_list_2 = generate_prompts_list_longer(corr_text, corr_tokens)

    model.reset_hooks(including_permanent=True)  # must do this after running with an ablation hook
    num_pos = len(model.tokenizer(prompts_list_2[0]['text']).input_ids)
    pos_dict = {'S' + str(i): i for i in range(num_pos)}
    dataset_2 = Dataset(prompts_list_2, pos_dict, model.tokenizer)
    model = add_ablation_hook_MLP_head(model, dataset_2, heads_not_ablate, mlps_not_ablate)

    for step in range(corr_ans_tokLen):
        logits = model(tokens)
        next_token = logits[0, -1].argmax(dim=-1)  # greedy choice at the last position
        tokens = torch.cat([tokens, next_token[None, None]], dim=-1)

        if step == corr_ans_tokLen - 1:
            print(model.to_string(tokens))

In [None]:
# Randomly choose `num_pairs` heads such that fewer than `max_overlap` of them are in heads_of_circ
def choose_heads_to_remove(filtered_pairs, heads_of_circ, num_pairs=50, max_overlap=10):
    """Sample `num_pairs` distinct heads with fewer than `max_overlap` circuit heads among them.

    Uses rejection sampling: draw with `random.sample` and retry until the number
    of drawn heads that are also in `heads_of_circ` is below `max_overlap`.

    Args:
        filtered_pairs: candidate (layer, head) pairs to sample from.
        heads_of_circ: heads whose presence in the sample is limited.
        num_pairs: sample size.
        max_overlap: exclusive upper bound on the overlap count.

    Returns:
        List of `num_pairs` pairs drawn from `filtered_pairs`.

    Raises:
        ValueError: if the constraint can never be met (previously this looped
            forever), or -- from `random.sample` -- if `num_pairs` exceeds
            `len(filtered_pairs)`.
    """
    # Smallest overlap any sample can have: whatever cannot be filled from
    # candidates outside the circuit has to come from inside it.
    n_outside = sum(1 for head in filtered_pairs if head not in heads_of_circ)
    min_overlap = max(0, num_pairs - n_outside)
    if min_overlap >= max_overlap:
        raise ValueError(
            f"cannot draw {num_pairs} heads with fewer than {max_overlap} circuit heads: "
            f"only {n_outside} candidates lie outside the circuit"
        )

    while True:
        head_to_remove = random.sample(filtered_pairs, num_pairs)
        overlap_count = sum(1 for head in head_to_remove if head in heads_of_circ)
        if overlap_count < max_overlap:
            return head_to_remove

In [None]:
def ablate_auto_score(model, clean_text, corr_text, heads_not_ablate, mlps_not_ablate, correct_ans_tokLen):  # correct_ans
    """Greedy-decode `correct_ans_tokLen` tokens under head ablation and return the decoded text.

    Args:
        model: HookedTransformer; its hooks are reset and replaced by the ablation hook.
        clean_text: prompt to continue.
        corr_text: text used to build the dataset passed to
            `add_ablation_hook_MLP_head`.
        heads_not_ablate: (layer, head) pairs to leave intact.
        mlps_not_ablate: forwarded to `add_ablation_hook_MLP_head`.
        correct_ans_tokLen: number of tokens to generate (as measured by `clean_gen`).

    Returns:
        Concatenation of `model.to_string(token)` for each generated token.

    The answer is assembled exactly as `clean_gen` assembles it (no substitution of
    `' '` for empty decoded tokens). Callers compare this string for equality with
    the answer `clean_gen` matched, so the old substitution could turn an identical
    generation into a mismatch. The in-loop `correct_ans_tokLen += 1`, which had no
    effect on the iteration count, and other unused locals are removed.

    Note:
        Reads the notebook-level global `device`.
    """
    tokens = model.to_tokens(clean_text).to(device)

    corr_tokens = model.to_tokens(corr_text).to(device)
    prompts_list_2 = generate_prompts_list_longer(corr_text, corr_tokens)

    model.reset_hooks(including_permanent=True)  # must do this after running with an ablation hook
    num_pos = len(model.tokenizer(prompts_list_2[0]['text']).input_ids)
    pos_dict = {'S' + str(i): i for i in range(num_pos)}
    dataset_2 = Dataset(prompts_list_2, pos_dict, model.tokenizer)
    model = add_ablation_hook_MLP_head(model, dataset_2, heads_not_ablate, mlps_not_ablate)

    ans_so_far = ''
    for _ in range(correct_ans_tokLen):
        logits = model(tokens)
        next_token = logits[0, -1].argmax(dim=-1)  # greedy choice at the last position
        ans_so_far += model.to_string(next_token)
        tokens = torch.cat([tokens, next_token[None, None]], dim=-1)
    return ans_so_far

# Define circuits

In [None]:
# from Llama2_numerals_1to10.ipynb
nums_1to9 = [(0, 2), (0, 5), (0, 6), (0, 15), (1, 15), (1, 28), (2, 13), (2, 24), (3, 24), (4, 3), (4, 16), (5, 11), (5, 13), (5, 15), (5, 16), (5, 23), (5, 25), (5, 27), (6, 11), (6, 14), (6, 20), (6, 23), (6, 24), (6, 26), (6, 28), (6, 30), (6, 31), (7, 0), (7, 13), (7, 21), (7, 30), (8, 0), (8, 2), (8, 12), (8, 15), (8, 26), (8, 27), (8, 30), (8, 31), (9, 15), (9, 16), (9, 23), (9, 26), (9, 27), (9, 29), (9, 31), (10, 1), (10, 13), (10, 18), (10, 23), (10, 29), (11, 7), (11, 8), (11, 9), (11, 17), (11, 18), (11, 25), (11, 28), (12, 18), (12, 19), (12, 23), (12, 27), (13, 6), (13, 11), (13, 20), (14, 18), (14, 19), (14, 20), (14, 21), (16, 0), (18, 19), (18, 21), (18, 25), (18, 26), (18, 31), (19, 28), (20, 17), (21, 0), (21, 2), (22, 18), (22, 20), (22, 25), (23, 27), (26, 2)]
len(nums_1to9)

84

In [None]:
# nw_circ = [(0, 1), (0, 4), (0, 6), (0, 7), (0, 8), (0, 10), (0, 11), (0, 12), (1, 16), (1, 24), (1, 27), (1, 28), (2, 2), (2, 5), (2, 8), (2, 24), (2, 30), (3, 7), (3, 14), (3, 19), (3, 23), (4, 3), (5, 16), (5, 25), (6, 11), (6, 14), (7, 0), (7, 30), (8, 0), (8, 2), (8, 3), (8, 4), (8, 6), (8, 21), (8, 31), (9, 1), (9, 3), (9, 7), (9, 11), (9, 29), (9, 31), (10, 13), (10, 18), (10, 23), (10, 24), (10, 25), (10, 27), (11, 18), (11, 28), (12, 18), (12, 26), (13, 11), (13, 17), (13, 18), (13, 19), (13, 20), (13, 21), (13, 23), (14, 7), (14, 14), (15, 25), (15, 28), (16, 0), (16, 12), (16, 14), (16, 15), (16, 16), (16, 19), (16, 24), (16, 29), (17, 17), (17, 23), (17, 31), (18, 31), (19, 12), (20, 17), (27, 20), (27, 25), (27, 27), (27, 31), (28, 5), (29, 5)]
# ordered from most to least important, based on how much performance changes when the head is ablated
nw_circ = [(20, 17), (5, 25), (16, 0), (29, 5), (3, 19), (6, 11), (15, 25), (8, 0), (16, 24), (8, 4), (7, 0), (6, 14), (16, 29), (5, 16), (12, 26), (4, 3), (3, 7), (7, 30), (11, 28), (28, 5), (17, 31), (13, 11), (13, 20), (12, 18), (1, 27), (10, 13), (18, 31), (8, 6), (9, 1), (0, 4), (2, 2), (9, 11), (19, 12), (1, 16), (13, 17), (9, 7), (11, 18), (2, 24), (10, 18), (9, 31), (9, 29), (2, 30), (2, 5), (1, 24), (2, 8), (15, 28), (27, 31), (16, 14), (3, 23), (3, 14), (10, 23), (27, 20), (8, 3), (14, 7), (14, 14), (16, 15), (8, 2), (17, 17), (0, 1), (10, 27), (16, 19), (0, 8), (0, 12), (1, 28), (0, 11), (17, 23), (0, 10), (0, 6), (13, 19), (8, 31), (10, 24), (16, 12), (13, 23), (13, 21), (27, 27), (9, 3), (27, 25), (16, 16), (8, 21), (0, 7), (13, 18), (10, 25)]
len(nw_circ)

82

In [None]:
# impt_months_heads = ([(23, 17), (17, 11), (16, 0), (26, 14), (18, 9), (5, 25), (22, 20), (6, 24), (26, 9), (12, 18), (13, 20), (19, 12), (27, 29), (13, 14), (16, 14), (12, 26), (19, 30), (16, 18), (31, 27), (26, 28), (16, 1), (18, 1), (19, 28), (18, 31), (29, 4), (17, 0), (14, 1), (17, 12), (12, 15), (28, 16), (10, 1), (16, 19), (9, 27), (30, 1), (19, 27), (0, 3), (15, 11), (21, 3), (11, 19), (12, 0), (23, 11), (8, 14), (16, 8), (22, 13), (13, 3), (4, 19), (14, 15), (12, 20), (19, 16), (18, 5)])
months_circ = [(20, 17), (6, 11), (16, 0), (5, 15), (17, 11), (23, 16), (5, 25), (7, 0), (26, 14), (6, 14), (12, 22), (8, 4), (12, 15), (16, 29), (15, 25), (5, 16), (18, 31), (14, 7), (11, 18), (4, 12), (3, 19), (12, 2), (11, 28), (4, 3), (18, 9), (8, 14), (12, 3), (11, 2), (10, 13), (4, 16), (1, 22), (11, 16), (3, 15), (13, 31), (2, 4), (2, 16), (8, 13), (0, 13), (8, 15), (12, 28), (1, 5), (0, 4), (0, 25), (3, 24), (13, 11), (1, 24), (8, 16), (13, 8), (3, 26), (0, 6), (3, 23), (1, 3), (14, 3), (8, 19), (8, 12), (14, 2), (8, 5), (1, 28), (8, 20), (2, 30), (8, 6), (10, 1), (13, 20), (19, 27)]
len(months_circ)

64

In [None]:
# (layer, head) pairs that appear in all three circuits defined above.
intersect_all = list(set(nums_1to9) & set(nw_circ) & set(months_circ))
len(intersect_all)

16

In [None]:
# (layer, head) pairs that appear in at least one of the three circuits defined above.
union_all = list(set(nums_1to9) | set(nw_circ) | set(months_circ))
len(union_all)

172

# Automatic scoring functions

In [None]:
def ablate_circ_autoScore(model, circuit, sequences_as_str, next_members):
    """Ablate the heads in `circuit` and measure how often the model still answers correctly.

    For each (prompt, answer) pair the un-ablated model first determines the
    answer's token length via `clean_gen`; the model is then run with every head
    in `circuit` zero-ablated, and the generated text is compared with the answer.

    Args:
        model: HookedTransformer. The layer and head counts come from `model.cfg`
            instead of a hard-coded 32 x 32.
        circuit: iterable of (layer, head) pairs to ablate.
        sequences_as_str: prompts.
        next_members: expected continuations, paired with the prompts by `zip`.

    Returns:
        (perc_score, list_outputs): the fraction of `next_members` reproduced
        exactly (0.0 when `next_members` is empty), and the ablated outputs in
        prompt order.
    """
    corr_text = "5 3 9"

    # Loop-invariant: which heads survive depends only on the circuit.
    heads_to_remove = {tuple(pair) for pair in circuit}
    heads_not_ablate = [
        (layer, head)
        for layer in range(model.cfg.n_layers)
        for head in range(model.cfg.n_heads)
        if (layer, head) not in heads_to_remove
    ]
    mlps_not_ablate = list(range(model.cfg.n_layers))

    list_outputs = []
    score = 0
    for clean_text, correct_ans in zip(sequences_as_str, next_members):
        correct_ans_tokLen = clean_gen(model, clean_text, correct_ans)

        output_after_ablate = ablate_auto_score(model, clean_text, corr_text, heads_not_ablate, mlps_not_ablate, correct_ans_tokLen)
        list_outputs.append(output_after_ablate)
        print(correct_ans, output_after_ablate)
        if correct_ans == output_after_ablate:
            score += 1

    perc_score = score / len(next_members) if len(next_members) else 0.0
    return perc_score, list_outputs

In [None]:
def ablate_randcirc_autoScore(model, sequences_as_str, next_members, num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap, n_layers=32, n_heads=32):
    """Score the model under repeated ablation of randomly chosen heads (random baseline).

    For every prompt, ``num_rand_runs`` head sets are obtained from
    ``choose_heads_to_remove`` and each set is ablated via ``ablate_auto_score``
    (all MLP layers are kept). The per-prompt score is the fraction of runs whose
    output equals the expected answer exactly; it is printed for every prompt.

    Args:
        model: Model object, forwarded unchanged to ``clean_gen`` and ``ablate_auto_score``.
        sequences_as_str: Prompt strings.
        next_members: Expected completion string for each prompt, in the same order.
        num_rand_runs: Number of random ablations per prompt; must be positive.
        heads_not_overlap: ``(layer, head)`` pairs removed from the candidate pool
            before sampling; also forwarded to ``choose_heads_to_remove``.
        num_heads_rand: Forwarded to ``choose_heads_to_remove`` (number of heads to pick,
            going by the name and the original inline comment).
        num_not_overlap: Forwarded to ``choose_heads_to_remove`` (overlap limit, going by
            the original inline comment) -- TODO confirm against that helper.
        n_layers: Number of layers used to enumerate heads and MLPs. Defaults to 32,
            the value that used to be hard-coded.
        n_heads: Number of heads per layer. Defaults to 32 for the same reason.

    Returns:
        ``(perc_score, list_outputs)``: the mean of the per-prompt scores, and a list
        that is always empty (per-run outputs are not collected; the slot is kept so
        the return shape matches ``ablate_circ_autoScore``).

    Raises:
        ValueError: If ``num_rand_runs`` is not positive, if there are no prompts, or if
            ``sequences_as_str`` and ``next_members`` have different lengths.
    """
    sequences_as_str = list(sequences_as_str)
    next_members = list(next_members)
    if len(sequences_as_str) != len(next_members):
        raise ValueError(
            f"sequences_as_str has {len(sequences_as_str)} items but next_members has {len(next_members)}"
        )
    if not next_members:
        raise ValueError("at least one prompt is required to compute a score")
    if num_rand_runs <= 0:
        raise ValueError(f"num_rand_runs must be positive, got {num_rand_runs}")

    corr_text = "5 3 9"  # second text passed to ablate_auto_score (presumably the corrupted prompt)
    list_outputs = []  # intentionally never filled, see Returns

    # Both lists are independent of the prompt and of the run, so build them once.
    all_possible_pairs = [(layer, head) for layer in range(n_layers) for head in range(n_heads)]
    filtered_pairs = [pair for pair in all_possible_pairs if pair not in heads_not_overlap]

    all_scores = []
    for clean_text, correct_ans in zip(sequences_as_str, next_members):
        prompt_score = 0
        correct_ans_tokLen = clean_gen(model, clean_text, correct_ans)
        for _ in range(num_rand_runs):
            # Hand the helper its own copy of the pool, as the original rebuilt it on every run.
            head_to_remove = choose_heads_to_remove(list(filtered_pairs), heads_not_overlap, num_heads_rand, num_not_overlap)

            heads_not_ablate = [x for x in all_possible_pairs if x not in head_to_remove]
            mlps_not_ablate = list(range(n_layers))

            output_after_ablate = ablate_auto_score(model, clean_text, corr_text, heads_not_ablate, mlps_not_ablate, correct_ans_tokLen)
            if correct_ans == output_after_ablate:
                prompt_score += 1
        print(prompt_score / num_rand_runs)
        all_scores.append(prompt_score / num_rand_runs)

    perc_score = sum(all_scores) / len(next_members)
    return perc_score, list_outputs

In [None]:
# import random

# def gen_single_addition_prompts(num_prompts):
#     sequences = []
#     next_members = []

#     single_digit_additions = [(random.randint(0, 9), random.randint(0, 9)) for _ in range(num_prompts)]
#     # double_digit_additions = [(random.randint(10, 99), random.randint(10, 99)) for _ in range(num_prompts)]

#     for a, b in single_digit_additions:
#         prompt = f"{a} + {b} = "
#         answer = str(a + b)
#         sequences.append(prompt)
#         next_members.append(answer)

#     # for a, b in double_digit_additions:
#     #     prompt = f"{a} + {b} = "
#     #     answer = str(a + b)
#     #     sequences.append(prompt)
#     #     next_members.append(answer)

#     print("Sequences:")
#     print(sequences)
#     print("\nNext Members:")
#     print(next_members)
#     return sequences, next_members

import random

def gen_single_addition_prompts(num_prompts):
    """Generate ``num_prompts`` distinct single-digit addition prompts.

    Operand pairs ``(a, b)`` with ``a, b`` in 0..9 are drawn at random and repeats are
    rejected, so every prompt is unique. ``(a, b)`` and ``(b, a)`` count as different
    prompts. The generated lists are also printed.

    Args:
        num_prompts: Number of prompts to generate; at most 100, the number of
            distinct ordered operand pairs.

    Returns:
        ``(sequences, next_members)``: prompts of the form ``"a + b = "`` and the
        matching sums as strings.

    Raises:
        ValueError: If ``num_prompts`` exceeds 100. Without this check the rejection
            loop below could never finish.
    """
    max_unique = 10 * 10  # ordered pairs of digits 0..9
    if num_prompts > max_unique:
        raise ValueError(
            f"num_prompts={num_prompts} exceeds the {max_unique} unique single-digit addition prompts"
        )

    sequences = []
    next_members = []
    seen_pairs = set()

    while len(sequences) < num_prompts:
        a = random.randint(0, 9)
        b = random.randint(0, 9)
        if (a, b) not in seen_pairs:
            seen_pairs.add((a, b))
            prompt = f"{a} + {b} = "
            answer = str(a + b)
            sequences.append(prompt)
            next_members.append(answer)

    print("Sequences:")
    print(sequences)
    print("\nNext Members:")
    print(next_members)
    return sequences, next_members

In [None]:
# import random

# def gen_double_addition_prompts(num_prompts):
#     sequences = []
#     next_members = []

#     # single_digit_additions = [(random.randint(0, 9), random.randint(0, 9)) for _ in range(num_prompts)]
#     double_digit_additions = [(random.randint(10, 99), random.randint(10, 99)) for _ in range(num_prompts)]

#     # for a, b in single_digit_additions:
#     #     prompt = f"{a} + {b} = "
#     #     answer = str(a + b)
#     #     sequences.append(prompt)
#     #     next_members.append(answer)

#     for a, b in double_digit_additions:
#         prompt = f"{a} + {b} = "
#         answer = str(a + b)
#         sequences.append(prompt)
#         next_members.append(answer)

#     print("Sequences:")
#     print(sequences)
#     print("\nNext Members:")
#     print(next_members)
#     return sequences, next_members

import random

def gen_double_addition_prompts(num_prompts):
    """Generate ``num_prompts`` distinct two-digit addition prompts.

    Operand pairs ``(a, b)`` with ``a, b`` in 10..99 are drawn at random and repeats are
    rejected, so every prompt is unique. ``(a, b)`` and ``(b, a)`` count as different
    prompts. The generated lists are also printed.

    Args:
        num_prompts: Number of prompts to generate; at most 8100, the number of
            distinct ordered operand pairs.

    Returns:
        ``(sequences, next_members)``: prompts of the form ``"a + b = "`` and the
        matching sums as strings.

    Raises:
        ValueError: If ``num_prompts`` exceeds 8100. Without this check the rejection
            loop below could never finish.
    """
    max_unique = 90 * 90  # ordered pairs of numbers 10..99
    if num_prompts > max_unique:
        raise ValueError(
            f"num_prompts={num_prompts} exceeds the {max_unique} unique double-digit addition prompts"
        )

    sequences = []
    next_members = []
    seen_pairs = set()

    while len(sequences) < num_prompts:
        a = random.randint(10, 99)
        b = random.randint(10, 99)
        if (a, b) not in seen_pairs:
            seen_pairs.add((a, b))
            prompt = f"{a} + {b} = "
            answer = str(a + b)
            sequences.append(prompt)
            next_members.append(answer)

    print("Sequences:")
    print(sequences)
    print("\nNext Members:")
    print(next_members)
    return sequences, next_members

In [None]:
# import random

# def gen_s_subtraction_prompts(num_prompts):
#     sequences = []
#     next_members = []

#     single_digit_subtractions = [(random.randint(0, 9), random.randint(0, 9)) for _ in range(num_prompts)]
#     # double_digit_subtractions = [(random.randint(10, 99), random.randint(10, 99)) for _ in range(num_prompts)]

#     for a, b in single_digit_subtractions:
#         # Ensure a is greater than or equal to b to avoid negative results
#         a, b = max(a, b), min(a, b)
#         prompt = f"{a} - {b} = "
#         answer = str(a - b)
#         sequences.append(prompt)
#         next_members.append(answer)

#     # for a, b in double_digit_subtractions:
#     #     # Ensure a is greater than or equal to b to avoid negative results
#     #     a, b = max(a, b), min(a, b)
#     #     prompt = f"{a} - {b} = "
#     #     answer = str(a - b)
#     #     sequences.append(prompt)
#     #     next_members.append(answer)

#     print("Sequences:")
#     print(sequences)
#     print("\nNext Members:")
#     print(next_members)
#     return sequences, next_members

import random

def gen_s_subtraction_prompts(num_prompts):
    """Generate ``num_prompts`` distinct single-digit subtraction prompts.

    Two digits in 0..9 are drawn at random and ordered so the larger comes first,
    which keeps every result non-negative. Repeated pairs are rejected, so every
    prompt is unique. The generated lists are also printed.

    Args:
        num_prompts: Number of prompts to generate; at most 55, the number of
            distinct pairs ``(a, b)`` with ``9 >= a >= b >= 0``.

    Returns:
        ``(sequences, next_members)``: prompts of the form ``"a - b = "`` and the
        matching differences as strings.

    Raises:
        ValueError: If ``num_prompts`` exceeds 55. Without this check the rejection
            loop below could never finish.
    """
    max_unique = 10 * 11 // 2  # pairs with a >= b over the digits 0..9
    if num_prompts > max_unique:
        raise ValueError(
            f"num_prompts={num_prompts} exceeds the {max_unique} unique single-digit subtraction prompts"
        )

    sequences = []
    next_members = []
    seen_pairs = set()

    while len(sequences) < num_prompts:
        a = random.randint(0, 9)
        b = random.randint(0, 9)
        a, b = max(a, b), min(a, b)  # Ensure a is greater than or equal to b to avoid negative results
        if (a, b) not in seen_pairs:
            seen_pairs.add((a, b))
            prompt = f"{a} - {b} = "
            answer = str(a - b)
            sequences.append(prompt)
            next_members.append(answer)

    print("Sequences:")
    print(sequences)
    print("\nNext Members:")
    print(next_members)
    return sequences, next_members

In [None]:
# import random

# def gen_d_subtraction_prompts(num_prompts):
#     sequences = []
#     next_members = []

#     # single_digit_subtractions = [(random.randint(0, 9), random.randint(0, 9)) for _ in range(num_prompts)]
#     double_digit_subtractions = [(random.randint(10, 99), random.randint(10, 99)) for _ in range(num_prompts)]

#     # for a, b in single_digit_subtractions:
#     #     # Ensure a is greater than or equal to b to avoid negative results
#     #     a, b = max(a, b), min(a, b)
#     #     prompt = f"{a} - {b} = "
#     #     answer = str(a - b)
#     #     sequences.append(prompt)
#     #     next_members.append(answer)

#     for a, b in double_digit_subtractions:
#         # Ensure a is greater than or equal to b to avoid negative results
#         a, b = max(a, b), min(a, b)
#         prompt = f"{a} - {b} = "
#         answer = str(a - b)
#         sequences.append(prompt)
#         next_members.append(answer)

#     print("Sequences:")
#     print(sequences)
#     print("\nNext Members:")
#     print(next_members)
#     return sequences, next_members

import random

def gen_d_subtraction_prompts(num_prompts):
    """Generate ``num_prompts`` distinct two-digit subtraction prompts.

    Two numbers in 10..99 are drawn at random and ordered so the larger comes first,
    which keeps every result non-negative. Repeated pairs are rejected, so every
    prompt is unique. The generated lists are also printed.

    Args:
        num_prompts: Number of prompts to generate; at most 4095, the number of
            distinct pairs ``(a, b)`` with ``99 >= a >= b >= 10``.

    Returns:
        ``(sequences, next_members)``: prompts of the form ``"a - b = "`` and the
        matching differences as strings.

    Raises:
        ValueError: If ``num_prompts`` exceeds 4095. Without this check the rejection
            loop below could never finish.
    """
    max_unique = 90 * 91 // 2  # pairs with a >= b over the numbers 10..99
    if num_prompts > max_unique:
        raise ValueError(
            f"num_prompts={num_prompts} exceeds the {max_unique} unique double-digit subtraction prompts"
        )

    sequences = []
    next_members = []
    seen_pairs = set()

    while len(sequences) < num_prompts:
        a = random.randint(10, 99)
        b = random.randint(10, 99)
        a, b = max(a, b), min(a, b)  # Ensure a is greater than or equal to b to avoid negative results
        if (a, b) not in seen_pairs:
            seen_pairs.add((a, b))
            prompt = f"{a} - {b} = "
            answer = str(a - b)
            sequences.append(prompt)
            next_members.append(answer)

    print("Sequences:")
    print(sequences)
    print("\nNext Members:")
    print(next_members)
    return sequences, next_members


# addition

In [None]:
# NOTE(review): gen_addition_prompts is not defined in the visible part of this notebook
# (only gen_single_addition_prompts / gen_double_addition_prompts are) -- confirm it still
# exists, otherwise this cell raises NameError on a fresh kernel.
num_prompts = 50
sequences_as_str, next_members = gen_addition_prompts(num_prompts)

Sequences:
['1 + 1 = ', '9 + 9 = ', '7 + 7 = ', '1 + 9 = ', '4 + 1 = ', '6 + 0 = ', '4 + 0 = ', '0 + 4 = ', '3 + 3 = ', '7 + 7 = ', '1 + 9 = ', '8 + 3 = ', '3 + 2 = ', '5 + 1 = ', '2 + 4 = ', '2 + 5 = ', '1 + 6 = ', '8 + 2 = ', '8 + 0 = ', '9 + 9 = ', '9 + 8 = ', '8 + 9 = ', '2 + 9 = ', '7 + 1 = ', '2 + 9 = ', '5 + 1 = ', '5 + 5 = ', '6 + 0 = ', '5 + 5 = ', '5 + 0 = ', '5 + 4 = ', '9 + 4 = ', '4 + 8 = ', '8 + 2 = ', '0 + 4 = ', '8 + 7 = ', '3 + 7 = ', '7 + 5 = ', '8 + 0 = ', '3 + 8 = ', '1 + 9 = ', '7 + 2 = ', '4 + 9 = ', '6 + 4 = ', '5 + 6 = ', '0 + 9 = ', '5 + 6 = ', '7 + 2 = ', '5 + 8 = ', '2 + 4 = ', '78 + 98 = ', '71 + 87 = ', '71 + 59 = ', '15 + 82 = ', '74 + 16 = ', '28 + 78 = ', '21 + 13 = ', '20 + 66 = ', '23 + 36 = ', '49 + 50 = ', '48 + 71 = ', '33 + 84 = ', '76 + 15 = ', '46 + 59 = ', '25 + 41 = ', '71 + 29 = ', '50 + 92 = ', '26 + 18 = ', '94 + 99 = ', '96 + 25 = ', '13 + 48 = ', '52 + 10 = ', '85 + 61 = ', '75 + 50 = ', '53 + 42 = ', '78 + 53 = ', '82 + 51 = ', '89 + 99 =

In [None]:
# all_heads = [(layer, head) for layer in range(32) for head in range(32)]
# input the circuit to ablate, not what to keep
# perc_score, list_outputs = ablate_circ_autoScore(model, [], sequences_as_str, next_members)
# perc_score

In [None]:
# Ablate only the heads in intersect_all on the current addition prompts and display the
# exact-match score.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so no score was recorded.
perc_score, list_outputs = ablate_circ_autoScore(model, intersect_all, sequences_as_str, next_members)
perc_score

2 2
18 18
14 14
10 10
5 5
6 6
4 4
4 4
6 6
14 14
10 10
11 11
5 5
6 6
6 6
7 7
7 7
10 10
8 8<0x0A><0x0A>8+
18 18
17 17
17 17
11 11
8 9<0x0A><0x0A>Yourturn
11 11
6 6
10 10
6 6
10 10
5 5<0x0A><0x0A>5+
9 9
13 13
12 12
10 10
4 4
15 15
10 10
12 12
8 8<0x0A><0x0A>8+
11 11
10 10
9 9
13 13
10 10
11 11
9 9
11 11
9 9
13 13
6 6


KeyboardInterrupt: 

In [None]:
# Ablate the heads of the nums_1to9 circuit on the current addition prompts and display
# the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
perc_score

2 1
18 18
14 14
10 12
5 5
6 6
4 4
4 4
6 3
14 14
10 12
11 12
5 3
6 6
6 1
7 1
7 1
10 10
8 8<0x0A><0x0A><0x0A>8
18 18
17 18
17 16
11 12
8 14<0x0A><0x0A><0x0A>
11 12
6 6
10 10
6 6
10 10
5 5<0x0A><0x0A><0x0A>5
9 1
13 10
12 12
10 10
4 4
15 15
10 33
12 12
8 8<0x0A><0x0A><0x0A>8
11 33
10 12
9 1
13 18
10 10
11 11
9 0
11 11
9 1
13 10
6 1
176 88<0x0A>
158 711
130 71<0x0A>
97 11
90 90
106 30<0x0A>
34 14
86 20
59 36
99 49
119 491
117 333
91 76
105 55<0x0A>
66 25
100 711
142 500
44 42
193 94<0x0A>
121 97<0x0A><0x0A> 
61 11
62 62
146 85<0x0A>
125 755
95 53
131 788
133 82<0x0A>
188 99<0x0A>
145 75<0x0A>
96 40
108 60<0x0A>
125 588
151 56<0x0A>
140 140
105 335
139 70<0x0A>
124 1000<0x0A>
77 11
91 71
152 600
83 42
98 71
54 54
129 333
135 733
56 26
138 449
77 61
57 33
162 100<0x0A><0x0A>


0.26

In [None]:
# Ablate the heads of the nw_circ circuit on the current addition prompts and display
# the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

2 2
18 18
14 14
10 1+
5 5
6 6
4 4
4 4
6 6
14 14
10 1+
11 10
5 5
6 6
6 1
7 1
7 7
10 10
8 8<0x0A><0x0A>Hinweis:
18 18
17 18
17 10
11 10
8 8<0x0A><0x0A><0x0A>So
11 10
6 6
10 10
6 6
10 10
5 5<0x0A><0x0A><0x0A>Answer
9 1
13 14
12 18
10 10
4 4
15 15
10 10
12 12
8 8<0x0A><0x0A>Hinweis:
11 10
10 1+
9 1
13 10
10 10
11 11
9 9
11 11
9 1
13 10
6 1
176 80<0x0A>
158 50<0x0A>
130 71+
97 12
90 <0x0A><0x0A>
106 156
34 5<0x0A>
86 12
59 10
99 9<0x0A>
119 <0x0A><0x0A> 
117 110
91 12
105 <0x0A><0x0A> 
66 10
100 79<0x0A>
142 50+
44 10
193 94+
121 10<0x0A><0x0A> 
61 10
62 52
146 85+
125 120
95 10
131 115
133 85<0x0A>
188 170
145 70+
96 40
108 8<0x0A><0x0A>
125 125
151 10<0x0A>
140 84+
105 10<0x0A>
139 10<0x0A>
124 10<0x0A><0x0A> 
77 7<0x0A>
91 71
152 60+
83 14
98 19
54 17
129 <0x0A><0x0A><0x0A>
135 10<0x0A>
56 6<0x0A>
138 164
77 63
57 7<0x0A>
162 10<0x0A><0x0A> 


0.28

In [None]:
# Ablate the heads of the months_circ circuit on the current addition prompts and display
# the exact-match score.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so no score was recorded.
perc_score, list_outputs = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

2 1
18 18
14 49
10 10
5 1
6 6
4 0
4 0
6 3
14 49
10 10
11 10
5 1
6 1
6 2
7 1
7 1
10 10
8 8<0x0A><0x0A><0x0A><0x0A>
18 18
17 10
17 10
11 10
8 10<0x0A><0x0A><0x0A>
11 10
6 1
10 10
6 6
10 10
5 5<0x0A><0x0A><0x0A><0x0A>
9 1
13 10
12 16
10 10
4 0
15 10
10 3+
12 15
8 8<0x0A><0x0A><0x0A><0x0A>
11 3+
10 10
9 1
13 10
10 16
11 10
9 0
11 10
9 1
13 50
6 2
176 70<0x0A>
158 71+
130 71+


KeyboardInterrupt: 

In [None]:
# Random baseline on the current addition prompts: 10 random ablations per prompt.
# The heads in intersect_all are excluded from the pool the random heads are drawn from;
# num_heads_rand and num_not_overlap are forwarded to choose_heads_to_remove.
num_rand_runs = 10
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score, list_outputs = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

# single digit addition

In [None]:
# Draw 50 unique single-digit addition prompts. This overwrites sequences_as_str and
# next_members, which every scoring cell in this section reads.
num_prompts = 50
sequences_as_str, next_members = gen_single_addition_prompts(num_prompts)

Sequences:
['2 + 7 = ', '0 + 5 = ', '2 + 4 = ', '6 + 8 = ', '0 + 2 = ', '3 + 7 = ', '6 + 1 = ', '7 + 9 = ', '8 + 6 = ', '5 + 6 = ', '9 + 0 = ', '2 + 5 = ', '4 + 0 = ', '0 + 9 = ', '6 + 4 = ', '0 + 1 = ', '0 + 3 = ', '1 + 8 = ', '1 + 5 = ', '1 + 6 = ', '7 + 5 = ', '3 + 6 = ', '3 + 0 = ', '9 + 5 = ', '5 + 3 = ', '7 + 0 = ', '9 + 8 = ', '5 + 4 = ', '1 + 9 = ', '1 + 3 = ', '0 + 6 = ', '4 + 7 = ', '1 + 1 = ', '3 + 3 = ', '1 + 4 = ', '2 + 0 = ', '0 + 0 = ', '4 + 1 = ', '4 + 2 = ', '7 + 7 = ', '7 + 8 = ', '4 + 8 = ', '5 + 9 = ', '2 + 2 = ', '8 + 7 = ', '8 + 5 = ', '0 + 4 = ', '1 + 7 = ', '8 + 0 = ', '9 + 4 = ']

Next Members:
['9', '5', '6', '14', '2', '10', '7', '16', '14', '11', '9', '7', '4', '9', '10', '1', '3', '9', '6', '7', '12', '9', '3', '14', '8', '7', '17', '9', '10', '4', '6', '11', '2', '6', '5', '2', '0', '5', '6', '14', '15', '12', '14', '4', '15', '13', '4', '8', '8', '13']


In [None]:
# all_heads = [(layer, head) for layer in range(32) for head in range(32)]
# input the circuit to ablate, not what to keep
# perc_score, list_outputs = ablate_circ_autoScore(model, [], sequences_as_str, next_members)
# perc_score

In [None]:
# Ablate only the heads in intersect_all on the single-digit addition prompts and display
# the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, intersect_all, sequences_as_str, next_members)
perc_score

9 9
5 5
6 6
14 14
2 2
10 10
7 7
16 16
14 14
11 11
9 9<0x0A><0x0A>9+
7 7
4 4
9 9
10 10
1 1
3 3
9 9
6 6
7 7
12 12
9 9
3 3
14 14
8 8
7 7<0x0A><0x0A>7+
17 17
9 9
10 10
4 2
6 6
11 11
2 2
6 6
5 5
2 2
0 0
5 5
6 <0x0A>
14 14
15 15
12 12
14 14
4 4
15 15
13 13
4 4
8 8
8 8<0x0A><0x0A>8+
13 13


0.9

In [None]:
# Ablate the heads of the nums_1to9 circuit on the single-digit addition prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
perc_score

9 1
5 5
6 1
14 6+
2 0
10 33
7 1
16 0<0x0A>
14 14
11 11
9 9<0x0A><0x0A><0x0A>9
7 1
4 4
9 0
10 10
1 1
3 0
9 1
6 1
7 1
12 12
9 3
3 3
14 14
8 1
7 7<0x0A><0x0A><0x0A>7
17 18
9 1
10 12
4 1
6 6
11 22
2 1
6 3
5 1
2 2
0 0
5 5
6 8
14 14
15 15
12 12
14 10
4 1
15 15
13 13
4 4
8 1
8 8<0x0A><0x0A><0x0A>8
13 10


0.38

In [None]:
# Ablate the heads of the nw_circ circuit on the single-digit addition prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

9 1
5 5
6 1
14 12
2 2
10 10
7 7
16 19
14 14
11 11
9 9<0x0A><0x0A><0x0A>Answer
7 1
4 4
9 9
10 10
1 1
3 3
9 9
6 1
7 7
12 12
9 1
3 3
14 14
8 8
7 7<0x0A><0x0A><0x0A>Answer
17 18
9 1
10 1+
4 3
6 6
11 12
2 2
6 6
5 3
2 2
0 0
5 5
6 8
14 14
15 14
12 18
14 10
4 1
15 15
13 13
4 4
8 9
8 8<0x0A><0x0A>Hinweis:
13 14


0.54

In [None]:
# Ablate the heads of the months_circ circuit on the single-digit addition prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

9 1
5 0
6 2
14 6+
2 0
10 3+
7 1
16 10
14 12
11 10
9 9<0x0A><0x0A><0x0A><0x0A>
7 1
4 0
9 0
10 16
1 0
3 0
9 1
6 1
7 1
12 15
9 1
3 3
14 15
8 1
7 7<0x0A><0x0A><0x0A><0x0A>
17 10
9 1
10 10
4 1
6 0
11 10
2 1
6 3
5 1
2 0
0 0
5 1
6 1
14 49
15 15
12 16
14 5+
4 2
15 10
13 10
4 0
8 1
8 8<0x0A><0x0A><0x0A><0x0A>
13 10


0.08

In [None]:
# Random baseline on the single-digit addition prompts: 10 random ablations per prompt.
# The heads in intersect_all are excluded from the pool the random heads are drawn from;
# num_heads_rand and num_not_overlap are forwarded to choose_heads_to_remove.
# The printed values are the per-prompt fractions of runs answered correctly.
num_rand_runs = 10
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score, list_outputs = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

1.0
0.8
0.9
0.8
1.0
0.9
1.0
1.0
0.7
0.9
0.0
1.0
0.7
0.7
1.0
0.8
0.9
1.0
1.0
1.0
1.0
1.0
0.9
1.0
1.0
0.0
0.9
0.9
0.9
0.9
0.8
1.0
1.0
1.0
0.9
0.8
0.7
1.0
0.9
1.0
0.8
0.9
1.0
0.9
1.0
0.8
0.5
1.0
0.0
0.6


0.8439999999999998

# double digit addition

In [None]:
# Draw 50 unique two-digit addition prompts. This overwrites sequences_as_str and
# next_members, which every scoring cell in this section reads.
num_prompts = 50
sequences_as_str, next_members = gen_double_addition_prompts(num_prompts)

Sequences:
['62 + 33 = ', '71 + 40 = ', '72 + 91 = ', '89 + 88 = ', '50 + 46 = ', '19 + 25 = ', '92 + 97 = ', '99 + 12 = ', '64 + 72 = ', '80 + 11 = ', '22 + 37 = ', '11 + 42 = ', '82 + 17 = ', '56 + 95 = ', '36 + 71 = ', '48 + 56 = ', '17 + 66 = ', '83 + 15 = ', '98 + 87 = ', '22 + 39 = ', '65 + 10 = ', '94 + 76 = ', '60 + 33 = ', '65 + 27 = ', '73 + 57 = ', '74 + 87 = ', '58 + 55 = ', '61 + 16 = ', '44 + 11 = ', '32 + 10 = ', '13 + 83 = ', '68 + 84 = ', '74 + 73 = ', '84 + 67 = ', '11 + 37 = ', '29 + 94 = ', '11 + 55 = ', '59 + 24 = ', '49 + 20 = ', '68 + 14 = ', '56 + 41 = ', '31 + 85 = ', '70 + 84 = ', '99 + 78 = ', '33 + 14 = ', '62 + 47 = ', '29 + 29 = ', '70 + 65 = ', '75 + 49 = ', '20 + 28 = ']

Next Members:
['95', '111', '163', '177', '96', '44', '189', '111', '136', '91', '59', '53', '99', '151', '107', '104', '83', '98', '185', '61', '75', '170', '93', '92', '130', '161', '113', '77', '55', '42', '96', '152', '147', '151', '48', '123', '66', '83', '69', '82', '97', '116', '

In [None]:
# all_heads = [(layer, head) for layer in range(32) for head in range(32)]
# input the circuit to ablate, not what to keep
# perc_score, list_outputs = ablate_circ_autoScore(model, [], sequences_as_str, next_members)
# perc_score

In [None]:
# Ablate only the heads in intersect_all on the two-digit addition prompts and display
# the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, intersect_all, sequences_as_str, next_members)
perc_score

95 95
111 111
163 <0x0A><0x0A>Please
177 97<0x0A>
96 <0x0A><0x0A>
44 <0x0A><0x0A>
189 <0x0A><0x0A>Pleaseprovidethe
111 <0x0A><0x0A>Pleaseenteryour
136 <0x0A><0x0A>What
91 91
59 <0x0A><0x0A>
53 53
99 99
151 <0x0A><0x0A><0x0A>
107 71+
104 <0x0A><0x0A>Please
83 83
98 98
185 <0x0A><0x0A>Please
61 61
75 75
170 <0x0A><0x0A>Pleaseprovidethe
93 <0x0A><0x0A>
92 92
130 120
161 <0x0A><0x0A>Please
113 <0x0A><0x0A>What
77 77
55 55
42 42
96 98
152 <0x0A><0x0A><0x0A>
147 147
151 <0x0A><0x0A><0x0A>
48 48
123 123
66 66
83 83
69 <0x0A><0x0A>Pleaseenteryour
82 82
97 97
116 116
154 <0x0A><0x0A>Please
177 <0x0A><0x0A>Pleaseenteryour
47 <0x0A><0x0A>
109 <0x0A><0x0A><0x0A>
58 58
135 <0x0A><0x0A>Please
124 <0x0A><0x0A>Pleaseenteryour
48 48


0.46

In [None]:
# Ablate the heads of the nums_1to9 circuit on the two-digit addition prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
perc_score

95 65
111 711
163 72<0x0A>
177 90<0x0A>
96 50
44 12
189 97<0x0A><0x0A><0x0A>
111 100<0x0A><0x0A>
136 70<0x0A>
91 80
59 22
53 11
99 10
151 56<0x0A>
107 336
104 56<0x0A>
83 12
98 83
185 100
61 22
75 65
170 94<0x0A><0x0A><0x0A>
93 66
92 66
130 777
161 74<0x0A>
113 558
77 61
55 55
42 33
96 11
152 700
147 74<0x0A>
151 157
48 11
123 30<0x0A>
66 11
83 60
69 49+ 2
82 70
97 56
116 331
154 704
177 100<0x0A><0x0A>
47 33
109 62+
58 59
135 705
124 76<0x0A><0x0A><0x0A>
48 20


0.02

In [None]:
# Ablate the heads of the nw_circ circuit on the two-digit addition prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

95 65
111 71<0x0A>
163 754
177 170
96 50
44 14
189 <0x0A><0x0A> 92
111 10<0x0A><0x0A><0x0A>
136 10<0x0A>
91 81
59 10
53 5<0x0A>
99 85
151 <0x0A><0x0A> 
107 <0x0A><0x0A><0x0A>
104 <0x0A><0x0A> 
83 13
98 88
185 <0x0A><0x0A> 
61 10
75 6<0x0A>
170 <0x0A><0x0A> 94
93 63
92 <0x0A><0x0A>
130 35<0x0A>
161 560
113 135
77 63
55 5<0x0A>
42 32
96 13
152 140
147 14+
151 <0x0A><0x0A><0x0A>
48 1+
123 140
66 1+
83 14
69 9<0x0A><0x0A><0x0A>What
82 10
97 10
116 31+
154 70+
177 <0x0A><0x0A> 9+
47 10
109 62+
58 18
135 70+
124 <0x0A><0x0A> 10
48 10


0.0

In [None]:
# Ablate the heads of the months_circ circuit on the two-digit addition prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

95 6+
111 71+
163 91<0x0A>
177 8+ 
96 50
44 19
189 10<0x0A><0x0A><0x0A>
111 10<0x0A><0x0A><0x0A>
136 12+
91 80
59 22
53 1+
99 17
151 56<0x0A>
107 36<0x0A>
104 48+
83 17
98 15
185 10<0x0A>
61 22
75 10
170 124<0x0A><0x0A>
93 60
92 12
130 35<0x0A>
161 74+
113 10<0x0A>
77 61
55 44
42 32
96 13
152 16+
147 73+
151 8<0x0A><0x0A>
48 10
123 29+
66 10
83 59
69 49+ 2
82 16
97 56
116 31+
154 70+
177 10<0x0A><0x0A><0x0A>
47 10
109 62+
58 59
135 70+
124 15+ 4
48 20


0.0

In [None]:
# Random baseline on the two-digit addition prompts: 10 random ablations per prompt.
# The heads in intersect_all are excluded from the pool the random heads are drawn from;
# num_heads_rand and num_not_overlap are forwarded to choose_heads_to_remove.
# The printed values are the per-prompt fractions of runs answered correctly.
num_rand_runs = 10
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score, list_outputs = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

1.0
0.5
0.7
0.8
1.0
1.0
0.0
0.0
0.9
0.8
0.4
0.8
0.7
0.4
0.7
0.9
0.8
0.9
0.7
0.9
1.0
0.0
0.8
0.5
0.3
0.9
0.7
0.8
0.9
0.9
0.9
0.5
0.8
0.4
0.8
1.0
0.5
1.0
0.0
0.6
0.8
0.6
1.0
0.0
0.7
0.8
1.0
1.0
0.0
0.5


0.672

# single digit subtraction

In [None]:
# Draw 50 unique single-digit subtraction prompts (only 55 distinct ones exist, so this
# is close to the full set). Overwrites sequences_as_str and next_members, which every
# scoring cell in this section reads.
num_prompts = 50
sequences_as_str, next_members = gen_s_subtraction_prompts(num_prompts)

Sequences:
['2 - 1 = ', '5 - 1 = ', '9 - 0 = ', '6 - 2 = ', '6 - 3 = ', '8 - 4 = ', '9 - 4 = ', '8 - 2 = ', '6 - 4 = ', '7 - 5 = ', '3 - 2 = ', '3 - 0 = ', '2 - 2 = ', '8 - 6 = ', '9 - 7 = ', '6 - 5 = ', '8 - 5 = ', '5 - 3 = ', '6 - 6 = ', '9 - 2 = ', '9 - 9 = ', '9 - 1 = ', '5 - 2 = ', '3 - 3 = ', '9 - 5 = ', '2 - 0 = ', '8 - 1 = ', '7 - 2 = ', '7 - 1 = ', '8 - 3 = ', '8 - 0 = ', '8 - 7 = ', '9 - 3 = ', '1 - 0 = ', '7 - 0 = ', '4 - 0 = ', '4 - 3 = ', '6 - 1 = ', '9 - 8 = ', '8 - 8 = ', '7 - 6 = ', '0 - 0 = ', '1 - 1 = ', '9 - 6 = ', '3 - 1 = ', '7 - 4 = ', '5 - 0 = ', '4 - 1 = ', '7 - 7 = ', '5 - 4 = ']

Next Members:
['1', '4', '9', '4', '3', '4', '5', '6', '2', '2', '1', '3', '0', '2', '2', '1', '3', '2', '0', '7', '0', '8', '3', '0', '4', '2', '7', '5', '6', '5', '8', '1', '6', '1', '7', '4', '1', '5', '1', '0', '1', '0', '0', '3', '2', '3', '5', '3', '0', '1']


In [None]:
# Sanity baseline: the second argument is the circuit to ablate (not the heads to keep),
# so passing [] ablates no heads at all.
# all_heads = [(layer, head) for layer in range(32) for head in range(32)]
perc_score, list_outputs = ablate_circ_autoScore(model, [], sequences_as_str, next_members)
perc_score

1 1
4 4
9 9
4 4
3 3
4 4
5 5
6 6
2 2
2 2
1 1
3 3
0 0
2 2
2 2
1 1
3 3
2 2
0 0
7 7
0 0
8 8
3 3
0 0
4 4
2 2
7 7
5 5
6 6
5 5
8 8
1 1
6 6
1 1
7 7
4 4
1 1
5 5
1 1
0 0
1 1
0 0
0 0
3 3
2 2
3 3
5 5
3 3
0 0
1 1


1.0

In [None]:
# Ablate only the heads in intersect_all on the single-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, intersect_all, sequences_as_str, next_members)
perc_score

1 2
4 4
9 9
4 4
3 6
4 <0x0A>
5 9
6 6
2 2
2 </s>
1 3
3 3
0 2
2 <0x0A>
2 </s>
1 6
3 <0x0A>
2 2
0 0
7 9
0 0
8 8
3 3
0 0
4 4
2 2
7 7
5 5
6 6
5 5
8 8
1 </s>
6 6
1 1
7 7
4 4
1 <0x0A>
5 5
1 9
0 0
1 </s>
0 0
0 1
3 9
2 3
3 3
5 5
3 4
0 0
1 <0x0A>


0.58

In [None]:
# Ablate the heads of the nums_1to9 circuit on the single-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
perc_score

1 2
4 5
9 9
4 3
3 2
4 2
5 1
6 4
2 1
2 3
1 1
3 3
0 2
2 2
2 9
1 1
3 1
2 1
0 1
7 4
0 9
8 9
3 2
0 1
4 4
2 2
7 8
5 3
6 7
5 2
8 8
1 8
6 3
1 1
7 7
4 4
1 1
5 6
1 9
0 8
1 4
0 0
0 1
3 1
2 3
3 2
5 5
3 4
0 4
1 2


0.28

In [None]:
# Ablate the heads of the nw_circ circuit on the single-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

1 2
4 4
9 9
4 4
3 3
4 1
5 1
6 5
2 2
2 1
1 3
3 3
0 2
2 1
2 4
1 1
3 1
2 2
0 2
7 6
0 0
8 9
3 3
0 3
4 4
2 2
7 5
5 5
6 7
5 5
8 8
1 1
6 6
1 1
7 7
4 4
1 1
5 6
1 6
0 0
1 1
0 0
0 1
3 4
2 3
3 1
5 5
3 4
0 0
1 1


0.54

In [None]:
# Ablate the heads of the months_circ circuit on the single-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

1 1
4 1
9 9
4 1
3 1
4 1
5 1
6 1
2 6
2 1
1 1
3 0
0 2
2 1
2 1
1 1
3 1
2 1
0 6
7 1
0 9
8 1
3 1
0 1
4 4
2 0
7 1
5 1
6 1
5 1
8 8
1 1
6 1
1 0
7 7
4 0
1 1
5 1
1 8
0 8
1 1
0 0
0 0
3 1
2 1
3 1
5 0
3 1
0 7
1 1


0.26

In [None]:
# Random baseline on the single-digit subtraction prompts: 10 random ablations per prompt.
# The heads in intersect_all are excluded from the pool the random heads are drawn from;
# num_heads_rand and num_not_overlap are forwarded to choose_heads_to_remove.
# The printed values are the per-prompt fractions of runs answered correctly.
num_rand_runs = 10
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score, list_outputs = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

1.0
0.9
1.0
1.0
1.0
0.9
0.9
1.0
0.9
0.9
0.8
0.6
0.8
0.9
0.9
1.0
0.9
1.0
0.8
0.9
0.7
0.9
0.7
0.7
0.9
0.7
1.0
1.0
0.9
0.9
1.0
0.8
1.0
0.4
0.9
0.3
0.9
1.0
1.0
0.8
0.8
1.0
0.9
1.0
1.0
0.9
0.9
0.9
0.8
1.0


0.8759999999999997

# double digit subtraction

In [None]:
# Draw 50 unique two-digit subtraction prompts. This overwrites sequences_as_str and
# next_members, which every scoring cell in this section reads.
num_prompts = 50
sequences_as_str, next_members = gen_d_subtraction_prompts(num_prompts)

Sequences:
['81 - 29 = ', '93 - 39 = ', '45 - 18 = ', '55 - 22 = ', '89 - 88 = ', '52 - 49 = ', '64 - 18 = ', '49 - 35 = ', '61 - 46 = ', '95 - 57 = ', '75 - 29 = ', '98 - 86 = ', '69 - 11 = ', '91 - 40 = ', '98 - 34 = ', '28 - 12 = ', '83 - 76 = ', '97 - 90 = ', '81 - 35 = ', '52 - 20 = ', '97 - 44 = ', '94 - 20 = ', '60 - 51 = ', '44 - 32 = ', '99 - 92 = ', '76 - 52 = ', '91 - 79 = ', '75 - 25 = ', '70 - 25 = ', '80 - 75 = ', '52 - 18 = ', '38 - 22 = ', '93 - 82 = ', '39 - 36 = ', '55 - 21 = ', '71 - 39 = ', '93 - 20 = ', '60 - 28 = ', '36 - 23 = ', '83 - 18 = ', '57 - 25 = ', '78 - 27 = ', '50 - 40 = ', '71 - 26 = ', '82 - 52 = ', '94 - 88 = ', '95 - 54 = ', '44 - 37 = ', '73 - 63 = ', '37 - 29 = ']

Next Members:
['52', '54', '27', '33', '1', '3', '46', '14', '15', '38', '46', '12', '58', '51', '64', '16', '7', '7', '46', '32', '53', '74', '9', '12', '7', '24', '12', '50', '45', '5', '34', '16', '11', '3', '34', '32', '73', '32', '13', '65', '32', '51', '10', '45', '30', '6', '41',

In [None]:
# Sanity baseline: the second argument is the circuit to ablate (not the heads to keep),
# so passing [] ablates no heads at all.
# all_heads = [(layer, head) for layer in range(32) for head in range(32)]
perc_score, list_outputs = ablate_circ_autoScore(model, [], sequences_as_str, next_members)
perc_score

52 52
54 54
27 27
33 33
1 1
3 3
46 46
14 14
15 15
38 38
46 46
12 12
58 58
51 51
64 64
16 16
7 7
7 7
46 46
32 32
53 53
74 74
9 9
12 12
7 7
24 24
12 12
50 50
45 45
5 5
34 34
16 16
11 11
3 3
34 34
32 32
73 73
32 32
13 13
65 65
32 32
51 51
10 10
45 45
30 30
6 6
41 41
7 7
10 10
8 8


1.0

In [None]:
# Ablate only the heads in intersect_all on the two-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, intersect_all, sequences_as_str, next_members)
perc_score

52 52
54 54
27 27
33 33
1 0
3 7
46 46
14 14
15 15
38 38
46 46
12 12
58 59
51 51
64 64
16 14
7 8
7 8
46 46
32 32
53 53
74 74
9 8
12 12
7 1
24 24
12 12
50 50
45 45
5 <0x0A>
34 34
16 18
11 11
3 6
34 34
32 32
73 73
32 32
13 13
65 65
32 32
51 51
10 10
45 45
30 30
6 <0x0A>
41 41
7 <0x0A>
10 10
8 9


0.72

In [None]:
# Ablate the heads of the nums_1to9 circuit on the two-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
perc_score

52 81
54 93
27 45
33 27
1 8
3 1
46 64
14 15
15 13
38 95
46 75
12 10
58 6<0x0A>
51 91
64 32
16 28
7 8
7 1
46 81
32 26
53 97
74 94
9 1
12 44
7 1
24 14
12 91
50 30
45 70
5 1
34 52
16 16
11 10
3 3
34 25
32 71
73 93
32 24
13 10
65 83
32 25
51 78
10 12
45 71
30 79
6 8
41 95
7 1
10 73
8 3


0.04

In [None]:
# Ablate the heads of the nw_circ circuit on the two-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

52 73
54 24
27 20
33 3<0x0A>
1 7
3 3
46 24
14 14
15 56
38 4<0x0A>
46 7<0x0A>
12 12
58 6<0x0A>
51 91
64 24
16 6<0x0A>
7 5
7 1
46 26
32 52
53 23
74 94
9 5
12 2<0x0A>
7 9
24 4<0x0A>
12 53
50 25
45 65
5 7
34 42
16 5<0x0A>
11 13
3 5
34 4<0x0A>
32 32
73 93
32 52
13 3<0x0A>
65 53
32 4<0x0A>
51 5<0x0A>
10 50
45 56
30 82
6 5
41 95
7 4
10 21
8 5


0.08

In [None]:
# Ablate the heads of the months_circ circuit on the two-digit subtraction prompts and
# display the exact-match score.
perc_score, list_outputs = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

52 81
54 39
27 45
33 10
1 8
3 2
46 6<0x0A>
14 49
15 46
38 35
46 29
12 86
58 11
51 40
64 34
16 28
7 3
7 7
46 81
32 10
53 44
74 10
9 6
12 44
7 9
24 36
12 79
50 5<0x0A>
45 70
5 8
34 18
16 18
11 32
3 3
34 15
32 39
73 10
32 60
13 12
65 18
32 25
51 7<0x0A>
10 10
45 71
30 82
6 4
41 54
7 4
10 33
8 3


0.06

In [None]:
# Random baseline on the two-digit subtraction prompts: 10 random ablations per prompt.
# The heads in intersect_all are excluded from the pool the random heads are drawn from;
# num_heads_rand and num_not_overlap are forwarded to choose_heads_to_remove.
# The printed values are the per-prompt fractions of runs answered correctly.
num_rand_runs = 10
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score, list_outputs = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

0.9
0.7
0.6
0.9
0.9
0.8
0.9
1.0
1.0
0.6
1.0
0.8
0.7
0.9
0.6
0.7
0.6
0.8
1.0
0.7
0.9
0.8
0.6
0.9
0.9
0.7
0.9
1.0
0.9
0.7
1.0
0.8
0.9
0.9
0.7
0.6
1.0
0.4
1.0
0.6
0.9
0.9
0.9
0.8
0.8
0.7
0.7
0.6
0.8
0.6


0.7999999999999997