# Setup

In [None]:
# Notebook-level switch; presumably gates writing result files later in the
# notebook — it is not read anywhere in the cells visible here (TODO confirm).
save_files = True

In [None]:
%%capture
%pip install git+https://github.com/neelnanda-io/TransformerLens.git

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
from fancy_einsum import einsum
import tqdm.notebook as tqdm
import random
from pathlib import Path
# import plotly.express as px
from torch.utils.data import DataLoader

from jaxtyping import Float, Int
from typing import List, Union, Optional
from functools import partial
import copy

import itertools
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
import dataclasses
import datasets
from IPython.display import HTML

import pickle
from google.colab import files

import matplotlib.pyplot as plt
import statistics

In [None]:
import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer #, HookedTransformerConfig, FactoredMatrix, ActivationCache

We turn automatic differentiation off, to save GPU memory, as this notebook focuses on model inference not model training.

In [None]:
# Disable autograd globally: this notebook only runs forward passes.
torch.set_grad_enabled(False)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import pdb

## Import functions from repo

In [None]:
!git clone https://github.com/apartresearch/seqcont_circuits.git
%cd /content/seqcont_circuits/src/iter_node_pruning

In [None]:
## comment this out when debugging functions in colab to use funcs defined in colab

# don't import this
# # from dataset import Dataset

from metrics import *
from head_ablation_fns import *
from mlp_ablation_fns import *
from node_ablation_fns import *
from loop_node_ablation_fns import *

## fns

In [None]:
import random


In [None]:
class Dataset:
    """Tokenized prompts plus per-prompt token positions used by the ablation code.

    Args:
        prompts: list of dicts; each must provide "text", "corr" and "incorr" strings.
        pos_dict: maps a position label (e.g. "S0") to a token index. The same
            index is recorded for every prompt.
        tokenizer: object supporting ``tokenizer(text).input_ids``,
            ``tokenizer(texts, padding=True).input_ids``, ``encode`` and ``tokenize``.

    Attributes set here:
        prompts, tokenizer: the constructor arguments, stored as given.
        N: number of prompts.
        max_len: longest ``input_ids`` length over the individual prompts.
        groups: list with one index array per template; there is only one
            template, so it holds a single array covering every prompt.
        toks: int32 tensor of the padded ``input_ids`` of all prompt texts.
        corr_tokenIDs / incorr_tokenIDs: first id of ``encode`` applied to each
            prompt's "corr" / "incorr" string.
        word_idx: dict label -> int64 tensor with one entry per prompt, plus an
            "end" entry holding ``len(tokenize(text)) - 1`` for each prompt.

    Raises:
        ValueError: if ``prompts`` is empty (``max`` of an empty sequence).
    """

    def __init__(self, prompts, pos_dict, tokenizer):  # , S1_is_first=False
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = len(prompts)

        texts = [prompt["text"] for prompt in self.prompts]
        self.max_len = max(len(self.tokenizer(text).input_ids) for text in texts)

        # Only one template exists, so every prompt gets template id 0.
        template_ids = np.zeros(self.N, dtype=int)
        self.groups = [
            np.where(template_ids == template_id)[0]
            for template_id in np.unique(template_ids)
        ]

        # Build the integer tensor directly instead of going through a float
        # tensor and casting back.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        self.corr_tokenIDs = [
            self.tokenizer.encode(prompt["corr"])[0] for prompt in self.prompts
        ]
        self.incorr_tokenIDs = [
            self.tokenizer.encode(prompt["incorr"])[0] for prompt in self.prompts
        ]

        # Each label's index comes straight from pos_dict and is identical for
        # every prompt, so no per-prompt tokenization is needed here.
        self.word_idx = {
            label: torch.tensor([position] * self.N)
            for label, position in pos_dict.items()
        }

        # "end" is the index of the last token as counted by tokenize().
        # NOTE(review): max_len/toks use tokenizer(...).input_ids while "end" uses
        # tokenizer.tokenize(...); these may count special tokens differently —
        # confirm "end" lines up with the columns of `toks`.
        self.word_idx["end"] = torch.tensor(
            [len(self.tokenizer.tokenize(text)) - 1 for text in texts]
        )

    def __len__(self):
        """Return the number of prompts."""
        return self.N

In [None]:
def generate_prompts_list_longer(text, tokens, tokenizer=None):
    """Build a one-element prompt list describing `text` token by token.

    Args:
        text: the prompt string.
        tokens: unused; kept so existing call sites keep working.
        tokenizer: object with a ``tokenize(text)`` method. When omitted, the
            notebook-level ``model.tokenizer`` is used, which was previously
            the only option.

    Returns:
        A list holding a single dict with the fixed entries 'corr' ('1') and
        'incorr' ('2'), the original 'text', and one 'S<i>' entry per token
        string returned by ``tokenizer.tokenize(text)``.
    """
    if tokenizer is None:
        # Fall back to the global model defined in the "Load Model" section.
        tokenizer = model.tokenizer

    prompt_dict = {
        'corr': str(1),
        'incorr': str(2),
        'text': text,
    }
    for i, tok in enumerate(tokenizer.tokenize(text)):
        prompt_dict['S' + str(i)] = tok

    return [prompt_dict]

# Load Model

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Hugging Face Hub id of the checkpoint used throughout the notebook.
LLAMA_2_7B_CHAT_PATH = "meta-llama/Llama-2-7b-chat-hf"

# Load the tokenizer and the Hugging Face model from that checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(LLAMA_2_7B_CHAT_PATH)
# Alternative tokenizer configuration, currently disabled:
# tokenizer = LlamaTokenizer.from_pretrained(LLAMA_2_7B_CHAT_PATH, use_fast= False, add_prefix_space= False)
hf_model = LlamaForCausalLM.from_pretrained(LLAMA_2_7B_CHAT_PATH, low_cpu_mem_usage=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
import transformer_lens.utils as utils
from transformer_lens.hook_points import HookPoint
from transformer_lens import HookedTransformer

In [None]:
# Wrap the already-loaded Hugging Face weights and tokenizer in a
# HookedTransformer. The model is built on the CPU first and all three
# weight-processing options are switched off.
model = HookedTransformer.from_pretrained(
    LLAMA_2_7B_CHAT_PATH,
    hf_model = hf_model,
    tokenizer = tokenizer,
    device = "cpu",
    fold_ln = False,
    center_writing_weights = False,
    center_unembed = False,
)

# Drop the notebook's reference to the Hugging Face model.
del hf_model

# Move the hooked model to the GPU when one is available.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

Loaded pretrained model meta-llama/Llama-2-7b-chat-hf into HookedTransformer
Moving model to device:  cuda


# new ablation functions

In [None]:
def get_heads_actv_mean(
    means_dataset: Dataset,
    model: HookedTransformer
) -> Float[Tensor, "layer batch seq head_idx d_head"]:
    '''
    Return an all-zero placeholder shaped like the per-head mean activations.

    The per-template mean computation is disabled in this notebook, so the
    returned values are always zero. The earlier version still ran a full
    cached forward pass over `means_dataset.toks` and then deleted the cache
    without reading it; that pass is skipped here because it could not affect
    the result.

    Args:
        means_dataset: supplies the batch size (`len(means_dataset)`) and the
            sequence length (`means_dataset.max_len`).
        model: supplies `cfg.n_layers`, `cfg.n_heads`, `cfg.d_head` and
            `cfg.device`.

    Returns:
        Zero tensor of shape (n_layers, batch, max_len, n_heads, d_head) on
        `model.cfg.device`.
    '''
    n_layers, n_heads, d_head = model.cfg.n_layers, model.cfg.n_heads, model.cfg.d_head
    batch, seq_len = len(means_dataset), means_dataset.max_len
    return t.zeros(size=(n_layers, batch, seq_len, n_heads, d_head), device=model.cfg.device)

In [None]:
# def mask_circ_heads(
#     means_dataset: Dataset,
#     model: HookedTransformer,
#     circuit: Dict[str, List[Tuple[int, int]]],
#     seq_pos_to_keep: Dict[str, str],
# ) -> Dict[int, Bool[Tensor, "batch seq head"]]:
#     '''
#     Output: for each layer, a mask of circuit components that should not be ablated
#     '''
#     heads_and_posns_to_keep = {}
#     batch, seq, n_heads = len(means_dataset), means_dataset.max_len, model.cfg.n_heads

#     for layer in range(model.cfg.n_layers):

#         mask = t.zeros(size=(batch, seq, n_heads))

#         for (head_type, head_list) in circuit.items():
#             seq_pos = seq_pos_to_keep[head_type]
#             # if seq_pos == 'S7':
#             #     pdb.set_trace()
#             indices = means_dataset.word_idx[seq_pos] # modify this for key vs query pos. curr, this is query
#             for (layer_idx, head_idx) in head_list:
#                 if layer_idx == layer:
#                     # if indices.item() == 7:
#                     #     pdb.set_trace()
#                     mask[:, indices, head_idx] = 1
#                     # mask[:, :, head_idx] = 1  # keep L.H at all pos

#         heads_and_posns_to_keep[layer] = mask.bool()
#     # pdb.set_trace()
#     return heads_and_posns_to_keep

In [None]:
def mask_circ_heads(
    means_dataset: Dataset,
    model: HookedTransformer,
    circuit: Dict[str, List[Tuple[int, int]]],
    seq_pos_to_keep: Dict[str, str],
) -> Dict[int, Bool[Tensor, "batch seq head"]]:
    '''
    Build, for every layer, a boolean mask of the heads that must not be ablated.

    A head listed under any circuit entry is kept at every position. The mask's
    sequence dimension equals the number of circuit entries.

    Args:
        means_dataset: supplies the batch size and the `word_idx` table.
        model: supplies `cfg.n_layers` and `cfg.n_heads`.
        circuit: group label -> list of (layer, head) pairs to keep.
        seq_pos_to_keep: group label -> key of `means_dataset.word_idx`.

    Returns:
        Dict layer index -> bool tensor of shape (batch, len(circuit), n_heads).
    '''
    n_prompts = len(means_dataset)
    n_positions = len(circuit.keys())
    n_heads = model.cfg.n_heads

    keep_by_layer = {}
    for layer in range(model.cfg.n_layers):
        layer_mask = t.zeros(size=(n_prompts, n_positions, n_heads))

        for group_name, group_heads in circuit.items():
            # The position lookup is kept (it fails on unknown labels) even
            # though the looked-up indices are not used: heads are kept at all
            # positions.
            _query_positions = means_dataset.word_idx[seq_pos_to_keep[group_name]]
            for head_idx in (h for (l, h) in group_heads if l == layer):
                layer_mask[:, :, head_idx] = 1

        keep_by_layer[layer] = layer_mask.bool()

    return keep_by_layer

In [None]:
def hook_func_mask_head(
    z: Float[Tensor, "batch seq head d_head"],
    hook: HookPoint,
    circuit: Dict[str, List[Tuple[int, int]]],
) -> Float[Tensor, "batch seq head d_head"]:
    '''
    Zero-ablate every head of this layer that is not listed in `circuit`.

    Heads that appear as (layer, head) under any circuit entry, with `layer`
    equal to `hook.layer()`, pass through unchanged at all positions; every
    other head's output is replaced by 0. The circuit's keys are ignored.

    Args:
        z: per-head attention output for one layer.
        hook: hook point being run; only `hook.layer()` is used.
        circuit: label -> list of (layer, head) pairs to keep.

    Returns:
        Tensor with the same shape as `z`.
    '''
    this_layer = hook.layer()

    # The keep decision only depends on the head index, so a 1-D mask built
    # directly on z's device is enough; it is broadcast over batch, position
    # and d_head below.
    keep_head = t.zeros(z.shape[2], dtype=t.bool, device=z.device)
    for head_list in circuit.values():
        for (layer_idx, head_idx) in head_list:
            if layer_idx == this_layer:
                keep_head[head_idx] = True

    return t.where(keep_head.view(1, 1, -1, 1), z, 0)

In [None]:
def add_ablation_hook_head(
    model: HookedTransformer,
    means_dataset: Dataset,
    circuit: Dict[str, List[Tuple[int, int]]],
    seq_pos_to_keep: Dict[str, str],
    is_permanent: bool = True,
) -> HookedTransformer:
    '''
    Install a hook that ablates all attention heads except those in `circuit`.

    All existing hooks (including permanent ones) are removed first. The hook
    is `hook_func_mask_head`, attached to every hook point whose name ends
    with "z".

    Args:
        model: model to hook; it is modified in place and also returned.
        means_dataset: not used by the installed hook; kept for interface
            compatibility.
        circuit: label -> list of (layer, head) pairs that must not be ablated.
        seq_pos_to_keep: not used by the installed hook; kept for interface
            compatibility.
        is_permanent: forwarded to `model.add_hook`.

    Returns:
        The same `model`, with the hook attached.
    '''
    model.reset_hooks(including_permanent=True)

    # The activation means and the keep-mask used to be computed here but were
    # never passed to the hook, so those computations are omitted.
    hook_fn = partial(hook_func_mask_head, circuit=circuit)

    model.add_hook(lambda name: name.endswith("z"), hook_fn, is_permanent=is_permanent)
    return model

In [None]:
# from dataset import Dataset
from transformer_lens import HookedTransformer, utils
from transformer_lens.hook_points import HookPoint
import einops
from functools import partial
import torch as t
from torch import Tensor
from typing import Dict, Tuple, List
from jaxtyping import Float, Bool

# from head_ablation_fns import *
# from mlp_ablation_fns import *

def add_ablation_hook_MLP_head(
    model: HookedTransformer,
    means_dataset: Dataset,
    heads_lst, mlp_lst,
    is_permanent: bool = True,
) -> HookedTransformer:
    '''
    Install a hook that ablates every attention head not in `heads_lst`.

    All existing hooks (including permanent ones) are removed first, then
    `hook_func_mask_head` is attached to every hook point whose name ends with
    "z". No MLP hook is installed.

    Args:
        model: model to hook; it is modified in place and also returned.
        means_dataset: not used by the installed hook; kept for interface
            compatibility.
        heads_lst: list of (layer, head) pairs that must NOT be ablated.
        mlp_lst: unused, because MLP ablation is disabled; kept for interface
            compatibility.
        is_permanent: forwarded to `model.add_hook`.

    Returns:
        The same `model`, with the hook attached.
    '''
    model.reset_hooks(including_permanent=True)

    # hook_func_mask_head ignores the circuit's keys and keeps listed heads at
    # every position, so a single entry is equivalent to the former dict that
    # repeated `heads_lst` once per token position. The activation means and
    # keep-mask that used to be computed here were never passed to the hook.
    CIRCUIT = {'heads': heads_lst}
    hook_fn = partial(hook_func_mask_head, circuit=CIRCUIT)

    model.add_hook(lambda name: name.endswith("z"), hook_fn, is_permanent=is_permanent)

    return model

In [None]:
def all_entries_true(tensor_dict):
    """Return True when every tensor in `tensor_dict` has only truthy elements.

    An empty dict yields True. Evaluation stops at the first tensor that
    contains a falsy element.
    """
    return all(torch.all(entry).item() for entry in tensor_dict.values())

# Ablation functions for multi-token answers

In [None]:
def clean_gen(model, clean_text, corr_ans, max_new_tokens=10):
    """Greedily generate from the un-ablated model and sum the chosen logits.

    All hooks (including permanent ones) are removed before generating.
    Generation stops as soon as the concatenated generated strings equal
    `corr_ans`, or after `max_new_tokens` steps.

    Args:
        model: HookedTransformer-like object (`reset_hooks`, `to_tokens`,
            `to_string`, callable on a token tensor).
        clean_text: prompt string.
        corr_ans: expected answer string used as the stop condition.
        max_new_tokens: upper bound on generated tokens (default 10, the
            previously hard-coded value).

    Returns:
        Python float: the sum over steps of the logit of the token that was
        actually predicted (argmax) at that step.

    Raises:
        ValueError: if `max_new_tokens` is smaller than 1.
    """
    if max_new_tokens < 1:
        raise ValueError("max_new_tokens must be at least 1")

    model.reset_hooks(including_permanent=True)  # must do this after running with mean ablation hook
    tokens = model.to_tokens(clean_text).to(device)

    total_score = 0
    ans_so_far = ''
    for _ in range(max_new_tokens):
        logits = model(tokens)
        next_token = logits[0, -1].argmax(dim=-1)  # greedy prediction at the last position

        # Score the predicted token itself, not a reference answer token.
        total_score += logits[:, -1, next_token]

        ans_so_far += model.to_string(next_token)
        if ans_so_far == corr_ans:
            break

        # Extend the input with the token just generated.
        tokens = torch.cat([tokens, next_token[None, None]], dim=-1)

    return total_score.item()

In [None]:
def ablate_then_gen(model, clean_text, corr_text, heads_not_ablate, mlps_not_ablate, corr_ans_tokLen):
    """Ablate the model, greedily generate `corr_ans_tokLen` tokens, print the result.

    The ablation hook is installed through `add_ablation_hook_MLP_head` with
    its default `is_permanent=True`, so it stays on `model` after this
    function returns.

    Args:
        model: HookedTransformer to ablate and run.
        clean_text: prompt to continue.
        corr_text: text used to build the dataset handed to the ablation setup.
        heads_not_ablate: (layer, head) pairs left intact.
        mlps_not_ablate: forwarded to `add_ablation_hook_MLP_head`.
        corr_ans_tokLen: number of tokens to generate.

    Returns:
        None. When at least one token is generated, the prompt followed by all
        generated tokens is printed via `model.to_string`.
    """
    tokens = model.to_tokens(clean_text).to(device)

    corr_tokens = model.to_tokens(corr_text).to(device)
    corr_prompts = generate_prompts_list_longer(corr_text, corr_tokens)

    model.reset_hooks(including_permanent=True)  # must do this after running with mean ablation hook
    num_pos = len(model.tokenizer(corr_prompts[0]['text']).input_ids)
    pos_dict = {'S' + str(i): i for i in range(num_pos)}
    corr_dataset = Dataset(corr_prompts, pos_dict, model.tokenizer)
    model = add_ablation_hook_MLP_head(model, corr_dataset, heads_not_ablate, mlps_not_ablate)

    # Exactly one forward pass per generated token. The previous loop printed
    # the sequence before appending the final token (so the last answer token
    # never appeared in the output) and then ran one more forward pass whose
    # result was discarded.
    for _ in range(corr_ans_tokLen):
        logits = model(tokens)
        next_token = logits[0, -1].argmax(dim=-1)  # greedy prediction at the last position
        tokens = torch.cat([tokens, next_token[None, None]], dim=-1)

    if corr_ans_tokLen > 0:
        print(model.to_string(tokens))

In [None]:
# Scratch check: tokenize a two-character answer with the notebook-level
# tokenizer and drop the first piece; the recorded output is ['1', '0'].
answer_str = '10'
ans_str_tok = tokenizer.tokenize(answer_str)[1:]
ans_str_tok

['1', '0']

In [None]:
# Scratch check: map each answer piece from the previous cell to a token id.
corr_tokenIDs = []
for ansPos in range(len(ans_str_tok)):
    # Earlier per-prompt variant, kept for reference:
    # ansPos_corrTokIDS = [] # this is the inner list. each member is a promptID
    # for promptID in range(len(self.prompts)):
    #     tokID = self.tokenizer.encode(self.prompts[promptID]['corr'][ansPos])[2:][0] # 2: to skip padding <s> and ''
    #     ansPos_corrTokIDS.append(tokID)
    # self.corr_tokenIDs.append(ansPos_corrTokIDS)

    # Skip the first two encoded ids (per the original note: <s> and '') and
    # take the next one as the id of this answer piece.
    tokID = model.tokenizer.encode(ans_str_tok[ansPos])[2:][0] # 2: to skip padding <s> and ''
    corr_tokenIDs.append(tokID)
corr_tokenIDs

[29896, 29900]

In [None]:
def ablate_auto_score(model, clean_text, corr_text, heads_not_ablate, mlps_not_ablate, correct_ans, return_score=False):
    """Ablate the model, greedily generate an answer and return it as a string.

    One token is generated per token of `correct_ans`. The ablation hook is
    installed through `add_ablation_hook_MLP_head` with its default
    `is_permanent=True`, so it stays on `model` after this function returns.

    Args:
        model: HookedTransformer to ablate and run.
        clean_text: prompt to continue.
        corr_text: text used to build the dataset handed to the ablation setup.
        heads_not_ablate: (layer, head) pairs left intact.
        mlps_not_ablate: forwarded to `add_ablation_hook_MLP_head`.
        correct_ans: expected answer string; sets the number of generated
            tokens and the reference token ids used for scoring.
        return_score: when True, also return the summed logit of the reference
            answer tokens. Defaults to False, which keeps the original
            string-only return value.

    Returns:
        The generated answer string (an empty decoded token is recorded as a
        single space), or `(answer, score)` when `return_score` is True.
    """
    tokens = model.to_tokens(clean_text).to(device)

    corr_tokens = model.to_tokens(corr_text).to(device)
    corr_prompts = generate_prompts_list_longer(corr_text, corr_tokens)

    model.reset_hooks(including_permanent=True)  # must do this after running with mean ablation hook
    num_pos = len(model.tokenizer(corr_prompts[0]['text']).input_ids)
    pos_dict = {'S' + str(i): i for i in range(num_pos)}
    corr_dataset = Dataset(corr_prompts, pos_dict, model.tokenizer)
    model = add_ablation_hook_MLP_head(model, corr_dataset, heads_not_ablate, mlps_not_ablate)

    # Split the answer into pieces with the notebook-level `tokenizer`, drop
    # the first piece, then look up each piece's id.
    # NOTE(review): this mixes the global `tokenizer` with `model.tokenizer`;
    # presumably they are the same object — confirm.
    answer_pieces = tokenizer.tokenize(correct_ans)[1:]  # correct_ans is str
    answer_token_ids = [
        model.tokenizer.encode(piece)[2:][0]  # 2: to skip padding <s> and ''
        for piece in answer_pieces
    ]

    total_score = 0.0
    ans_so_far = ''
    # Iterate over the reference ids directly; the earlier version incremented
    # its own loop bound inside the loop, which had no effect on the range.
    for answer_token_id in answer_token_ids:
        logits = model(tokens)
        next_token = logits[0, -1].argmax(dim=-1)  # greedy prediction at the last position
        next_char = model.to_string(next_token)
        if next_char == '':
            next_char = ' '
        ans_so_far += next_char

        if return_score:
            # Score the reference answer token, not the predicted one.
            total_score += logits[0, -1, answer_token_id].item()

        tokens = torch.cat([tokens, next_token[None, None]], dim=-1)

    if return_score:
        return ans_so_far, total_score
    return ans_so_far

# Define circs

In [None]:
# Copied from Llama2_numerals_1to10.ipynb.
# Presumably (layer, head) attention-head pairs — confirm against that notebook.
nums_1to9 = [(0, 2), (0, 5), (0, 6), (0, 15), (1, 15), (1, 28), (2, 13), (2, 24), (3, 24), (4, 3), (4, 16), (5, 11), (5, 13), (5, 15), (5, 16), (5, 23), (5, 25), (5, 27), (6, 11), (6, 14), (6, 20), (6, 23), (6, 24), (6, 26), (6, 28), (6, 30), (6, 31), (7, 0), (7, 13), (7, 21), (7, 30), (8, 0), (8, 2), (8, 12), (8, 15), (8, 26), (8, 27), (8, 30), (8, 31), (9, 15), (9, 16), (9, 23), (9, 26), (9, 27), (9, 29), (9, 31), (10, 1), (10, 13), (10, 18), (10, 23), (10, 29), (11, 7), (11, 8), (11, 9), (11, 17), (11, 18), (11, 25), (11, 28), (12, 18), (12, 19), (12, 23), (12, 27), (13, 6), (13, 11), (13, 20), (14, 18), (14, 19), (14, 20), (14, 21), (16, 0), (18, 19), (18, 21), (18, 25), (18, 26), (18, 31), (19, 28), (20, 17), (21, 0), (21, 2), (22, 18), (22, 20), (22, 25), (23, 27), (26, 2)]
len(nums_1to9)

84

In [None]:
# nw_circ = [(0, 1), (0, 4), (0, 6), (0, 7), (0, 8), (0, 10), (0, 11), (0, 12), (1, 16), (1, 24), (1, 27), (1, 28), (2, 2), (2, 5), (2, 8), (2, 24), (2, 30), (3, 7), (3, 14), (3, 19), (3, 23), (4, 3), (5, 16), (5, 25), (6, 11), (6, 14), (7, 0), (7, 30), (8, 0), (8, 2), (8, 3), (8, 4), (8, 6), (8, 21), (8, 31), (9, 1), (9, 3), (9, 7), (9, 11), (9, 29), (9, 31), (10, 13), (10, 18), (10, 23), (10, 24), (10, 25), (10, 27), (11, 18), (11, 28), (12, 18), (12, 26), (13, 11), (13, 17), (13, 18), (13, 19), (13, 20), (13, 21), (13, 23), (14, 7), (14, 14), (15, 25), (15, 28), (16, 0), (16, 12), (16, 14), (16, 15), (16, 16), (16, 19), (16, 24), (16, 29), (17, 17), (17, 23), (17, 31), (18, 31), (19, 12), (20, 17), (27, 20), (27, 25), (27, 27), (27, 31), (28, 5), (29, 5)]
# Ordered from most to least important, judged by how much performance changes
# when each entry is ablated.
# Presumably (layer, head) attention-head pairs — TODO confirm.
nw_circ = [(20, 17), (5, 25), (16, 0), (29, 5), (3, 19), (6, 11), (15, 25), (8, 0), (16, 24), (8, 4), (7, 0), (6, 14), (16, 29), (5, 16), (12, 26), (4, 3), (3, 7), (7, 30), (11, 28), (28, 5), (17, 31), (13, 11), (13, 20), (12, 18), (1, 27), (10, 13), (18, 31), (8, 6), (9, 1), (0, 4), (2, 2), (9, 11), (19, 12), (1, 16), (13, 17), (9, 7), (11, 18), (2, 24), (10, 18), (9, 31), (9, 29), (2, 30), (2, 5), (1, 24), (2, 8), (15, 28), (27, 31), (16, 14), (3, 23), (3, 14), (10, 23), (27, 20), (8, 3), (14, 7), (14, 14), (16, 15), (8, 2), (17, 17), (0, 1), (10, 27), (16, 19), (0, 8), (0, 12), (1, 28), (0, 11), (17, 23), (0, 10), (0, 6), (13, 19), (8, 31), (10, 24), (16, 12), (13, 23), (13, 21), (27, 27), (9, 3), (27, 25), (16, 16), (8, 21), (0, 7), (13, 18), (10, 25)]
len(nw_circ)

82

In [None]:
# impt_months_heads = ([(23, 17), (17, 11), (16, 0), (26, 14), (18, 9), (5, 25), (22, 20), (6, 24), (26, 9), (12, 18), (13, 20), (19, 12), (27, 29), (13, 14), (16, 14), (12, 26), (19, 30), (16, 18), (31, 27), (26, 28), (16, 1), (18, 1), (19, 28), (18, 31), (29, 4), (17, 0), (14, 1), (17, 12), (12, 15), (28, 16), (10, 1), (16, 19), (9, 27), (30, 1), (19, 27), (0, 3), (15, 11), (21, 3), (11, 19), (12, 0), (23, 11), (8, 14), (16, 8), (22, 13), (13, 3), (4, 19), (14, 15), (12, 20), (19, 16), (18, 5)])
# Third list of pairs, combined with nums_1to9 and nw_circ in the cells below.
# Presumably (layer, head) attention-head pairs — TODO confirm.
months_circ = [(20, 17), (6, 11), (16, 0), (5, 15), (17, 11), (23, 16), (5, 25), (7, 0), (26, 14), (6, 14), (12, 22), (8, 4), (12, 15), (16, 29), (15, 25), (5, 16), (18, 31), (14, 7), (11, 18), (4, 12), (3, 19), (12, 2), (11, 28), (4, 3), (18, 9), (8, 14), (12, 3), (11, 2), (10, 13), (4, 16), (1, 22), (11, 16), (3, 15), (13, 31), (2, 4), (2, 16), (8, 13), (0, 13), (8, 15), (12, 28), (1, 5), (0, 4), (0, 25), (3, 24), (13, 11), (1, 24), (8, 16), (13, 8), (3, 26), (0, 6), (3, 23), (1, 3), (14, 3), (8, 19), (8, 12), (14, 2), (8, 5), (1, 28), (8, 20), (2, 30), (8, 6), (10, 1), (13, 20), (19, 27)]
len(months_circ)

64

In [None]:
# Pairs that appear in all three lists (the list order follows set iteration order).
intersect_all = list(set(nums_1to9) & set(nw_circ) & set(months_circ))
len(intersect_all)

16

In [None]:
# Pairs that appear in at least one of the three lists (the list order follows set iteration order).
union_all = list(set(nums_1to9) | set(nw_circ) | set(months_circ))
len(union_all)

172

# Wrap the ablation scoring in reusable functions

In [None]:
def ablate_circ_autoScore(model, circuit, sequences_as_str, next_members):
    """Ablate `circuit` and report the fraction of prompts still answered correctly.

    For every (prompt, expected answer) pair the heads in `circuit` are
    ablated, an answer is generated with `ablate_auto_score`, and the pair
    "expected generated" is printed.

    Args:
        model: HookedTransformer; `cfg.n_layers` and `cfg.n_heads` define the
            full set of heads.
        circuit: iterable of (layer, head) pairs to ablate.
        sequences_as_str: prompt strings.
        next_members: expected answer strings, aligned with `sequences_as_str`.

    Returns:
        Number of exact matches divided by `len(next_members)`.

    Raises:
        ZeroDivisionError: if `next_members` is empty.
    """
    corr_text = "5 3 9"

    # The set of heads to keep is the same for every prompt, so build it once.
    # Entries are normalized to tuples so membership tests behave the same for
    # tuple and list pairs.
    n_layers, n_heads = model.cfg.n_layers, model.cfg.n_heads
    heads_to_ablate = set(map(tuple, circuit))
    heads_not_ablate = [
        (layer, head)
        for layer in range(n_layers)
        for head in range(n_heads)
        if (layer, head) not in heads_to_ablate
    ]
    mlps_not_ablate = list(range(n_layers))

    score = 0
    for clean_text, correct_ans in zip(sequences_as_str, next_members):
        # ablate_auto_score returns only the generated string. Unpacking it
        # into two names split a two-character answer into its characters and
        # raised ValueError for any other length.
        output_after_ablate = ablate_auto_score(model, clean_text, corr_text, heads_not_ablate, mlps_not_ablate, correct_ans)
        print(correct_ans, output_after_ablate)
        if correct_ans == output_after_ablate:
            score += 1

    return score / len(next_members)

In [None]:
def ablate_randcirc_autoScore(model, sequences_as_str, next_members, num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap,
                              *, n_layers=32, n_heads=32, corr_text="5 3 9"):
    """Average prompt score when random head sets are removed instead of a circuit.

    For every prompt, `num_rand_runs` random sets of `num_heads_rand` heads are
    drawn (via `choose_heads_to_remove`) from the heads that are not in
    `heads_not_overlap`; each set is removed from the kept-head list passed to
    `ablate_auto_score`, and the output is compared with the expected answer.

    Args:
        model: Model object forwarded unchanged to `ablate_auto_score`.
        sequences_as_str: Prompts, one string per sequence.
        next_members: Expected completion (string) for each prompt; same length
            as `sequences_as_str`.
        num_rand_runs: Number of random head sets drawn per prompt (>= 1).
        heads_not_overlap: Hashable (layer, head) pairs excluded from the sampling pool.
        num_heads_rand: Size of each random head set.
        num_not_overlap: Overlap bound forwarded to `choose_heads_to_remove`.
            Because the pool already excludes `heads_not_overlap`, the overlap
            is always 0 here.
        n_layers: Number of layers used to enumerate heads and MLPs (default 32).
        n_heads: Number of heads per layer (default 32).
        corr_text: Text forwarded to `ablate_auto_score` as its third argument
            (presumably the corrupted prompt - confirm against that helper).

    Returns:
        Mean over prompts of the per-prompt fraction of random runs answered
        correctly, or 0.0 when there are no prompts.

    Raises:
        ValueError: If prompts and answers differ in length, or `num_rand_runs` < 1.
    """
    if len(sequences_as_str) != len(next_members):
        raise ValueError(
            "sequences_as_str and next_members must have the same length "
            f"(got {len(sequences_as_str)} and {len(next_members)})"
        )
    if num_rand_runs < 1:
        raise ValueError("num_rand_runs must be at least 1")
    if len(next_members) == 0:
        return 0.0

    # Both lists are independent of the prompt and of the run: build them once.
    all_possible_pairs = [(layer, head) for layer in range(n_layers) for head in range(n_heads)]
    excluded = set(heads_not_overlap)
    filtered_pairs = [pair for pair in all_possible_pairs if pair not in excluded]

    all_scores = []
    for clean_text, correct_ans in zip(sequences_as_str, next_members):
        prompt_score = 0
        for _ in range(num_rand_runs):
            # Same sampling call, in the same order, as before, so a seeded RNG
            # yields the same head sets.
            head_to_remove = set(
                choose_heads_to_remove(filtered_pairs, heads_not_overlap, num_heads_rand, num_not_overlap)
            )
            heads_not_ablate = [x for x in all_possible_pairs if x not in head_to_remove]

            result = ablate_auto_score(model, clean_text, corr_text,
                                       heads_not_ablate, list(range(n_layers)), correct_ans)
            # ablate_circ_autoScore unpacks an (output, score) tuple from this
            # helper; accept either return shape so a tuple does not silently
            # compare unequal to the answer string.
            output_after_ablate = result[0] if isinstance(result, tuple) else result
            if correct_ans == output_after_ablate:
                prompt_score += 1
        print(clean_text)
        all_scores.append(prompt_score / num_rand_runs)

    return sum(all_scores) / len(next_members)

In [None]:
def ablate_randcirc_autoScore_2(model, sequences_as_str, next_members, lst_rand_head_to_remove,
                                *, n_layers=32, n_heads=32, corr_text="5 3 9"):
    """Average prompt score over a fixed, pre-sampled list of random head sets.

    Unlike `ablate_randcirc_autoScore`, the head sets are supplied by the caller,
    so every prompt is evaluated against the same sets.

    Args:
        model: Model object forwarded unchanged to `ablate_auto_score`.
        sequences_as_str: Prompts, one string per sequence.
        next_members: Expected completion (string) for each prompt; same length
            as `sequences_as_str`.
        lst_rand_head_to_remove: Non-empty list of head sets; each is an iterable
            of hashable (layer, head) pairs to remove from the kept-head list.
        n_layers: Number of layers used to enumerate heads and MLPs (default 32).
        n_heads: Number of heads per layer (default 32).
        corr_text: Text forwarded to `ablate_auto_score` as its third argument
            (presumably the corrupted prompt - confirm against that helper).

    Returns:
        Mean over prompts of the per-prompt fraction of head sets answered
        correctly, or 0.0 when there are no prompts.

    Raises:
        ValueError: If prompts and answers differ in length, or no head sets are given.
    """
    if len(sequences_as_str) != len(next_members):
        raise ValueError(
            "sequences_as_str and next_members must have the same length "
            f"(got {len(sequences_as_str)} and {len(next_members)})"
        )
    if len(lst_rand_head_to_remove) == 0:
        raise ValueError("lst_rand_head_to_remove must contain at least one head set")
    if len(next_members) == 0:
        return 0.0

    # Previously read from a notebook global created by a later cell; define it
    # here so the function works on a fresh kernel.
    all_possible_pairs = [(layer, head) for layer in range(n_layers) for head in range(n_heads)]

    # The kept-head list for each random set is the same for every prompt.
    kept_head_lists = []
    for head_to_remove in lst_rand_head_to_remove:
        removed = set(head_to_remove)
        kept_head_lists.append([x for x in all_possible_pairs if x not in removed])

    all_scores = []
    for clean_text, correct_ans in zip(sequences_as_str, next_members):
        prompt_score = 0
        for kept_heads in kept_head_lists:
            # Fresh list copies per call, in case the helper mutates its arguments.
            result = ablate_auto_score(model, clean_text, corr_text,
                                       list(kept_heads), list(range(n_layers)), correct_ans)
            # ablate_circ_autoScore unpacks an (output, score) tuple from this
            # helper; accept either return shape.
            output_after_ablate = result[0] if isinstance(result, tuple) else result
            if correct_ans == output_after_ablate:
                prompt_score += 1
        print(clean_text)
        all_scores.append(prompt_score / len(lst_rand_head_to_remove))

    return sum(all_scores) / len(next_members)

In [None]:
def gen_intervaled_seqs(interval, start, num_prompts):
    """Build overlapping three-term arithmetic prompts and their expected next term.

    Prompt k consists of three terms spaced by `interval`, beginning at `start`
    advanced by `interval` k times; the terms are space-separated and followed by
    a trailing space. The matching answer is the fourth term as a string.
    Both lists are printed before being returned.

    Returns:
        (sequences_as_str, next_members): two lists of `num_prompts` strings.
    """
    sequences_as_str = []
    next_members = []

    first_term = start
    for _ in range(num_prompts):
        terms = [first_term, first_term + interval, first_term + interval*2]
        # Trailing space so the prompt ends right where the next number begins.
        sequences_as_str.append(" ".join(str(term) for term in terms) + " ")
        next_members.append(str(first_term + interval*3))
        first_term += interval  # consecutive prompts overlap in two terms

    print("Sequences:")
    print(sequences_as_str)
    print("\nNext Members:")
    print(next_members)
    return sequences_as_str, next_members

# choose rand circs

In [None]:
def choose_heads_to_remove(filtered_pairs, heads_of_circ, num_pairs=50, max_overlap=10):
    """Randomly pick `num_pairs` heads with fewer than `max_overlap` circuit heads.

    Samples without replacement from `filtered_pairs` and redraws until the
    number of sampled heads that also appear in `heads_of_circ` is strictly
    below `max_overlap`.

    Args:
        filtered_pairs: Sequence of candidate (layer, head) pairs to sample from.
        heads_of_circ: Collection of heads whose presence in the sample is limited.
        num_pairs: Number of heads to draw (default 50).
        max_overlap: Exclusive upper bound on sampled heads that are in
            `heads_of_circ` (default 10).

    Returns:
        List of `num_pairs` distinct entries of `filtered_pairs`.

    Raises:
        ValueError: If no sample can satisfy the overlap bound (the original loop
            would never terminate), or if `random.sample` rejects `num_pairs`.
    """
    # Smallest overlap any sample can have: heads forced in once all the
    # non-circuit candidates are used up. If even that violates the bound, the
    # rejection loop below could never succeed.
    circ_in_pool = sum(1 for head in filtered_pairs if head in heads_of_circ)
    non_circ_in_pool = len(filtered_pairs) - circ_in_pool
    min_overlap = max(0, num_pairs - non_circ_in_pool)
    if min_overlap >= max_overlap:
        raise ValueError(
            f"cannot draw {num_pairs} heads with fewer than {max_overlap} circuit heads: "
            f"every sample contains at least {min_overlap}"
        )

    while True:
        head_to_remove = random.sample(filtered_pairs, num_pairs)
        overlap_count = sum(1 for head in head_to_remove if head in heads_of_circ)
        if overlap_count < max_overlap:
            return head_to_remove

In [None]:
import random

# Sampling configuration: 50 random sets of 100 heads each, drawn from the
# heads that are not shared by all circuits (`intersect_all`).
num_rand_runs = 50
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)

# Seed the RNG so the sampled head sets are the same on every Restart & Run All.
random.seed(0)

# Candidate pool, built once: every (layer, head) of the 32x32 grid minus the
# shared heads. `all_possible_pairs` stays a module-level name because
# ablate_randcirc_autoScore_2 reads it as a global.
all_possible_pairs = [(layer, head) for layer in range(32) for head in range(32)]
filtered_pairs = [pair for pair in all_possible_pairs if pair not in heads_not_overlap]

lst_rand_head_to_remove = []
for j in range(num_rand_runs):
    head_to_remove = choose_heads_to_remove(filtered_pairs, heads_not_overlap, num_heads_rand, num_not_overlap)
    lst_rand_head_to_remove.append(head_to_remove)

In [None]:
# Persist the sampled head sets to a pickle file in the working directory, then
# hand the file to Colab's `files.download` (the Javascript outputs below come from it).
import pickle
from google.colab import files
with open('lst_rand_head_to_remove.pkl', 'wb') as file:
    pickle.dump(lst_rand_head_to_remove, file)
files.download('lst_rand_head_to_remove.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Print every sampled head set, one list per line.
for lst in lst_rand_head_to_remove:
    print(lst)

[(13, 24), (28, 4), (26, 31), (12, 14), (27, 29), (27, 4), (21, 14), (0, 13), (10, 24), (5, 27), (4, 26), (16, 27), (13, 26), (5, 9), (30, 5), (11, 9), (12, 17), (22, 10), (4, 12), (4, 6), (15, 18), (2, 26), (16, 11), (23, 31), (5, 14), (27, 23), (28, 3), (0, 18), (8, 17), (4, 11), (9, 14), (23, 19), (12, 7), (19, 29), (13, 4), (25, 27), (6, 10), (26, 17), (8, 11), (2, 27), (27, 8), (22, 2), (1, 27), (1, 3), (8, 27), (11, 10), (25, 13), (14, 21), (25, 7), (1, 16), (12, 10), (8, 28), (25, 19), (1, 11), (3, 3), (5, 11), (4, 31), (28, 8), (19, 20), (20, 23), (10, 7), (22, 17), (18, 13), (6, 26), (27, 20), (2, 4), (13, 14), (22, 27), (27, 28), (6, 16), (19, 16), (7, 10), (13, 3), (12, 15), (5, 20), (28, 24), (14, 26), (13, 21), (24, 29), (29, 29), (3, 23), (22, 6), (27, 19), (7, 31), (19, 19), (20, 0), (13, 29), (7, 17), (3, 31), (22, 8), (21, 30), (3, 21), (23, 17), (25, 4), (6, 24), (30, 28), (25, 26), (18, 23), (26, 12), (10, 17)]
[(11, 14), (7, 19), (29, 25), (21, 4), (19, 4), (27, 17)

# (+1) seq

In [None]:
# 50 overlapping three-term prompts with step 1, starting at 1, plus the expected fourth term of each.
interval = 1
start = 1
num_prompts = 50
sequences_as_str, next_members = gen_intervaled_seqs(interval, start, num_prompts)

Sequences:
['1 2 3 ', '2 3 4 ', '3 4 5 ', '4 5 6 ', '5 6 7 ', '6 7 8 ', '7 8 9 ', '8 9 10 ', '9 10 11 ', '10 11 12 ', '11 12 13 ', '12 13 14 ', '13 14 15 ', '14 15 16 ', '15 16 17 ', '16 17 18 ', '17 18 19 ', '18 19 20 ', '19 20 21 ', '20 21 22 ', '21 22 23 ', '22 23 24 ', '23 24 25 ', '24 25 26 ', '25 26 27 ', '26 27 28 ', '27 28 29 ', '28 29 30 ', '29 30 31 ', '30 31 32 ', '31 32 33 ', '32 33 34 ', '33 34 35 ', '34 35 36 ', '35 36 37 ', '36 37 38 ', '37 38 39 ', '38 39 40 ', '39 40 41 ', '40 41 42 ', '41 42 43 ', '42 43 44 ', '43 44 45 ', '44 45 46 ', '45 46 47 ', '46 47 48 ', '47 48 49 ', '48 49 50 ', '49 50 51 ', '50 51 52 ']

Next Members:
['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53']


In [None]:
# Fraction of the current prompts answered correctly with the `nums_1to9` heads removed from the kept-head list.
# The function also prints "expected output" for every prompt.
perc_score = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
print(perc_score)

4 1
5 4
6 5
7 7
8 1
9 1
10 10
11 10
12 10
13 11
14 11
15 14
16 14
17 15
18 16
19 17
20 18
21 19
22 10
23 10
24 11
25 10
26 10
27 20
28 11
29 10
30 20
31 20
32 29
33 31
34 13
35 34
36 33
37 35
38 37
39 33
40 39
41 39
42 41
43 40
44 43
45 44
46 45
47 46
48 47
49 48
50 48
51 50
52 50
53 51
0.04


In [None]:
# Fraction of the current prompts answered correctly with the `nw_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

4 1
5 3
6 3
7 3
8 1
9 1
10 1 
11 0 
12 1 
13 1 
14 1 
15 1 
16 1 
17 10
18 10
19 10
20 10
21 10
22 1 
23 0 
24 1 
25 3 
26 3 
27 3 
28 3 
29 3 
30 10
31 10
32 1 
33 3 
34 3 
35 3 
36 3 
37 3 
38 4 
39 38
40 38
41 30
42 3 
43 0 
44 4 
45 3 
46 3 
47 4 
48 5 
49 3 
50 3 
51 10
52 1 
53 5 


0.0

In [None]:
# Fraction of the current prompts answered correctly with the `months_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

4 2
5 4
6 5
7 6
8 7
9 7
10 9 
11 10
12 10
13 12
14 13
15 14
16 15
17 16
18 16
19 18
20 19
21 19
22 20
23 22
24 23
25 24
26 25
27 26
28 27
29 28
30 29
31 30
32 31
33 32
34 32
35 34
36 35
37 36
38 36
39 38
40 39
41 39
42 42
43 42
44 4 
45 44
46 45
47 46
48 47
49 48
50 49
51 50
52 51
53 52


0.02

In [None]:
# Random baseline: per prompt, 50 random sets of 100 heads are drawn from the heads
# outside `intersect_all`; the result is the mean fraction of runs answered correctly.
num_rand_runs = 50
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

1 2 3 
2 3 4 
3 4 5 
4 5 6 
5 6 7 
6 7 8 
7 8 9 
8 9 10 
9 10 11 
10 11 12 
11 12 13 
12 13 14 
13 14 15 
14 15 16 
15 16 17 
16 17 18 
17 18 19 
18 19 20 
19 20 21 
20 21 22 
21 22 23 
22 23 24 
23 24 25 
24 25 26 
25 26 27 
26 27 28 
27 28 29 
28 29 30 
29 30 31 
30 31 32 
31 32 33 
32 33 34 
33 34 35 
34 35 36 
35 36 37 
36 37 38 
37 38 39 
38 39 40 
39 40 41 
40 41 42 
41 42 43 
42 43 44 
43 44 45 
44 45 46 
45 46 47 
46 47 48 
47 48 49 
48 49 50 
49 50 51 
50 51 52 


0.9699999999999998

# (+2) seq

In [None]:
# 50 overlapping three-term prompts with step 2, starting at 2, plus the expected fourth term of each.
interval = 2
start = 2
num_prompts = 50
sequences_as_str, next_members = gen_intervaled_seqs(interval, start, num_prompts)

Sequences:
['2 4 6 ', '4 6 8 ', '6 8 10 ', '8 10 12 ', '10 12 14 ', '12 14 16 ', '14 16 18 ', '16 18 20 ', '18 20 22 ', '20 22 24 ', '22 24 26 ', '24 26 28 ', '26 28 30 ', '28 30 32 ', '30 32 34 ', '32 34 36 ', '34 36 38 ', '36 38 40 ', '38 40 42 ', '40 42 44 ', '42 44 46 ', '44 46 48 ', '46 48 50 ', '48 50 52 ', '50 52 54 ', '52 54 56 ', '54 56 58 ', '56 58 60 ', '58 60 62 ', '60 62 64 ', '62 64 66 ', '64 66 68 ', '66 68 70 ', '68 70 72 ', '70 72 74 ', '72 74 76 ', '74 76 78 ', '76 78 80 ', '78 80 82 ', '80 82 84 ', '82 84 86 ', '84 86 88 ', '86 88 90 ', '88 90 92 ', '90 92 94 ', '92 94 96 ', '94 96 98 ', '96 98 100 ', '98 100 102 ', '100 102 104 ']

Next Members:
['8', '10', '12', '14', '16', '18', '20', '22', '24', '26', '28', '30', '32', '34', '36', '38', '40', '42', '44', '46', '48', '50', '52', '54', '56', '58', '60', '62', '64', '66', '68', '70', '72', '74', '76', '78', '80', '82', '84', '86', '88', '90', '92', '94', '96', '98', '100', '102', '104', '106']


In [None]:
# Fraction of the current prompts answered correctly with the `nums_1to9` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
print(perc_score)

8 6
10 10
12 10
14 10
16 10
18 14
20 16
22 18
24 10
26 10
28 10
30 20
32 10
34 20
36 33
38 34
40 36
42 40
44 40
46 44
48 46
50 48
52 50
54 50
56 50
58 56
60 56
62 58
64 60
66 64
68 64
70 70
72 70
74 70
76 70
78 74
80 76
82 78
84 82
86 82
88 86
90 88
92 88
94 88
96 92
98 94
100 97 
102 100
104 100
106 100
0.04


In [None]:
# Fraction of the current prompts answered correctly with the `nw_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

8 2
10 1 
12 0 
14 1 
16 1 
18 10
20 10
22 10
24 1 
26 0 
28 2 
30 10
32 10
34 30
36 3 
38 32
40 36
42 3 
44 3 
46 0 
48 4 
50 4 
52 10
54 10
56 0 
58 5 
60 4 
62 10
64 10
66 60
68 6 
70 6 
72 10
74 10
76 70
78 7 
80 7 
82 10
84 80
86 80
88 84
90 86
92 88
94 88
96 90
98 94
100 94 
102 000
104 000
106 000


0.0

In [None]:
# Fraction of the current prompts answered correctly with the `months_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

8 6
10 6 
12 10
14 10
16 14
18 16
20 16
22 18
24 20
26 24
28 26
30 28
32 30
34 32
36 34
38 36
40 36
42 38
44 43
46 44
48 46
50 48
52 50
54 52
56 54
58 56
60 56
62 60
64 60
66 64
68 66
70 68
72 68
74 72
76 75
78 76
80 76
82 78
84 80
86 84
88 86
90 86
92 90
94 92
96 94
98 96
100 96 
102 10 
104 102
106 104


0.0

In [None]:
# Random baseline: per prompt, 50 random sets of 100 heads are drawn from the heads
# outside `intersect_all`; the result is the mean fraction of runs answered correctly.
num_rand_runs = 50
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

2 4 6 
4 6 8 
6 8 10 
8 10 12 
10 12 14 
12 14 16 
14 16 18 
16 18 20 
18 20 22 
20 22 24 
22 24 26 
24 26 28 
26 28 30 
28 30 32 
30 32 34 
32 34 36 
34 36 38 
36 38 40 
38 40 42 
40 42 44 
42 44 46 
44 46 48 
46 48 50 
48 50 52 
50 52 54 
52 54 56 
54 56 58 
56 58 60 
58 60 62 
60 62 64 
62 64 66 
64 66 68 
66 68 70 
68 70 72 
70 72 74 
72 74 76 
74 76 78 
76 78 80 
78 80 82 
80 82 84 
82 84 86 
84 86 88 
86 88 90 
88 90 92 
90 92 94 
92 94 96 
94 96 98 
96 98 100 
98 100 102 
100 102 104 


0.9239999999999998

# (+3) seq

In [None]:
# 50 overlapping three-term prompts with step 3, starting at 3, plus the expected fourth term of each.
interval = 3
start = 3
num_prompts = 50
sequences_as_str, next_members = gen_intervaled_seqs(interval, start, num_prompts)

Sequences:
['3 6 9 ', '6 9 12 ', '9 12 15 ', '12 15 18 ', '15 18 21 ', '18 21 24 ', '21 24 27 ', '24 27 30 ', '27 30 33 ', '30 33 36 ', '33 36 39 ', '36 39 42 ', '39 42 45 ', '42 45 48 ', '45 48 51 ', '48 51 54 ', '51 54 57 ', '54 57 60 ', '57 60 63 ', '60 63 66 ', '63 66 69 ', '66 69 72 ', '69 72 75 ', '72 75 78 ', '75 78 81 ', '78 81 84 ', '81 84 87 ', '84 87 90 ', '87 90 93 ', '90 93 96 ', '93 96 99 ', '96 99 102 ', '99 102 105 ', '102 105 108 ', '105 108 111 ', '108 111 114 ', '111 114 117 ', '114 117 120 ', '117 120 123 ', '120 123 126 ', '123 126 129 ', '126 129 132 ', '129 132 135 ', '132 135 138 ', '135 138 141 ', '138 141 144 ', '141 144 147 ', '144 147 150 ', '147 150 153 ', '150 153 156 ']

Next Members:
['12', '15', '18', '21', '24', '27', '30', '33', '36', '39', '42', '45', '48', '51', '54', '57', '60', '63', '66', '69', '72', '75', '78', '81', '84', '87', '90', '93', '96', '99', '102', '105', '108', '111', '114', '117', '120', '123', '126', '129', '132', '135', '138', '14

In [None]:
# Fraction of the current prompts answered correctly with the `nums_1to9` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
print(perc_score)

12 10
15 12
18 12
21 15
24 18
27 11
30 10
33 10
36 27
39 33
42 33
45 39
48 45
51 45
54 48
57 48
60 54
63 60
66 53
69 63
72 66
75 72
78 75
81 78
84 10
87 77
90 87
93 90
96 87
99 93
102 96 
105 101
108 105
111 100
114 111
117 111
120 111
123 111
126 10 
129 123
132 126
135 129
138 135
141 133
144 13 
147 141
150 147
153 144
156 153
159 153
0.0


In [None]:
# Fraction of the current prompts answered correctly with the `nw_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

12 1 
15 1 
18 1 
21 10
24 1 
27 1 
30 1 
33 10
36 10
39 30
42 3 
45 3 
48 3 
51 4 
54 1 
57 3 
60 1 
63 10
66 10
69 63
72 10
75 10
78 3 
81 7 
84 1 
87 8 
90 81
93 84
96 10
99 90
102 96 
105 000
108 000
111 000
114 005
117 08 
120 111
123 100
126 17 
129 10 
132 10 
135 10 
138 132
141 132
144 10 
147 141
150 141
153 144
156 10 
159 150


0.0

In [None]:
# Fraction of the current prompts answered correctly with the `months_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

12 3 
15 12
18 15
21 18
24 18
27 24
30 27
33 20
36 30
39 36
42 36
45 39
48 45
51 48
54 51
57 54
60 57
63 57
66 60
69 63
72 69
75 6 
78 75
81 75
84 78
87 84
90 88
93 90
96 93
99 93
102 96 
105 9 9
108 105
111 108
114 10 
117 14 
120 117
123 17 
126 123
129 126
132 129
135 132
138 135
141 138
144 141
147 144
150 147
153 150
156 15 
159 156


0.0

In [None]:
# Random baseline: per prompt, 50 random sets of 100 heads are drawn from the heads
# outside `intersect_all`; the result is the mean fraction of runs answered correctly.
num_rand_runs = 50
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

3 6 9 
6 9 12 
9 12 15 
12 15 18 
15 18 21 
18 21 24 
21 24 27 
24 27 30 
27 30 33 
30 33 36 
33 36 39 
36 39 42 
39 42 45 
42 45 48 
45 48 51 
48 51 54 
51 54 57 
54 57 60 
57 60 63 
60 63 66 
63 66 69 
66 69 72 
69 72 75 
72 75 78 
75 78 81 
78 81 84 
81 84 87 
84 87 90 
87 90 93 
90 93 96 
93 96 99 
96 99 102 
99 102 105 
102 105 108 
105 108 111 
108 111 114 
111 114 117 
114 117 120 
117 120 123 
120 123 126 
123 126 129 
126 129 132 
129 132 135 
132 135 138 
135 138 141 
138 141 144 
141 144 147 
144 147 150 
147 150 153 
150 153 156 


0.7240000000000002

# (+10) seq

In [None]:
# 50 overlapping three-term prompts with step 10, starting at 0, plus the expected fourth term of each.
interval = 10
start = 0
num_prompts = 50
sequences_as_str, next_members = gen_intervaled_seqs(interval, start, num_prompts)

Sequences:
['0 10 20 ', '10 20 30 ', '20 30 40 ', '30 40 50 ', '40 50 60 ', '50 60 70 ', '60 70 80 ', '70 80 90 ', '80 90 100 ', '90 100 110 ', '100 110 120 ', '110 120 130 ', '120 130 140 ', '130 140 150 ', '140 150 160 ', '150 160 170 ', '160 170 180 ', '170 180 190 ', '180 190 200 ', '190 200 210 ', '200 210 220 ', '210 220 230 ', '220 230 240 ', '230 240 250 ', '240 250 260 ', '250 260 270 ', '260 270 280 ', '270 280 290 ', '280 290 300 ', '290 300 310 ', '300 310 320 ', '310 320 330 ', '320 330 340 ', '330 340 350 ', '340 350 360 ', '350 360 370 ', '360 370 380 ', '370 380 390 ', '380 390 400 ', '390 400 410 ', '400 410 420 ', '410 420 430 ', '420 430 440 ', '430 440 450 ', '440 450 460 ', '450 460 470 ', '460 470 480 ', '470 480 490 ', '480 490 500 ', '490 500 510 ']

Next Members:
['30', '40', '50', '60', '70', '80', '90', '100', '110', '120', '130', '140', '150', '160', '170', '180', '190', '200', '210', '220', '230', '240', '250', '260', '270', '280', '290', '300', '310', '320

In [None]:
# Fraction of the current prompts answered correctly with the `nums_1to9` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
print(perc_score)

30 10
40 10
50 10
60 00
70 70
80 10
90 10
100 100
110 100
120 100
130 100
140 100
150 100
160 100
170 150
180 100
190 100
200 100
210 100
220 100
230 100
240 100
250 100
260 230
270 260
280 260
290 260
300 270
310 280
320 290
330 310
340 30 
350 330
360 300
370 350
380 333
390 333
400 333
410 000
420 100
430 400
440 430
450 440
460 450
470 460
480 470
490 480
500 480
510 480
520 500
0.04


In [None]:
# Fraction of the current prompts answered correctly with the `nw_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

30 0 
40 0 
50 30
60 40
70 50
80 60
90 70
100 80 
110 000
120 000
130 000
140 10 
150 140
160 10 
170 160
180 10 
190 180
200 10 
210 100
220 000
230 00 
240 20 
250 20 
260 30 
270 30 
280 30 
290 280
300 20 
310 200
320 300
330 30 
340 320
350 340
360 350
370 360
380 370
390 380
400 390
410 300
420 400
430 40 
440 430
450 430
460 450
470 460
480 470
490 480
500 490
510 40 
520 500


0.0

In [None]:
# Fraction of the current prompts answered correctly with the `months_circ` heads removed from the kept-head list.
perc_score = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

30 10
40 30
50 40
60 50
70 60
80 70
90 70
100 90 
110 10 
120 10 
130 10 
140 10 
150 140
160 10 
170 160
180 160
190 180
200 180
210 190
220 20 
230 202
240 20 
250 24 
260 25 
270 260
280 270
290 280
300 290
310 30 
320 30 
330 30 
340 30 
350 30 
360 35 
370 30 
380 36 
390 38 
400 39 
410 30 
420 41 
430 43 
440 43 
450 440
460 45 
470 46 
480 48 
490 48 
500 490
510 50 
520 50 


0.0

In [None]:
# Random baseline: per prompt, 50 random sets of 100 heads are drawn from the heads
# outside `intersect_all`; the result is the mean fraction of runs answered correctly.
num_rand_runs = 50
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

0 10 20 
10 20 30 
20 30 40 
30 40 50 
40 50 60 
50 60 70 
60 70 80 
70 80 90 
80 90 100 
90 100 110 
100 110 120 
110 120 130 
120 130 140 
130 140 150 
140 150 160 
150 160 170 
160 170 180 
170 180 190 
180 190 200 
190 200 210 
200 210 220 
210 220 230 
220 230 240 
230 240 250 
240 250 260 
250 260 270 
260 270 280 
270 280 290 
280 290 300 
290 300 310 
300 310 320 
310 320 330 
320 330 340 
330 340 350 
340 350 360 
350 360 370 
360 370 380 
370 380 390 
380 390 400 
390 400 410 
400 410 420 
410 420 430 
420 430 440 
430 440 450 
440 450 460 
450 460 470 
460 470 480 
470 480 490 
480 490 500 
490 500 510 


0.9559999999999998

# (+100) seq

In [None]:
# 50 overlapping three-term prompts with step 100, starting at 0, plus the expected fourth term of each.
interval = 100
start = 0
num_prompts = 50
sequences_as_str, next_members = gen_intervaled_seqs(interval, start, num_prompts)

Sequences:
['0 100 200 ', '100 200 300 ', '200 300 400 ', '300 400 500 ', '400 500 600 ', '500 600 700 ', '600 700 800 ', '700 800 900 ', '800 900 1000 ', '900 1000 1100 ', '1000 1100 1200 ', '1100 1200 1300 ', '1200 1300 1400 ', '1300 1400 1500 ', '1400 1500 1600 ', '1500 1600 1700 ', '1600 1700 1800 ', '1700 1800 1900 ', '1800 1900 2000 ', '1900 2000 2100 ', '2000 2100 2200 ', '2100 2200 2300 ', '2200 2300 2400 ', '2300 2400 2500 ', '2400 2500 2600 ', '2500 2600 2700 ', '2600 2700 2800 ', '2700 2800 2900 ', '2800 2900 3000 ', '2900 3000 3100 ', '3000 3100 3200 ', '3100 3200 3300 ', '3200 3300 3400 ', '3300 3400 3500 ', '3400 3500 3600 ', '3500 3600 3700 ', '3600 3700 3800 ', '3700 3800 3900 ', '3800 3900 4000 ', '3900 4000 4100 ', '4000 4100 4200 ', '4100 4200 4300 ', '4200 4300 4400 ', '4300 4400 4500 ', '4400 4500 4600 ', '4500 4600 4700 ', '4600 4700 4800 ', '4700 4800 4900 ', '4800 4900 5000 ', '4900 5000 5100 ']

Next Members:
['300', '400', '500', '600', '700', '800', '900', '1

In [None]:
# Fraction of the current prompts answered correctly with the `nums_1to9` heads removed from the kept-head list.
# ablate_circ_autoScore returns a single float, so it must not be unpacked into two names.
perc_score = ablate_circ_autoScore(model, nums_1to9, sequences_as_str, next_members)
perc_score

300 000
400 100
500 100
600 000
700 100


0.0

In [None]:
# Fraction of the current prompts answered correctly with the `nw_circ` heads removed from the kept-head list.
# ablate_circ_autoScore returns a single float, so it must not be unpacked into two names.
perc_score = ablate_circ_autoScore(model, nw_circ, sequences_as_str, next_members)
perc_score

300 000
400 000
500 300
600 300
700 500


0.0

In [None]:
# Fraction of the current prompts answered correctly with the `months_circ` heads removed from the kept-head list.
# ablate_circ_autoScore returns a single float, so it must not be unpacked into two names.
perc_score = ablate_circ_autoScore(model, months_circ, sequences_as_str, next_members)
perc_score

300 101
400 303
500 404
600 505
700 606


0.0

In [None]:
# Random baseline: per prompt, 50 random sets of 100 heads are drawn from the heads
# outside `intersect_all`; the result is the mean fraction of runs answered correctly.
num_rand_runs = 50
heads_not_overlap = intersect_all
num_heads_rand = 100
num_not_overlap = len(intersect_all)
perc_score = ablate_randcirc_autoScore(model, sequences_as_str, next_members,
                                                    num_rand_runs, heads_not_overlap, num_heads_rand, num_not_overlap)
perc_score

0 100 200 
100 200 300 
200 300 400 
300 400 500 
400 500 600 
500 600 700 
600 700 800 
700 800 900 
800 900 1000 
900 1000 1100 
1000 1100 1200 
1100 1200 1300 
1200 1300 1400 
1300 1400 1500 
1400 1500 1600 
1500 1600 1700 
1600 1700 1800 
1700 1800 1900 
1800 1900 2000 
1900 2000 2100 
2000 2100 2200 
2100 2200 2300 
2200 2300 2400 
2300 2400 2500 
2400 2500 2600 
2500 2600 2700 
2600 2700 2800 
2700 2800 2900 
2800 2900 3000 
2900 3000 3100 
3000 3100 3200 
3100 3200 3300 
3200 3300 3400 
3300 3400 3500 
3400 3500 3600 
3500 3600 3700 
3600 3700 3800 
3700 3800 3900 
3800 3900 4000 
3900 4000 4100 
4000 4100 4200 
4100 4200 4300 
4200 4300 4400 
4300 4400 4500 
4400 4500 4600 
4500 4600 4700 
4600 4700 4800 
4700 4800 4900 
4800 4900 5000 
4900 5000 5100 


0.9119999999999996