# Setup

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:

# Uncomment the %cd below when executing on a remote JupyterHub, where the working directory starts at the server root; otherwise the working directory should already be set correctly.
# %cd CD_Circuit/

import argparse
import numpy as np
import os
import pandas as pd
import scipy as sp
import sys
import torch
import torch.nn.functional as F
import warnings
import random
import collections

# CD-T Imports
import math
import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import itertools
import operator

from torch import nn

# Silence every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Put the parent of the current working directory on the import path
# (presumably so the project-local packages imported below resolve).
base_dir = os.path.dirname(os.getcwd())
sys.path.append(base_dir)

from argparse import Namespace
from methods.bag_of_ngrams.processing import cleanReports, cleanSplit, stripChars
from pyfunctions.general import extractListFromDic, readJson, combine_token_attn, compute_word_intervals, compare_same
from pyfunctions.pathology import extract_synoptic, fixLabelProstateGleason, fixProstateLabels, fixLabel, exclude_labels
from pyfunctions.cdt_basic import *
from pyfunctions.cdt_source_to_target import *
from pyfunctions.ioi_dataset import IOIDataset
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AutoTokenizer, AutoModel
from transformers import GPT2Tokenizer, GPT2Model
from pyfunctions.wrappers import Node, AblationSet

In [2]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# The cells visible in this notebook only run forward passes, so autograd is
# switched off globally.
torch.autograd.set_grad_enabled(False)

# Model code adapted from Callum McDougall's notebook for ARENA on reproducing the IOI paper using TransformerLens.
# This makes some sense, since EasyTransformer, the repo/lib released by the IOI authors, was forked from TransformerLens.
# In fact, this makes the reproduction a little more faithful, since they most likely do certain things such as
# "folding" LayerNorms to improve their interpretability results, and we are able to do the same by using TransformerLens.
# HuggingFace, by contrast, has hard-to-follow docs and many outdated APIs; its source code is also difficult to
# traverse, so that route was abandoned early.

from transformer_lens import utils, HookedTransformer, ActivationCache

# NOTE(review): the comment above mentions folding LayerNorms, yet fold_ln is
# False here -- confirm that this is intentional.
# `device` is passed explicitly so the model lives on the same device that the
# rest of the notebook uses for its inputs, instead of relying on the
# library's implicit default.
model = HookedTransformer.from_pretrained(
    "gpt2-small",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=False,
    refactor_factored_attn_matrices=True,
    device=device,
)
                                          

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Loaded pretrained model gpt2-small into HookedTransformer


In [3]:
from pyfunctions.ioi_dataset import IOIDataset

# Generate a small dataset consisting of a single, randomly chosen template.
# nb_templates = 2 due to some logic internal to IOIDataset:
# essentially, the names can appear in ABBA or ABAB order, and those count as separate templates.
ioi_dataset = IOIDataset(prompt_type="mixed", N=3, tokenizer=model.tokenizer, prepend_bos=False, nb_templates=2)

# This is the P_ABC that is mentioned in the IOI paper, which we use for mean ablation.
# Importantly, passing in prompt_type="ABC" or similar is NOT the same thing as this.
abc_dataset = (
    ioi_dataset.gen_flipped_prompts(("IO", "RAND"))
    .gen_flipped_prompts(("S", "RAND"))
    .gen_flipped_prompts(("S1", "RAND"))
)

# Run on the entire dataset along the batch dimension.
logits, cache = model.run_with_cache(abc_dataset.toks)

# Gather the attention `hook_z` activations of every layer. The layer count is
# read from the model config rather than hard-coded to 12, so the cell keeps
# working if a model of a different depth is loaded.
per_layer_z = [cache[f'blocks.{layer}.attn.hook_z'] for layer in range(model.cfg.n_layers)]
attention_outputs = torch.stack(per_layer_z, dim=1)  # now batch, layer, seq, n_heads, dim_attn
# Average over the batch dimension.
mean_acts = torch.mean(attention_outputs, dim=0)
# Merge the trailing (n_heads, dim_attn) axes into a single axis.
old_shape = mean_acts.shape
last_dim = old_shape[-2] * old_shape[-1]
new_shape = old_shape[:-2] + (last_dim,)
mean_acts = mean_acts.view(new_shape)
mean_acts.shape

2024-09-26 02:17:44.339763: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-26 02:17:46.323034: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


torch.Size([12, 16, 768])

In [4]:

# source_list = [Node(0, 0, 0), Node(1, 1, 1)]
# target_nodes = [(7, 0, 1)]

# Tokenize the first prompt of the dataset as a single-example batch.
text = ioi_dataset.sentences[0]
# The tokenizer is called directly: `encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments for a single text.
encoding = model.tokenizer(text,
                           add_special_tokens=True,
                           max_length=512,
                           truncation=True,
                           padding="longest",
                           return_attention_mask=True,
                           return_tensors="pt").to(device)
encoding_idxs, attention_mask = encoding.input_ids, encoding.attention_mask
input_shape = encoding_idxs.size()
# Build the extended attention mask with the project helper, from the
# tokenizer's attention mask and the input shape.
extended_attention_mask = get_extended_attention_mask(attention_mask,
                                                        input_shape,
                                                        model,
                                                        device)
# out_decomps, target_decomps, _ = prop_GPT(encoding_idxs, extended_attention_mask, model, source_list, target_nodes, mean_acts=mean_acts, set_irrel_to_mean=True, device=device)

# Circuit evaluation

In [5]:
from pyfunctions.faithfulness_ablations import logits_to_ave_logit_diff_2, add_mean_ablation_hook

# Evaluation set: 100 prompts with prompt_type="mixed".
ioi_dataset = IOIDataset(
    prompt_type="mixed",
    N=100,
    tokenizer=model.tokenizer,
    prepend_bos=False,
)

# Derive the ABC dataset by flipping IO, then S, then S1 to random names,
# applying the flips one after another in that order.
abc_dataset = ioi_dataset
for flip_spec in (("IO", "RAND"), ("S", "RAND"), ("S1", "RAND")):
    abc_dataset = abc_dataset.gen_flipped_prompts(flip_spec)

In [6]:
# Baseline: remove every hook (permanent ones included) before evaluating.
model.reset_hooks(including_permanent=True)

# Single forward pass over the entire dataset along the batch dimension.
clean_tokens = ioi_dataset.toks
logits, cache = model.run_with_cache(clean_tokens)

# Average logit difference as computed by the project helper.
ave_logit_diff = logits_to_ave_logit_diff_2(logits, ioi_dataset)
print(ave_logit_diff)

tensor(3.6853, device='cuda:0')


In [7]:
# Start from a hook-free model, as the other ablation cells do, so that
# re-running this cell cannot stack a second set of ablation hooks on top of
# hooks left over from an earlier run.
model.reset_hooks(including_permanent=True)
model = add_mean_ablation_hook(model, means_dataset=abc_dataset)  # IOI paper's circuit, by default
# Run on the entire dataset along the batch dimension.
logits, cache = model.run_with_cache(ioi_dataset.toks)
ave_logit_diff = logits_to_ave_logit_diff_2(logits, ioi_dataset)
print(ave_logit_diff)

tensor(3.2721, device='cuda:0')


In [8]:
'''
This one does better than the full model very consistently, which is kind of alarming.
Wasn't ablating away the IOI dataset information supposed to destroy information, thereby resulting in a worse performance?
'''
# NOTE(review): here the means come from the same ioi_dataset that is being
# evaluated, so the mean activations may still carry task-relevant signal --
# a possible explanation for the result above; confirm.

# Drop all existing hooks, then ablate with means taken from ioi_dataset itself.
model.reset_hooks(including_permanent=True)
model = add_mean_ablation_hook(model, means_dataset=ioi_dataset)

# Forward pass over the entire dataset along the batch dimension.
eval_tokens = ioi_dataset.toks
logits, cache = model.run_with_cache(eval_tokens)
ave_logit_diff = logits_to_ave_logit_diff_2(logits, ioi_dataset)
print(ave_logit_diff)

tensor(4.0973, device='cuda:0')


In [None]:
# Check implementation of explicit index

# Attention heads, as (layer, head) pairs, grouped by role.
CIRCUIT = {
    "name mover": [(9, 9), (10, 0), (9, 6)],
    "backup name mover": [
        (10, 10), (10, 6), (10, 2), (10, 1),
        (11, 2), (9, 7), (9, 0), (11, 9),
    ],
    "negative name mover": [(10, 7), (11, 10)],
    "s2 inhibition": [(7, 3), (7, 9), (8, 6), (8, 10)],
    "induction": [(5, 5), (5, 8), (5, 9), (6, 9)],
    "duplicate token": [(0, 1), (0, 10), (3, 0)],
    "previous token": [(2, 2), (4, 11)],
}

# Key into ioi_dataset.word_idx giving the sequence position kept for each role.
SEQ_POS_TO_KEEP = {
    "name mover": "end",
    "backup name mover": "end",
    "negative name mover": "end",
    "s2 inhibition": "end",
    "induction": "S2",
    "duplicate token": "S2",
    "previous token": "S+1",
}

# Expand each (layer, head) pair into a Node, using the position looked up
# for the first prompt of the dataset (index 0).
nodes = []
for role, heads in CIRCUIT.items():
    first_prompt_pos = ioi_dataset.word_idx[SEQ_POS_TO_KEEP[role]].numpy()[0]
    nodes.extend(Node(layer, first_prompt_pos, head) for layer, head in heads)

# you can now run this circuit instead of the IOI circuit in the usual way on ioi_dataset[0] and observe that it achieves the same performance

In [10]:

from pyfunctions.ioi_dataset import ABC_TEMPLATES, BAC_TEMPLATES, BABA_TEMPLATES, BABA_LONG_TEMPLATES, BABA_LATE_IOS, BABA_EARLY_IOS, ABBA_TEMPLATES, ABBA_LATE_IOS, ABBA_EARLY_IOS

# Restrict the dataset to the first BABA template.
template = BABA_TEMPLATES[0]
ioi_dataset = IOIDataset(
    prompt_type=[template],
    N=100,
    tokenizer=model.tokenizer,
    prepend_bos=False,
)

# ABC counterpart: flip IO, then S, then S1 to random names, in that order.
abc_dataset = ioi_dataset
for flip_spec in (("IO", "RAND"), ("S", "RAND"), ("S1", "RAND")):
    abc_dataset = abc_dataset.gen_flipped_prompts(flip_spec)

# Perform analysis only for the template that this thing was formed on.
# Each entry is (layer_idx, sequence_idx, attn_head_idx).
circuit = [
    Node(layer_idx=layer, sequence_idx=position, attn_head_idx=head)
    for layer, position, head in [
        (8, 14, 6), (8, 11, 6), (9, 14, 9), (9, 14, 6),
        (5, 10, 5), (7, 11, 9), (6, 10, 9), (6, 11, 0),
        (5, 10, 9), (3, 10, 0), (4, 5, 11), (3, 5, 7),
        (3, 3, 6), (2, 3, 2), (2, 3, 9), (1, 3, 7),
        (1, 3, 10), (0, 2, 1), (0, 2, 4),
    ]
]

# Reset hooks, install the ablation hook (ABC means, this circuit), then run
# on the entire dataset along the batch dimension.
model.reset_hooks(including_permanent=True)
model = add_mean_ablation_hook(model, means_dataset=abc_dataset, circuit=circuit)
eval_tokens = ioi_dataset.toks
logits, cache = model.run_with_cache(eval_tokens)
ave_logit_diff = logits_to_ave_logit_diff_2(logits, ioi_dataset)
print(ave_logit_diff)

tensor(-1.1722, device='cuda:0')


In [17]:

from pyfunctions.ioi_dataset import ABC_TEMPLATES, BAC_TEMPLATES, BABA_TEMPLATES, BABA_LONG_TEMPLATES, BABA_LATE_IOS, BABA_EARLY_IOS, ABBA_TEMPLATES, ABBA_LATE_IOS, ABBA_EARLY_IOS

# Restrict the dataset to the first ABBA template.
template = ABBA_TEMPLATES[0]
ioi_dataset = IOIDataset(
    prompt_type=[template],
    N=100,
    tokenizer=model.tokenizer,
    prepend_bos=False,
)

# ABC counterpart: flip IO, then S, then S1 to random names, in that order.
abc_dataset = ioi_dataset
for flip_spec in (("IO", "RAND"), ("S", "RAND"), ("S1", "RAND")):
    abc_dataset = abc_dataset.gen_flipped_prompts(flip_spec)

# Perform analysis only for the template that this thing was formed on.
# Each triple holds Node's positional arguments in order; elsewhere in this
# notebook they are named layer_idx, sequence_idx, attn_head_idx.
circuit = [
    Node(*node_args)
    for node_args in (
        (9, 14, 9), (9, 14, 6), (10, 14, 0),
        (8, 14, 10), (7, 14, 9), (7, 14, 3),
        (5, 10, 5), (5, 10, 8), (5, 10, 9),
        (0, 10, 10), (0, 10, 1), (3, 10, 0),
        (0, 5, 6), (0, 5, 7), (0, 5, 10),
    )
]

# Reset hooks, install the ablation hook (ABC means, this circuit), then run
# on the entire dataset along the batch dimension.
model.reset_hooks(including_permanent=True)
model = add_mean_ablation_hook(model, means_dataset=abc_dataset, circuit=circuit)
eval_tokens = ioi_dataset.toks
logits, cache = model.run_with_cache(eval_tokens)
ave_logit_diff = logits_to_ave_logit_diff_2(logits, ioi_dataset)
print(ave_logit_diff)

tensor(2.9411, device='cuda:0')


In [76]:

from pyfunctions.ioi_dataset import ABC_TEMPLATES, BAC_TEMPLATES, BABA_TEMPLATES, BABA_LONG_TEMPLATES, BABA_LATE_IOS, BABA_EARLY_IOS, ABBA_TEMPLATES, ABBA_LATE_IOS, ABBA_EARLY_IOS

# Restrict the dataset to the first BABA template.
template = BABA_TEMPLATES[0]
ioi_dataset = IOIDataset(
    prompt_type=[template],
    N=100,
    tokenizer=model.tokenizer,
    prepend_bos=False,
)

# ABC counterpart (flip IO, then S, then S1 to random names). It is rebuilt
# here but not passed to the ablation hook in this cell.
abc_dataset = ioi_dataset
for flip_spec in (("IO", "RAND"), ("S", "RAND"), ("S1", "RAND")):
    abc_dataset = abc_dataset.gen_flipped_prompts(flip_spec)

# Perform analysis only for the template that this thing was formed on.
# Each entry is (layer_idx, sequence_idx, attn_head_idx).
circuit = [
    Node(layer_idx=layer, sequence_idx=position, attn_head_idx=head)
    for layer, position, head in (
        (8, 14, 6), (8, 11, 6), (9, 14, 9), (9, 14, 6),
        (5, 10, 5), (7, 11, 9), (6, 10, 9), (6, 11, 0),
        (5, 10, 9), (3, 10, 0), (4, 5, 11), (3, 5, 7),
        (3, 3, 6), (2, 3, 2), (2, 3, 9), (1, 3, 7),
        (1, 3, 10), (0, 2, 1), (0, 2, 4),
    )
]

# Reset hooks, install the ablation hook (means taken from ioi_dataset itself,
# this circuit), then run on the entire dataset along the batch dimension.
model.reset_hooks(including_permanent=True)
model = add_mean_ablation_hook(model, means_dataset=ioi_dataset, circuit=circuit)
eval_tokens = ioi_dataset.toks
logits, cache = model.run_with_cache(eval_tokens)
ave_logit_diff = logits_to_ave_logit_diff_2(logits, ioi_dataset)
print(ave_logit_diff)

b
tensor(4.1236, device='cuda:0')


In [73]:
# Number of nodes in the current circuit (IOI paper uses 26).
n_circuit_nodes = len(circuit)
print(n_circuit_nodes)

19


In [74]:
# Each entry is (layer_idx, sequence_idx, attn_head_idx).
circuit = [
    Node(layer_idx=layer, sequence_idx=position, attn_head_idx=head)
    for layer, position, head in [
        (9, 14, 9), (9, 14, 6), (10, 14, 0),
        (9, 11, 9), (9, 11, 6), (9, 9, 6),
        (8, 3, 6), (8, 9, 3), (8, 3, 10),
        (7, 3, 9), (7, 3, 3),
        (6, 3, 4), (6, 3, 1), (6, 2, 4),
        (5, 3, 10), (5, 2, 10),
        (4, 3, 3), (4, 3, 11), (4, 2, 4), (4, 2, 7), (4, 2, 3),
        (3, 2, 5), (3, 2, 6), (3, 2, 2),
        (2, 2, 10), (2, 2, 7), (2, 2, 1),
        (1, 2, 7), (1, 2, 6), (1, 2, 3),
        (0, 2, 1), (0, 2, 4), (0, 2, 5),
    ]
]

# Reset hooks, install the ablation hook (means taken from ioi_dataset, this
# circuit), then run on the entire dataset along the batch dimension.
model.reset_hooks(including_permanent=True)
model = add_mean_ablation_hook(model, means_dataset=ioi_dataset, circuit=circuit)
eval_tokens = ioi_dataset.toks
logits, cache = model.run_with_cache(eval_tokens)
ave_logit_diff = logits_to_ave_logit_diff_2(logits, ioi_dataset)

# Report the circuit size first (IOI paper uses 26), then the metric.
n_circuit_nodes = len(circuit)
print(n_circuit_nodes)
print(ave_logit_diff)

b
tensor(1.6466, device='cuda:0')


In [67]:
def gpu_memory_stats(device_index=0):
    """Return GPU memory figures for one CUDA device.

    Args:
        device_index: Index of the CUDA device to query.

    Returns:
        A tuple ``(total, reserved, allocated)`` holding the values of
        ``torch.cuda.get_device_properties(device_index).total_memory``,
        ``torch.cuda.memory_reserved(device_index)`` and
        ``torch.cuda.memory_allocated(device_index)``, or ``None`` when CUDA
        is not available.
    """
    # This notebook falls back to the CPU when CUDA is missing; querying
    # torch.cuda in that situation would raise, so bail out early instead.
    if not torch.cuda.is_available():
        return None
    total = torch.cuda.get_device_properties(device_index).total_memory
    reserved = torch.cuda.memory_reserved(device_index)
    allocated = torch.cuda.memory_allocated(device_index)
    return total, reserved, allocated


stats = gpu_memory_stats(0)
if stats is None:
    print("CUDA is not available; no GPU memory statistics to report.")
else:
    # t: total device memory, r: memory reserved, a: memory allocated.
    t, r, a = stats
    print(t)
    print(r)
    print(a)

11546394624
6150946816
3359564288
