In [1]:
# Server
import os 
# Process-wide settings: GPU ordering/visibility and the Hugging Face cache
# locations. They are written in the first cell, ahead of the torch /
# transformers imports in the following cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "7",
    "TRANSFORMERS_CACHE": "/ceph/aboehret/cache/transformers",
    "HF_DATASETS_CACHE": "/ceph/aboehret/cache/datasets",
})

# 1. Imports

In [5]:
import numpy as np
import torch as t
from nnsight import LanguageModel
import random
import json

# saprmarks/feature-circuits
from feature_circuits.activation_utils import SparseAct
from feature_circuits.attribution import patching_effect
from feature_circuits.loading_utils import load_examples

# saprmarks/dictionary-learning
from feature_circuits.dictionary_learning import AutoEncoder

# SAELens (pretrained sparse autoencoders for GPT-2)
from sae_lens.sae import SAE

# Feature Selection
from feature_selection import get_thres_features, get_topk_features, get_all_features

# Neuronpedia
import json
import urllib.parse
import webbrowser

In [6]:
# Use the GPU when torch reports one as available; otherwise fall back to CPU.
DEVICE = t.device("cuda") if t.cuda.is_available() else t.device("cpu")

# 2. LLMs, SAEs, Data

## 2.1 LLM

In [7]:
# Load the language model through nnsight and place it on DEVICE.
model_id = "openai-community/gpt2"
gpt_model = LanguageModel(model_id, device_map=DEVICE, dispatch=True)

# Hidden size as reported by the model's config.
D_GPT = gpt_model.config.hidden_size

## 2.2 SAE

In [8]:
# Load SAE

# Submodules: one entry per transformer block of the model.
resids_gpt = list(gpt_model.transformer.h)
submodules_gpt = resids_gpt

# Both maps are keyed by the submodule object itself:
#   dictionaries_gpt    -> the SAE loaded for that block
#   submodule_names_gpt -> a short label of the form 'resid_<layer>'
dictionaries_gpt = {}
submodule_names_gpt = {}

# Width of the SAE latent space; filled in from the config of the loaded SAEs.
D_GPT_SAE = None

# NOTE(review): the SAE ids below are the `hook_resid_pre` ones, while the
# dictionaries are keyed by whole transformer blocks. Whether patching_effect
# reads a block's input or its output is not visible here — confirm that the
# SAE for layer i is applied to the activation it was trained on.
for layer, resid in enumerate(resids_gpt):
    # Load the Sparse AutoEncoder (SAE) for the residual stream of this layer.
    # The SAE goes on the same device as the model instead of a hard-coded
    # "cuda:0", so the cell also runs on CPU-only machines.
    sae, original_cfg_dict, sparsity = SAE.from_pretrained(
        release="gpt2-small-res-jb",
        sae_id=f"blocks.{layer}.hook_resid_pre",  # For residual stream only
        device=str(DEVICE),
    )

    # Register the SAE and its display name under the block it belongs to.
    dictionaries_gpt[resid] = sae
    submodule_names_gpt[resid] = f'resid_{layer}'

    # Recorded inside the loop so the value does not rely on a loop variable
    # leaking out of the loop; the last layer's value is what remains.
    D_GPT_SAE = original_cfg_dict['d_sae']


This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)


In [9]:
D_GPT_SAE

24576

## 2.3 Dataset

In [10]:
# Dataset settings passed to load_examples.
num_examples = 50
length = 3
dataset = './data/names.json'

examples = load_examples(dataset, num_examples, gpt_model, length=length)

# Concatenate the per-example prefixes along dim 0 and move them to DEVICE.
# `m_*` is built from the 'clean_*' fields, `f_*` from the 'patch_*' fields.
m_inputs, f_inputs = (
    t.cat([example[field] for example in examples], dim=0).to(DEVICE)
    for field in ('clean_prefix', 'patch_prefix')
)

# Answer ids as long tensors created directly on DEVICE.
m_answer_idxs, f_answer_idxs = (
    t.tensor([example[field] for example in examples], dtype=t.long, device=DEVICE)
    for field in ('clean_answer', 'patch_answer')
)


# 3. Attribution patching

In [11]:
# Define metric - logit diff - GPT
def metric_fn_gpt(model, clean_answer_idxs, patch_answer_idxs):
    """Return the patch-answer logit minus the clean-answer logit per row.

    The logits are read from ``model.lm_head.output`` at index ``-1`` of the
    second axis (assumes a (batch, position, vocab) layout — TODO confirm).

    Args:
        model: Object exposing ``lm_head.output``.
        clean_answer_idxs: Integer tensor with one clean answer id per row.
        patch_answer_idxs: Integer tensor with one patch answer id per row.

    Returns:
        Tensor with one logit difference per row.
    """
    final_logits = model.lm_head.output[:, -1, :]

    def answer_logit(answer_idxs):
        # Pick, for every row, the logit at that row's answer id.
        picked = t.gather(final_logits, dim=-1, index=answer_idxs.view(-1, 1))
        return picked.squeeze(-1)

    return answer_logit(patch_answer_idxs) - answer_logit(clean_answer_idxs)

In [12]:
# Attribute the metric to SAE features for every submodule, going from the
# clean (m_*) inputs to the patch (f_*) inputs.
# method="ig" is presumably integrated gradients — confirm against
# feature_circuits.attribution.
gpt_effects = patching_effect(
    m_inputs,
    f_inputs,
    gpt_model,
    submodules_gpt,
    dictionaries_gpt,
    metric_fn=metric_fn_gpt,
    method="ig",
    metric_kwargs=dict(
        clean_answer_idxs=m_answer_idxs,
        patch_answer_idxs=f_answer_idxs,
    ),
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [13]:
# patching_effect's result unpacks into four items; only the last one is kept.
# NOTE(review): presumably the per-example total effect of the patch — confirm
# against feature_circuits.attribution.
_, _, _, total_g = gpt_effects
# Show the individual values, then their mean.
print(total_g)
print('Mean', total_g.mean())

tensor([ 1.3620,  1.8214,  1.0027,  1.2407,  0.3196, -0.0615,  1.0351,  0.9023,
         1.2059,  0.4982,  0.5997,  0.9434,  2.2439,  0.1985,  0.9752,  1.4090,
         1.3567, -0.0089,  0.0506,  1.9677,  1.8285,  1.9017,  2.2235, -0.2364,
         0.9286,  1.5553,  1.2649,  1.5764,  0.0510,  1.0388,  0.9724,  0.7769,
         1.0179,  0.5372,  1.9119,  2.2669,  0.6777,  1.6479,  0.2966,  0.2693,
         1.6358,  0.4645, -0.4090,  1.7394], device='cuda:0')
Mean tensor(1.0227, device='cuda:0')


# 4. Analysis & Interpretation

## 4.1 Most important features

In [14]:
def get_nodes(clean_effects, clean_inputs, submodules):
    """Reduce per-example effects to one score vector per submodule.

    Every value in ``clean_effects`` is summed over dim 1 and then averaged
    over dim 0.

    The previous implementation carried a multi-batch accumulator (weight by
    batch size, then divide by the running total), but it only ever processed
    one batch: the accumulation branch was unreachable and the weighting
    cancelled out. This version computes the same quantity directly.

    Args:
        clean_effects: Mapping from submodule to an effect object supporting
            ``.sum(dim=1)`` and ``.mean(dim=0)`` (e.g. a tensor).
        clean_inputs: Unused; kept so existing callers keep working.
        submodules: Unused; kept so existing callers keep working.

    Returns:
        dict with the same keys as ``clean_effects`` and the reduced effects
        as values.
    """
    with t.no_grad():
        return {
            submodule: effect.sum(dim=1).mean(dim=0)
            for submodule, effect in clean_effects.items()
        }

In [None]:
# One reduced effect per submodule, computed from the attribution effects.
gpt_nodes = get_nodes(
    clean_effects=gpt_effects.effects,
    clean_inputs=m_inputs,
    submodules=submodules_gpt,
)

In [16]:
# All non-zero nodes
all_gpt_features = get_all_features(gpt_nodes, submodule_names_gpt)
print(f"Total number of features with non-zero activation score: {sum(len(inner_dict) for inner_dict in all_gpt_features.values())}")

Total number of features with non-zero activation score: 263023


In [15]:
# Threshold features
gpt_thres = get_thres_features(gpt_nodes, threshold=0.1, submodule_names=submodule_names_gpt)
print(f"Total number of features with activation score above threshold: {sum(len(inner_dict) for inner_dict in gpt_thres.values())}")

Total number of features with activation score above threshold: 48


In [30]:
# Top-n selection with n = 30, grouped per submodule name.
top30_gpt = get_topk_features(gpt_nodes, top_n=30, submodule_names=submodule_names_gpt)
print(top30_gpt)
# Total across all per-submodule dicts.
print(f"Count: {sum(map(len, top30_gpt.values()))}")

{'resid_0': {}, 'resid_1': {18756: 0.197266086935997, 5742: 0.17898690700531006, 16306: 0.17882031202316284}, 'resid_10': {11094: 0.3505690097808838, 23440: 0.27558112144470215, 20409: 0.20817120373249054, 5875: 0.17232660949230194, 13618: 0.15214762091636658}, 'resid_11': {4077: 0.4548037350177765, 13642: 0.26605644822120667, 5210: 0.2547995150089264, 8252: 0.15805856883525848, 24199: 0.141384094953537}, 'resid_2': {}, 'resid_3': {8216: 0.28700247406959534}, 'resid_4': {2911: 0.23512524366378784, 13416: 0.13550598919391632}, 'resid_5': {8578: 0.4652336835861206, 15506: 0.14001810550689697}, 'resid_6': {19260: 0.48122021555900574, 13066: 0.18763355910778046, 1545: 0.13741624355316162}, 'resid_7': {23247: 0.5228695869445801, 10619: 0.15829448401927948, 9058: 0.13447244465351105}, 'resid_8': {600: 0.42299896478652954, 15707: 0.23514270782470703, 1007: 0.16084255278110504}, 'resid_9': {7119: 0.3635866641998291, 14674: 0.2599882185459137, 20378: 0.1509484052658081}}
Count: 30


## 4.2 Visualise features

In [35]:
def transform_dict_gpt(data_dict, model_id):
    """Flatten a per-layer feature dict into a list of feature records.

    Each key of ``data_dict`` is split into its non-digit characters (the
    component prefix) and its digit characters (the layer number). The prefix
    ``'resid_'`` is translated to ``'res-jb'``; any other prefix is used as is.

    Args:
        data_dict: Mapping such as ``{'resid_3': {8216: 0.28, ...}, ...}``;
            only the keys of the inner mappings are used.
        model_id: Value stored under ``"modelId"`` in every record.

    Returns:
        list of dicts with the keys ``"modelId"``, ``"layer"`` (formatted as
        ``"<number>-<prefix>"``) and ``"index"`` (the feature index as a
        string), in the iteration order of ``data_dict``.
    """
    layer_mapping = {'resid_': 'res-jb'}

    records = []
    for component_name, features in data_dict.items():
        # Separate the letters/underscores from the layer number.
        name_part = ''.join(ch for ch in component_name if not ch.isdigit())
        number_part = ''.join(filter(str.isdigit, component_name))
        layer_id = f"{number_part}-{layer_mapping.get(name_part, name_part)}"

        # One record per feature index found for this layer.
        records.extend(
            {"modelId": model_id, "layer": layer_id, "index": str(feature_idx)}
            for feature_idx in features.keys()
        )

    return records

In [24]:
import urllib.parse
import webbrowser
import json

def get_neuronpedia_quicklist(features_list, list_name="Gender features"):
    """Build a Neuronpedia quick-list URL, print it, and open it in a browser.

    Args:
        features_list: JSON-serialisable list of feature records (dicts with
            ``"modelId"``, ``"layer"`` and ``"index"``), as produced by
            ``transform_dict_gpt``.
        list_name: Title of the quick list. Defaults to ``"Gender features"``,
            the name that was previously hard-coded.

    The URL is always printed, so it can be copied by hand on a machine where
    no browser can be opened.
    """
    # Both query values are percent-encoded; the feature list travels as JSON.
    encoded_name = urllib.parse.quote(list_name)
    encoded_features = urllib.parse.quote(json.dumps(features_list))
    url = (
        "https://neuronpedia.org/quick-list/"
        + "?name=" + encoded_name
        + "&features=" + encoded_features
    )

    print("Opening: " + url)
    webbrowser.open(url)

In [36]:
# Turn the thresholded features into Neuronpedia records, show them, and
# open the corresponding quick list.
gpt_features = transform_dict_gpt(data_dict=gpt_thres, model_id='gpt2-small')
print(gpt_features)
get_neuronpedia_quicklist(features_list=gpt_features)

[{'modelId': 'gpt2-small', 'layer': '0-res-jb', 'index': '3161'}, {'modelId': 'gpt2-small', 'layer': '0-res-jb', 'index': '10037'}, {'modelId': 'gpt2-small', 'layer': '1-res-jb', 'index': '195'}, {'modelId': 'gpt2-small', 'layer': '1-res-jb', 'index': '5742'}, {'modelId': 'gpt2-small', 'layer': '1-res-jb', 'index': '7216'}, {'modelId': 'gpt2-small', 'layer': '1-res-jb', 'index': '16306'}, {'modelId': 'gpt2-small', 'layer': '1-res-jb', 'index': '18186'}, {'modelId': 'gpt2-small', 'layer': '1-res-jb', 'index': '18756'}, {'modelId': 'gpt2-small', 'layer': '1-res-jb', 'index': '23465'}, {'modelId': 'gpt2-small', 'layer': '2-res-jb', 'index': '5557'}, {'modelId': 'gpt2-small', 'layer': '3-res-jb', 'index': '8216'}, {'modelId': 'gpt2-small', 'layer': '4-res-jb', 'index': '1544'}, {'modelId': 'gpt2-small', 'layer': '4-res-jb', 'index': '2911'}, {'modelId': 'gpt2-small', 'layer': '4-res-jb', 'index': '13416'}, {'modelId': 'gpt2-small', 'layer': '5-res-jb', 'index': '8578'}, {'modelId': 'gpt2-s

In [38]:
# Same as above, for the top-30 selection.
top30_gpt_feat = transform_dict_gpt(data_dict=top30_gpt, model_id='gpt2-small')
get_neuronpedia_quicklist(features_list=top30_gpt_feat)

Opening: https://neuronpedia.org/quick-list/?name=Gender%20features&features=%5B%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%221-res-jb%22%2C%20%22index%22%3A%20%2218756%22%7D%2C%20%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%221-res-jb%22%2C%20%22index%22%3A%20%225742%22%7D%2C%20%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%221-res-jb%22%2C%20%22index%22%3A%20%2216306%22%7D%2C%20%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%2210-res-jb%22%2C%20%22index%22%3A%20%2211094%22%7D%2C%20%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%2210-res-jb%22%2C%20%22index%22%3A%20%2223440%22%7D%2C%20%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%2210-res-jb%22%2C%20%22index%22%3A%20%2220409%22%7D%2C%20%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%2210-res-jb%22%2C%20%22index%22%3A%20%225875%22%7D%2C%20%7B%22modelId%22%3A%20%22gpt2-small%22%2C%20%22layer%22%3A%20%2210-res-jb%22%2C%20%22index%2