# 0. Imports

In [1]:
import numpy as np
import torch as t

from feature_selection import get_thres_features, get_topk_features, get_diff, get_sim
from feature_vis import transform_list, get_neuronpedia_quicklist
from load_models import ModelLoader

from feature_circuits.loading_utils import load_examples
from feature_circuits.attribution import patching_effect

# Target device for all input tensors: CUDA when available, otherwise the CPU.
DEVICE = t.device("cuda" if t.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


# 1. Model and SAE

In [2]:
# Specify model and dictionary path
model_name = "EleutherAI/pythia-70m-deduped"
dictionary_path = "../github/feature-circuits/dictionary_learning" # ADAPT: machine-specific relative path; point it at your local dictionary_learning folder

# Initialize the loader
loader = ModelLoader(model_name=model_name, dictionary_path=dictionary_path)

# Load the model, tokenizer, dictionaries, submodules, and names.
# These five names are notebook-level globals used by the cells below.
model, tokenizer, dictionaries, submodules, submodule_names = loader.load_model()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 2. Datasets

## 2.1 Names Dataset

Example: 
- male_prefixes = "John runs because", male_answer = "he"
- female_prefixes = "Anna runs because", female_answer = "she"

In [3]:
# Number of examples and `length` are forwarded to load_examples
# (length is presumably the prompt length in tokens; confirm in loading_utils).
num_examples = 50
length = 3

dataset_names = 'data/names.json'

examples_names = load_examples(dataset_names, num_examples, model, length=length)

# "clean" entries hold the male-name prompts/answers, "patch" entries the female ones.
m_inputs = t.cat(
    [example['clean_prefix'] for example in examples_names],
    dim=0,
).to(DEVICE)
f_inputs = t.cat(
    [example['patch_prefix'] for example in examples_names],
    dim=0,
).to(DEVICE)
m_answer_idxs = t.tensor(
    [example['clean_answer'] for example in examples_names],
    dtype=t.long,
    device=DEVICE,
)
f_answer_idxs = t.tensor(
    [example['patch_answer'] for example in examples_names],
    dtype=t.long,
    device=DEVICE,
)

## 2.2 Baseline Names

Example: 
- male_prefixes = "John runs because", male_answer = **"John"**
- female_prefixes = "Anna runs because", female_answer = **"Anna"**

In [4]:
# Same sample size and `length` as the names dataset, re-declared so this cell runs on its own.
num_examples = 50
length = 3

baseline_names = 'data/baseline_names.json'

baseline_examples = load_examples(baseline_names, num_examples, model, length=length)

# Baseline variant: the answers are the names themselves rather than pronouns.
mb_inputs = t.cat(
    [example['clean_prefix'] for example in baseline_examples],
    dim=0,
).to(DEVICE)
fb_inputs = t.cat(
    [example['patch_prefix'] for example in baseline_examples],
    dim=0,
).to(DEVICE)
mb_answer_idxs = t.tensor(
    [example['clean_answer'] for example in baseline_examples],
    dtype=t.long,
    device=DEVICE,
)
fb_answer_idxs = t.tensor(
    [example['patch_answer'] for example in baseline_examples],
    dtype=t.long,
    device=DEVICE,
)

# 3. Attribution patching

In [5]:
# Define metric - logit diff
def metric_fn(model, clean_answer_idxs, patch_answer_idxs):
    """Logit difference at the last position: patch answer minus clean answer.

    Args:
        model: object exposing ``model.embed_out.output``, indexed here as a
            3-D ``[batch, position, vocab]`` array.
        clean_answer_idxs: 1-D integer tensor with one vocabulary index per
            example for the clean answer.
        patch_answer_idxs: 1-D integer tensor with one vocabulary index per
            example for the patch answer.

    Returns:
        One value per example:
        ``logit[patch_answer] - logit[clean_answer]`` at the final position.
    """
    # Both gathers read the same final-position slice, so take it once.
    final_logits = model.embed_out.output[:, -1, :]
    patch_logit = t.gather(
        final_logits, dim=-1, index=patch_answer_idxs.view(-1, 1)
    ).squeeze(-1)
    clean_logit = t.gather(
        final_logits, dim=-1, index=clean_answer_idxs.view(-1, 1)
    ).squeeze(-1)
    return patch_logit - clean_logit


In [6]:
# clean_logit_diff
def run_clean_logit_diff(clean_prefixes, clean_answers, patch_answers):
    """Evaluate ``metric_fn`` on un-patched prompts with gradients disabled.

    Uses the notebook-level ``model`` and ``metric_fn``.

    Args:
        clean_prefixes: input passed to ``model.trace``.
        clean_answers: forwarded to ``metric_fn`` as ``clean_answer_idxs``.
        patch_answers: forwarded to ``metric_fn`` as ``patch_answer_idxs``.

    Returns:
        The saved metric values from the trace (one per example).
    """
    with t.no_grad():
        with model.trace(clean_prefixes):
            # .save() presumably keeps the value available after the trace exits; confirm in nnsight docs
            logit_diff = metric_fn(model, clean_answers, patch_answers).save()
    return logit_diff

## 3.1 Male Names

In [7]:
# Metric on the un-patched male prompts: female-answer logit minus male-answer logit
# (metric_fn returns patch minus clean), averaged over the examples.
clean_male = run_clean_logit_diff(clean_prefixes=m_inputs, clean_answers=m_answer_idxs, patch_answers=f_answer_idxs)
print(clean_male.mean())

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor(-1.9530, device='cuda:0')


In [8]:
# Male prompts are passed first and female prompts second; the metric kwargs
# mark the male answers as "clean" and the female answers as "patch".
# method="ig" is presumably integrated gradients; confirm in feature_circuits.attribution.
male_effects = patching_effect(
    m_inputs,
    f_inputs,
    model,
    submodules,
    dictionaries,
    metric_fn=metric_fn,
    method="ig",
    metric_kwargs={
        'clean_answer_idxs': m_answer_idxs,
        'patch_answer_idxs': f_answer_idxs,
    },
)

In [9]:
# patching_effect's result unpacks into four items; only the last one is used here.
# NOTE(review): assigning to `_` shadows IPython's last-output variable for the rest of the session.
_, _, _, n_total = male_effects
print(n_total)
print('Mean Total Effect:', n_total.mean())

tensor([1.7327, 2.3501, 2.4655, 1.6837, 1.8619, 0.8065, 3.3904, 3.4794, 2.0775,
        2.1647, 1.5011, 2.0901, 0.2847, 0.9474, 3.4915, 2.3105, 3.5099, 1.4438,
        4.1980, 1.4552, 1.9852, 0.5535, 1.5405, 2.5065, 2.3644, 0.9592, 4.0538,
        4.0045, 1.8123, 2.7058, 2.9171, 1.0980, 2.3359, 2.7927, 2.2844, 2.0223,
        3.1919, 3.3861, 2.2045, 1.6444, 1.2972, 3.2065, 3.3591, 2.5847, 1.6901,
        1.7651, 2.6559, 1.6400, 2.1608, 2.8781], device='cuda:0')
Mean Total Effect: tensor(2.2569, device='cuda:0')


## 3.2 Female Names

In [10]:
# Metric on the un-patched female prompts: male-answer logit minus female-answer logit
# (metric_fn returns patch minus clean), averaged over the examples.
clean_female = run_clean_logit_diff(clean_prefixes=f_inputs, clean_answers=f_answer_idxs, patch_answers=m_answer_idxs)
print(clean_female.mean())

tensor(-0.3039, device='cuda:0')


In [11]:
# Reverse direction: female prompts are passed first and male prompts second;
# the metric kwargs mark the female answers as "clean" and the male answers as "patch".
# method="ig" is presumably integrated gradients; confirm in feature_circuits.attribution.
female_effects = patching_effect(
    f_inputs,
    m_inputs,
    model,
    submodules,
    dictionaries,
    metric_fn=metric_fn,
    method="ig",
    metric_kwargs={
        'clean_answer_idxs': f_answer_idxs,
        'patch_answer_idxs': m_answer_idxs,
    },
)

In [12]:
# patching_effect's result unpacks into four items; only the last one is used here.
# NOTE(review): assigning to `_` shadows IPython's last-output variable for the rest of the session.
_, _, _, fn_total = female_effects
print(fn_total)
print('Mean Total Effect:', fn_total.mean())

tensor([1.7327, 2.3501, 2.4655, 1.6837, 1.8619, 0.8065, 3.3904, 3.4794, 2.0775,
        2.1647, 1.5011, 2.0901, 0.2847, 0.9474, 3.4915, 2.3105, 3.5099, 1.4438,
        4.1980, 1.4552, 1.9852, 0.5535, 1.5405, 2.5065, 2.3644, 0.9592, 4.0538,
        4.0045, 1.8123, 2.7058, 2.9171, 1.0980, 2.3359, 2.7927, 2.2844, 2.0223,
        3.1919, 3.3861, 2.2045, 1.6444, 1.2972, 3.2065, 3.3591, 2.5847, 1.6901,
        1.7651, 2.6559, 1.6400, 2.1608, 2.8781], device='cuda:0')
Mean Total Effect: tensor(2.2569, device='cuda:0')


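Same results as male: this is expected by construction rather than an empirical finding. Assuming the total effect is the metric on the patch input minus the metric on the clean input, swapping clean and patch inputs *and* swapping the two answer indices flips the sign twice, so the per-example total effect is identical in both directions. The printed means are consistent with this: 0.3039 − (−1.9530) = 2.2569. The equality alone therefore does not show that the two directions of the shift are treated similarly; the per-feature effects in Section 4 are needed for that comparison.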

## 3.3 Baseline Names

In [13]:
# Metric on the un-patched baseline prompts, where the answers are the names themselves:
# patch-name logit minus clean-name logit, averaged over the examples.
clean_baseline = run_clean_logit_diff(clean_prefixes=mb_inputs, clean_answers=mb_answer_idxs, patch_answers=fb_answer_idxs)
print(clean_baseline.mean())

tensor(-3.0346, device='cuda:0')


In [14]:
# Baseline run: same prompts as the names dataset, but the answers are the names
# themselves; the metric kwargs mark the mb_* answers as "clean" and the fb_* answers as "patch".
# method="ig" is presumably integrated gradients; confirm in feature_circuits.attribution.
baseline_effects = patching_effect(
    mb_inputs,
    fb_inputs,
    model,
    submodules,
    dictionaries,
    metric_fn=metric_fn,
    method="ig",
    metric_kwargs={
        'clean_answer_idxs': mb_answer_idxs,
        'patch_answer_idxs': fb_answer_idxs,
    },
)

In [15]:
# patching_effect's result unpacks into four items; only the last one is used here.
# The second print is the mean total effect (unlabelled, unlike the male/female cells).
_, _, _, b_total = baseline_effects
print(b_total)
print(b_total.mean())

tensor([4.1400, 4.4175, 4.0861, 3.2670, 3.1908, 3.3660, 3.9788, 4.8951, 2.5903,
        3.5933, 4.1086, 3.4479, 3.8282, 6.2942, 5.5321, 4.9976, 5.5574, 4.1132,
        5.3998, 2.9043, 2.7148, 3.0409, 3.0859, 3.6765, 2.8250, 5.9772, 5.6564,
        6.1344, 3.7196, 6.6122, 4.2739, 4.3126, 4.3051, 6.6014, 3.8149, 5.2384,
        4.2365, 4.4540, 5.3210, 2.9677, 3.8994, 3.0043, 5.2885, 5.5873, 5.7601,
        5.2866, 4.2810, 4.7424, 4.3400, 4.0096], device='cuda:0')
tensor(4.3775, device='cuda:0')


# 4. Feature Analysis and Interpretation

## 4.1 Feature Selection

### 4.1.0 Functions

In [16]:
# Per-submodule effects: summed over dim 1, then averaged over dim 0
def get_nodes(clean_effects, clean_inputs, submodules):
    """Collapse the effects of one batch into a single score object per submodule.

    For every entry of ``clean_effects`` the values are summed over dim 1 and
    then averaged over dim 0 (presumably dim 0 = examples and dim 1 = token
    positions; confirm against ``patching_effect``).

    Args:
        clean_effects: mapping from submodule key to an effects object that
            supports ``.sum(dim=...)``, ``.mean(dim=...)`` and multiplication /
            division by an int (e.g. a ``torch.Tensor``).
        clean_inputs: the batch the effects were computed on; only
            ``len(clean_inputs)`` is used.
        submodules: unused; kept so existing call sites keep working.

    Returns:
        dict with the same keys as ``clean_effects`` holding the reduced effects.
    """
    n_examples = len(clean_inputs)
    with t.no_grad():
        # The previous version carried an `else` branch that accumulated over
        # several batches, but it could never run because the accumulator was
        # always freshly None. Only the single-batch path is kept.
        weighted = {
            key: n_examples * effect.sum(dim=1).mean(dim=0)
            for key, effect in clean_effects.items()
        }
    # Scaling by the batch size and dividing it out again cancels for a single
    # batch; the arithmetic is kept so results stay bit-identical to before.
    return {key: value / n_examples for key, value in weighted.items()}

### 4.1.1 Male Names

In [17]:
# Reduce the male-name effects to one score per feature and submodule.
m_nodes = get_nodes(male_effects.effects, m_inputs, submodules)

# Features passing the helper's threshold, grouped by submodule name.
# NOTE(review): the printed values all have |score| > 0.1, so that appears to be the cut-off; confirm in feature_selection.
# NOTE(review): the message says "activation score", but the values come from the patching effects.
m_thres = get_thres_features(m_nodes, submodule_names)
print(f"Total number of features with activation score above threshold: {sum(len(inner_dict) for inner_dict in m_thres.values())}")
print(m_thres)

# Top-30 selection, also grouped by submodule name (presumably ranked by |score| across all submodules; confirm).
m_top30 = get_topk_features(m_nodes, submodule_names=submodule_names, top_n=30)
print(m_top30)

Total number of features with activation score above threshold: 19
{'attn_0': {31738: -0.1389819085597992}, 'attn_1': {}, 'attn_2': {27472: 0.49865493178367615}, 'attn_3': {2959: 0.303644597530365, 19128: 0.10656654089689255}, 'attn_4': {31101: 0.21799395978450775}, 'attn_5': {}, 'mlp_0': {}, 'mlp_1': {25018: 0.36824148893356323}, 'mlp_2': {}, 'mlp_3': {}, 'mlp_4': {}, 'mlp_5': {26689: -0.10129749029874802}, 'resid_0': {9651: 0.3808836340904236}, 'resid_1': {9877: 0.29394519329071045, 15963: 0.10944902151823044, 30248: 0.387736052274704}, 'resid_2': {1995: 0.1938902884721756, 29295: 0.3514541983604431}, 'resid_3': {19558: 0.5201192498207092, 27334: 0.24804769456386566}, 'resid_4': {12420: 0.708912193775177, 30220: 0.3863234221935272}, 'resid_5': {10643: 0.5425660014152527, 26074: 0.8665501475334167}}
{'attn_0': {31738: -0.1389819085597992, 25800: -0.08653032034635544, 2988: -0.07384327054023743, 19062: 0.06987809389829636}, 'attn_1': {}, 'attn_2': {27472: 0.49865493178367615}, 'attn_3'

### 4.1.2 Female Names

In [18]:
# Reduce the female-name effects to one score per feature and submodule.
f_nodes = get_nodes(female_effects.effects, f_inputs, submodules)

# Features passing the helper's threshold, grouped by submodule name.
# NOTE(review): the printed values all have |score| > 0.1, so that appears to be the cut-off; confirm in feature_selection.
# NOTE(review): the message says "activation score", but the values come from the patching effects.
f_thres = get_thres_features(f_nodes, submodule_names)
print(f"Total number of features with activation score above threshold: {sum(len(inner_dict) for inner_dict in f_thres.values())}")
print(f_thres)

# Top-30 selection, also grouped by submodule name (presumably ranked by |score| across all submodules; confirm).
f_top30 = get_topk_features(f_nodes, submodule_names=submodule_names, top_n=30)
print(f_top30)

Total number of features with activation score above threshold: 20
{'attn_0': {}, 'attn_1': {}, 'attn_2': {27472: 0.40433958172798157}, 'attn_3': {2959: 0.24591399729251862, 19128: 0.13039447367191315}, 'attn_4': {31101: 0.1600901186466217}, 'attn_5': {}, 'mlp_0': {16055: 0.10456226021051407}, 'mlp_1': {8522: -0.15091067552566528, 25018: 0.26036110520362854}, 'mlp_2': {}, 'mlp_3': {}, 'mlp_4': {}, 'mlp_5': {26689: -0.10205663740634918}, 'resid_0': {9651: 0.38835543394088745}, 'resid_1': {9877: 0.2894383668899536, 15963: 0.10367652028799057, 30248: 0.377223402261734}, 'resid_2': {1995: 0.18619796633720398, 29295: 0.3377988338470459}, 'resid_3': {19558: 0.4999954104423523, 27334: 0.2575014531612396}, 'resid_4': {12420: 0.6789379715919495, 30220: 0.4032552242279053}, 'resid_5': {10643: 0.5444259643554688, 26074: 0.8653365969657898}}
{'attn_0': {31738: 0.08613160252571106, 23693: 0.08561816066503525}, 'attn_1': {}, 'attn_2': {27472: 0.40433958172798157}, 'attn_3': {2959: 0.2459139972925186

### 4.1.3 Baseline

In [19]:
# Reduce the baseline effects to one score per feature and submodule.
b_nodes = get_nodes(baseline_effects.effects, mb_inputs, submodules)

# Features passing the helper's threshold, grouped by submodule name.
# NOTE(review): the printed values all have |score| > 0.1, so that appears to be the cut-off; confirm in feature_selection.
# NOTE(review): the message says "activation score", but the values come from the patching effects.
b_thres = get_thres_features(b_nodes, submodule_names)
print(f"Total number of features with activation score above threshold: {sum(len(inner_dict) for inner_dict in b_thres.values())}")
print(b_thres)

# Top-30 selection, also grouped by submodule name (presumably ranked by |score| across all submodules; confirm).
b_top30 = get_topk_features(b_nodes, submodule_names=submodule_names, top_n=30)
print(b_top30)

Total number of features with activation score above threshold: 19
{'attn_0': {2988: -0.10736305266618729, 15865: 0.10901418328285217, 19062: 0.14401990175247192, 23693: -0.10614116489887238, 25800: -0.14385369420051575, 29589: -0.2672913372516632, 29599: -0.17075666785240173, 31738: -0.2666606903076172}, 'attn_1': {}, 'attn_2': {27472: 0.13347101211547852}, 'attn_3': {}, 'attn_4': {}, 'attn_5': {}, 'mlp_0': {}, 'mlp_1': {7984: -0.1012866273522377, 8522: 0.1843944936990738, 14367: -0.2507795989513397}, 'mlp_2': {}, 'mlp_3': {}, 'mlp_4': {}, 'mlp_5': {}, 'resid_0': {9651: 0.1400410681962967}, 'resid_1': {9877: 0.10479801893234253, 30248: 0.10153563320636749}, 'resid_2': {29295: 0.11740456521511078}, 'resid_3': {19558: 0.13460884988307953}, 'resid_4': {12420: 0.16600289940834045}, 'resid_5': {26074: 0.17515209317207336}}
{'attn_0': {29589: -0.2672913372516632, 31738: -0.2666606903076172, 29599: -0.17075666785240173, 19062: 0.14401990175247192, 25800: -0.14385369420051575, 15865: 0.109014

## 4.2 Visualisation via Neuronpedia

In [20]:
# Convert the thresholded male-name features into Neuronpedia feature descriptors.
m_features = transform_list(m_thres, 'pythia-70m-deduped')
# Pass an explicit list name, as the female and baseline cells do, so the three
# quick lists can be told apart (the default name is just "Gender features").
get_neuronpedia_quicklist(m_features, "Gender features male")

Opening: https://neuronpedia.org/quick-list/?name=Gender%20features&features=%5B%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2231738%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%222-att-sm%22%2C%20%22index%22%3A%20%2227472%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%223-att-sm%22%2C%20%22index%22%3A%20%222959%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%223-att-sm%22%2C%20%22index%22%3A%20%2219128%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%224-att-sm%22%2C%20%22index%22%3A%20%2231101%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%221-mlp-sm%22%2C%20%22index%22%3A%20%2225018%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%225-mlp-sm%22%2C%20%22index%22%3A%20%2226689%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-ded

In [21]:
# Convert the thresholded female-name features into Neuronpedia feature descriptors
# and open them as a quick list named "Gender features female".
f_features = transform_list(f_thres, 'pythia-70m-deduped')
get_neuronpedia_quicklist(f_features, "Gender features female")

Opening: https://neuronpedia.org/quick-list/?name=Gender%20features%20female&features=%5B%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%222-att-sm%22%2C%20%22index%22%3A%20%2227472%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%223-att-sm%22%2C%20%22index%22%3A%20%222959%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%223-att-sm%22%2C%20%22index%22%3A%20%2219128%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%224-att-sm%22%2C%20%22index%22%3A%20%2231101%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-mlp-sm%22%2C%20%22index%22%3A%20%2216055%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%221-mlp-sm%22%2C%20%22index%22%3A%20%228522%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%221-mlp-sm%22%2C%20%22index%22%3A%20%2225018%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia

In [23]:
# Convert the thresholded baseline features into Neuronpedia feature descriptors
# and open them as a quick list named "Gender features baseline".
b_features = transform_list(b_thres, 'pythia-70m-deduped')
get_neuronpedia_quicklist(b_features, "Gender features baseline")

Opening: https://neuronpedia.org/quick-list/?name=Gender%20features%20baseline&features=%5B%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%222988%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2215865%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2219062%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2223693%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2225800%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2229589%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2229599%22%7D%2C%20%7B%22modelId%22%3A%20%22pyt

#### Interpretation

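**Top 30 clean = male features:**

- NOT = not considered a gender feature
- XX = look again, overlapping concepts
- AMB = candidate to present as an ambiguous feature
- *TODO: define the remaining labels used below (FP, MP, FW, FN, MN, and the leading X marker).*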

        [{'modelId': 'pythia-70m-deduped', 'layer': '5-res-sm', 'index': '26074'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '5-res-sm', 'index': '10643'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '5-res-sm', 'index': '31975'}, FW
        {'modelId': 'pythia-70m-deduped', 'layer': '4-res-sm', 'index': '12420'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '4-res-sm', 'index': '30220'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '3-res-sm', 'index': '19558'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '3-res-sm', 'index': '27334'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '2-att-sm', 'index': '27472'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '1-res-sm', 'index': '30248'}, end FN
        {'modelId': 'pythia-70m-deduped', 'layer': '1-res-sm', 'index': '9877'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '1-res-sm', 'index': '15963'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '9651'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '7972'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '2913'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '19522'}, MN token NOT
        {'modelId': 'pythia-70m-deduped', 'layer': '1-mlp-sm', 'index': '25018'}, end FN
        {'modelId': 'pythia-70m-deduped', 'layer': '2-res-sm', 'index': '29295'}, FP end FN XX
        {'modelId': 'pythia-70m-deduped', 'layer': '2-res-sm', 'index': '1995'}, FP + FW
        {'modelId': 'pythia-70m-deduped', 'layer': '3-att-sm', 'index': '2959'}, FP (AMB?)
        {'modelId': 'pythia-70m-deduped', 'layer': '3-att-sm', 'index': '19128'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '4-att-sm', 'index': '31101'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '0-att-sm', 'index': '31738'}, MN NOT
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-att-sm', 'index': '25800'}, NOT
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-att-sm', 'index': '2988'}, NOT
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-att-sm', 'index': '19062'}, NOT
        {'modelId': 'pythia-70m-deduped', 'layer': '5-mlp-sm', 'index': '26689'}, MAYBE !!!!
    X   {'modelId': 'pythia-70m-deduped', 'layer': '5-mlp-sm', 'index': '27658'}, NOT
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-mlp-sm', 'index': '30328'}, FN AMB
        {'modelId': 'pythia-70m-deduped', 'layer': '0-mlp-sm', 'index': '9566'}, FN AMB
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-mlp-sm', 'index': '9133'}] FN AMB

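**Top 30 clean = female features:**

- NOT = not considered a gender feature
- XX = look again, overlapping concepts
- AMB = candidate to present as an ambiguous feature
- *TODO: define the remaining labels used below (FP, MP, FW, FN, MN, and the leading X marker).*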

        {'modelId': 'pythia-70m-deduped', 'layer': '5-res-sm', 'index': '26074'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '5-res-sm', 'index': '10643'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '5-res-sm', 'index': '31975'}, FW
        {'modelId': 'pythia-70m-deduped', 'layer': '4-res-sm', 'index': '12420'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '4-res-sm', 'index': '30220'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '3-res-sm', 'index': '19558'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '3-res-sm', 'index': '27334'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '2-att-sm', 'index': '27472'}, FP + FW !!!
        {'modelId': 'pythia-70m-deduped', 'layer': '1-res-sm', 'index': '30248'}, end FN
        {'modelId': 'pythia-70m-deduped', 'layer': '1-res-sm', 'index': '9877'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '1-res-sm', 'index': '15963'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '9651'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '7972'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '2913'}, FN
        {'modelId': 'pythia-70m-deduped', 'layer': '0-res-sm', 'index': '19522'}, MN token NOT
        {'modelId': 'pythia-70m-deduped', 'layer': '1-mlp-sm', 'index': '25018'}, end FN
        {'modelId': 'pythia-70m-deduped', 'layer': '2-res-sm', 'index': '29295'}, FP end FN !!!
        {'modelId': 'pythia-70m-deduped', 'layer': '2-res-sm', 'index': '1995'}, FP + FW
        {'modelId': 'pythia-70m-deduped', 'layer': '3-att-sm', 'index': '2959'}, FP (AMB?)
        {'modelId': 'pythia-70m-deduped', 'layer': '3-att-sm', 'index': '19128'}, MP
        {'modelId': 'pythia-70m-deduped', 'layer': '4-att-sm', 'index': '31101'}, FP
        {'modelId': 'pythia-70m-deduped', 'layer': '0-att-sm', 'index': '31738'}, MN NOT
        {'modelId': 'pythia-70m-deduped', 'layer': '5-mlp-sm', 'index': '26689'}, MAYBE
        {'modelId': 'pythia-70m-deduped', 'layer': '0-mlp-sm', 'index': '9566'}, FN AMB

    X   {'modelId': 'pythia-70m-deduped', 'layer': '1-mlp-sm', 'index': '8522'}, NOT
    X   {'modelId': 'pythia-70m-deduped', 'layer': '1-mlp-sm', 'index': '14367'}, NOT
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-mlp-sm', 'index': '16055'}, MN AMB ?
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-mlp-sm', 'index': '20653'}, FN AMB ??
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-mlp-sm', 'index': '16856'}, No act
    X   {'modelId': 'pythia-70m-deduped', 'layer': '0-att-sm', 'index': '23693'} NOT

------

## 4.3 Comparison of Male Names to Baseline

### 4.3.1 Feature Differences

In [24]:
# Features that differ between the male and baseline top-30 selections, grouped by submodule.
# Presumably those present in only one of the two dicts: the printed 28 differing plus
# 2 x 16 shared features add up to the 60 entries of both top-30 lists. Confirm in feature_selection.
diff = get_diff(m_top30, b_top30)
print(f"Total number of differing features: {sum(len(inner_dict) for inner_dict in diff.values())}")

Total number of differing features: 28


In [25]:
# Open the differing features as a Neuronpedia quick list for manual inspection.
diff_feature = transform_list(diff, 'pythia-70m-deduped')
get_neuronpedia_quicklist(diff_feature, "Differing Features Top 30 Male-Baseline")

Opening: https://neuronpedia.org/quick-list/?name=Differing%20Features%20Top%2030%20Male-Baseline&features=%5B%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%227563%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2223693%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2229589%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2215865%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%220-att-sm%22%2C%20%22index%22%3A%20%2229599%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%223-att-sm%22%2C%20%22index%22%3A%20%2219128%22%7D%2C%20%7B%22modelId%22%3A%20%22pythia-70m-deduped%22%2C%20%22layer%22%3A%20%223-att-sm%22%2C%20%22index%22%3A%20%222959%22%7D%2C%20%7B%22mode

### 4.3.2 Similarities

Features that occur in the top-30 feature dictionaries of both datasets (male names and baseline).

In [26]:
# Features shared by the male and baseline top-30 selections, grouped by submodule
sim = get_sim(m_top30, b_top30)
print(f"Total number of features occuring in both dicts: {sum(len(inner_dict) for inner_dict in sim.values())}")

Total number of features occuring in both dicts: 16


In [None]:
# Open the shared features as a Neuronpedia quick list for manual inspection.
# NOTE(review): this cell has no execution count, so it was not run in the saved notebook.
sim_feature = transform_list(sim, 'pythia-70m-deduped')
get_neuronpedia_quicklist(sim_feature, "Similar Features Top 30 Male-Baseline")

Gender features:

    - 2-Att-27472
    - 3-Att-2959
    - 2-RES-29295
    - 3-RES-19558
    - 4-RES-12420
    - 4-RES-30220
    - 5-RES-26074
    - 5-RES-10643