Finalizing TODOs
    First pass writing
    Fill in plots
    Iterate with Sam
    Do colab
    (Find circuits)
    Finish and submit


# Setup

In [5]:
import torch as t
import torch.nn.functional as F
from nnsight import LanguageModel

import sys
sys.path.append('..')
from dictionary_learning import AutoEncoder
from dictionary_learning.buffer import ActivationBuffer
import gc

import pandas as pd
import numpy as np
import umap
import plotly.express as px
import matplotlib.pyplot as plt

In [6]:
DEVICE = "cuda:0"
DEBUGGING = False

if DEBUGGING:
    tracer_kwargs = {'validate' : True, 'scan' : True}
else:
    tracer_kwargs = {'validate' : False, 'scan' : False}

model = LanguageModel(
    "EleutherAI/pythia-70m-deduped",
    device_map = DEVICE,
    dispatch = True,
)
model.config

GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-70m-deduped",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.35.2",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [None]:
# Load MLP submodules
D_MODEL = model.config.hidden_size
DICT_ID = 10
D_SAE = 64 * D_MODEL

# Load for chosen layers
LAYERS = [0, 1, 2, 3, 4, 5]

submodules = {f'mlp{l}': model.gpt_neox.layers[l].mlp for l in LAYERS}
dictionaries = {}
for l, name in zip(LAYERS, submodules.keys()):
    ae = AutoEncoder(D_MODEL, D_SAE).to(DEVICE)
    ae.load_state_dict(t.load(f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/mlp_out_layer{l}/{DICT_ID}_{D_SAE}/ae.pt'))
    dictionaries[f'mlp{l}'] = ae

# Filter out dead features

In [None]:
from zstandard import ZstdDecompressor
import json
import io

# Load data from the pile
data_path = '/share/data/datasets/pile/the-eye.eu/public/AI/pile/train/00.jsonl.zst'
compressed_file = open(data_path, 'rb')
dctx = ZstdDecompressor()
reader = dctx.stream_reader(compressed_file)
text_stream = io.TextIOWrapper(reader, encoding='utf-8')

def generator():
    for line in text_stream:
        yield json.loads(line)['text']
data_generator = generator()

def tokenized_batch(generator, batch_size, ctx_len):
    try:
        texts = [next(generator) for _ in range(batch_size)]
    except StopIteration:
        raise StopIteration("End of data stream reached")
    
    return model.tokenizer(
        texts,
        return_tensors='pt',
        max_length=ctx_len,
        truncation=True,
        padding=True
        )

# buffer = ActivationBuffer(
#     data,
#     model,
#     submodule,
#     submodule_input_dim=D_MODEL,
#     submodule_output_dim=D_SAE, # output dimension of the model component
#     n_ctxs=1e4, # you can set this higher or lower dependong on your available memory
#     ctx_len=128,
#     io='out', # buffer will return batches of tensors of dimension = submodule's output dimension
#     device='cuda:0' # doesn't have to be the same device that you train your autoencoder on
# ) # buffer will return batches of tensors of dimension = submodule's output dimension

In [None]:
BATCH_SIZE = 100
CTX_LEN = 100

cumulative_feature_act = {name: t.zeros((D_SAE), device='cpu') for name in submodules}

for i in range(100):
    input_batch = tokenized_batch(data_generator, batch_size=BATCH_SIZE, ctx_len=CTX_LEN).to(DEVICE)
    activation_cache = {}
    with t.no_grad(), model.trace(input_batch, **tracer_kwargs):
        # out = model.gpt_neox.layers[0].mlp.output.save()
        for name in submodules:
            x = submodules[name].output
            f = dictionaries[name].encode(x)
            activation_cache[name] = f.save() # [BATCH_SIZE, CTX_LEN, D_SAE]

    # The following loop causes a memory leak
    for name in submodules:
        sparse_acts = activation_cache[name].sum(dim=(0, 1)).to("cpu")
        cumulative_feature_act[name] += sparse_acts

In [None]:
nonzero_idxs = {}
for name in submodules:
    nonzero_idxs[name] = cumulative_feature_act[name].nonzero().flatten()
    percent = nonzero_idxs[name].shape[0] / D_SAE
    print(f"{name}: {nonzero_idxs[name].shape[0]} nonzero activations ({percent:.2f} % alive)")

decoders = {}
for name in submodules:
    decoders[name] = dictionaries[name].decoder.weight.T # [D_SAE, D_MODEL]
    decoders[name] = decoders[name][nonzero_idxs[name]]

# Cosine similarities of SAE decoder vectors

In [None]:
decoder_stack = t.vstack((decoders['mlp1'], decoders['mlp2'], decoders['mlp3'], decoders['mlp4'], decoders['mlp5']))
decoder_stack = F.normalize(decoder_stack, dim=1)

cosine_sim = decoder_stack @ decoder_stack.T
plt.imshow(cosine_sim.cpu().detach().numpy(), cmap='RdBu', interpolation='nearest', vmin=-1, vmax=1)
plt.colorbar()
plt.title("Cosine similarity between decoders")

In [None]:
decoder_mlp5 = F.normalize(decoders['mlp5'], dim=1)
cosine_sim = decoder_mlp5 @ decoder_mlp5.T
plt.imshow(cosine_sim.cpu().detach().numpy(), cmap='RdBu', interpolation='nearest', vmin=-1, vmax=1)
plt.colorbar()
plt.title("Cosine similarities of MLP5 decoder vectors")

# UMAP

In [None]:
def plot_umap(
        vectors0,
        vectors1 = None,
        labels = ['vectors0', 'vectors1'],
        hover_data0 = None,
        hover_data1 = None,
        highlight_features0 = [],
        highlight_features1 = [],
        title = 'UMAP embedding',
        # UMAP parameters
        n_neighbors=15,
        metric='cosine',
        min_dist=0.05,
        n_components=2, # dimension of the UMAP embedding,
        normalize = False,
):
    """
    Fit a UMAP embedding of the dictionary features and return a plotly plot of the result.
    vectorsX: sets of row vectors for UMAP embedding
    """
    if vectors1 is None:
        vectors_all = vectors0
    else:
        vectors_all = np.vstack([vectors0, vectors1])

    if normalize:
        vectors_all = vectors_all / np.linalg.norm(vectors_all, axis=1)[:, None]

    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        metric=metric,
        min_dist=min_dist,
        n_components=n_components,
    )
    embedding = reducer.fit_transform(vectors_all)

    # DEBUGGING: Create random data in shape of the embedding
    # embedding = np.random.rand(vectors_all.shape[0], n_components)


    df = pd.DataFrame()
    df['umapX'] = embedding[:, 0]
    df['umapY'] = embedding[:, 1]
    if n_components == 3:
        df['umapZ'] = embedding[:, 2]

    if vectors1 is None:
        df['hover_data'] = [f'feature {i}' for i in hover_data0]
        df['label'] = [f'highlight_{labels[0]}' if h0 in highlight_features0 else labels[0] for h0 in hover_data0]
    else:
        df['hover_data'] = [f'feature {i}' for i in t.hstack([hover_data0, hover_data1])]
        labels0 = [f'highlight_{labels[0]}' if h0 in highlight_features0 else labels[0] for h0 in hover_data0]
        labels1 = [f'highlight_{labels[1]}' if h1 in highlight_features1 else labels[1] for h1 in hover_data1]
        df['label'] = labels0 + labels1
        print(df.label.unique())

    if n_components == 2:
        fig = px.scatter(df, x='umapX', y='umapY', color='label', opacity=0.5, hover_data='hover_data')
    elif n_components == 3:
        fig = px.scatter_3d(df, x='umapX', y='umapY', z='umapZ', color='label', opacity=0.5, hover_data='hover_data')
    else:
        raise ValueError("n_components must be 2 or 3")
    fig.update_layout(title=title)
    return fig

In [None]:
# Umap for mlp_5 decoder
plot_umap(
    decoders["mlp5"].cpu().detach().numpy(), 
    labels = ["mlp5"],
    hover_data0 = [f'feature {i}' for i in nonzero_idxs["mlp5"]],
    title = "UMAP of mlp5_decoder",
    normalize = True
)

In [None]:
plot_umap(
    decoders["mlp4"].cpu().detach().numpy(), 
    decoders["mlp5"].cpu().detach().numpy(), 
    labels = ["mlp4", "mlp5"],
    hover_data0 = nonzero_idxs["mlp4"],
    hover_data1 = nonzero_idxs["mlp5"],
    highlight_features0=[20006],
    highlight_features1=[4015, 15980],
    title = "Joint UMAP of mlp4_decoder and mlp5_decoder",
    normalize = True,
)

In [None]:
# Umap for mlp_2 decoder
plot_umap(
    decoders['mlp2'].cpu().detach().numpy(), 
    labels = ["mlp2"],
    hover_data0 = [f'feature {i}' for i in nonzero_idxs["mlp2"]],
    title = "UMAP of mlp2_decoder",
    normalize = True
)

In [None]:
# Umap for mlp_3 decoder
plot_umap(
    decoders["mlp3"].cpu().detach().numpy(), 
    labels = ["mlp3"],
    hover_data0 = [f'feature {i}' for i in nonzero_idxs["mlp3"]],
    title = "UMAP of mlp3_decoder",
    normalize = True
)

In [None]:
plot_umap(
    decoders["mlp2"].cpu().detach().numpy(), 
    decoders["mlp3"].cpu().detach().numpy(), 
    labels = ["mlp2", "mlp3"],
    hover_data0 = nonzero_idxs["mlp2"],
    hover_data1 = nonzero_idxs["mlp3"],
    highlight_features0=[20006],
    highlight_features1=[4015, 15980],
    title = "Joint UMAP of mlp2_decoder and mlp3_decoder",
    normalize = True,
)

In [None]:
feats = """28138
25379
26215
21676
21618
16166
118
5235
3387
5160
5940
7047
27799
13281
30067
13420
19904
29973
"""

feats = [int(f) for f in feats.split('\n') if f]
print(f"Selected features: {feats}")

In [None]:
import webbrowser
def open_neuronpedia(feature_id: int, layer: int = 0):
    path_to_html = f"https://www.neuronpedia.org/pythia-70m-deduped/{layer}-mlp-sm/{feature_id}"

    print(f"Feature {feature_id}")
    webbrowser.open_new_tab(path_to_html)

for feature in feats:
    open_neuronpedia(feature, layer=5)

# Completeness of this set of features the ability to predict 

In [7]:
# Load submodules and dictionaries
embed = model.gpt_neox.embed_in
attns = [layer.attention for layer in model.gpt_neox.layers]
mlps = [layer.mlp for layer in model.gpt_neox.layers]
resids = [layer for layer in model.gpt_neox.layers]

dictionaries = {}
ae = AutoEncoder(D_MODEL, D_SAE).to(DEVICE)
ae.load_state_dict(t.load(f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/embed/{DICT_ID}_{D_SAE}/ae.pt'))
dictionaries[embed] = ae
for i in range(len(model.gpt_neox.layers)):
    ae = AutoEncoder(D_MODEL, D_SAE).to(DEVICE)
    ae.load_state_dict(t.load(f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/attn_out_layer{i}/{DICT_ID}_{D_SAE}/ae.pt'))
    dictionaries[attns[i]] = ae

    ae = AutoEncoder(D_MODEL, D_SAE).to(DEVICE)
    ae.load_state_dict(t.load(f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/mlp_out_layer{i}/{DICT_ID}_{D_SAE}/ae.pt'))
    dictionaries[mlps[i]] = ae

    ae = AutoEncoder(D_MODEL, D_SAE).to(DEVICE)
    ae.load_state_dict(t.load(f'/share/projects/dictionary_circuits/autoencoders/pythia-70m-deduped/resid_out_layer{i}/{DICT_ID}_{D_SAE}/ae.pt'))
    dictionaries[resids[i]] = ae

NameError: name 'D_MODEL' is not defined

In [26]:
prompts = """
An astronaut thought about an acorn growing into an oak tree in an invisible wire. An economist wrote an article about an apple that could make an espresso in various unexpected combinations. An ice cream vendor dreamed of an artifact at an unexpected depth. An envelope containing an unusual color was left on an empty seat in an altruist spell on an adventure.
An idol decided one day that an alien was necessary for an intergalactic bar. An unusual occurrence happened at an intersection involving an espresso in an empty theater. An actor rehearsed an awkward hour at an improbable depth. An owl in an apple rolled across an uneven table or hooted in an old shop.
An acrobat performed an incredible stunt on an old book laying open with an intriguing letter. An optometrist invented an orchestra that played an overture in an enlightened hall. An engineer on vacation found an umbrella or an altruist involved with an ice truck. An archaeologist dug up an array of peculiar stories on an old map.
An avatar repeated an action endlessly due to an unexpected error in an algorithm. An artist painted an apple with an elaborate process in an economic theory. An odd silence filled the room as an elusive fox disturbed an opera singer. An orchestra played without an audience, or an opera singer performed on an empty seat
"""

prompts = prompts.split(". ")
prompts = [p.strip("\n") for p in prompts if p]

tokenized_prompts = model.tokenizer(prompts, return_tensors='pt', max_length=64, padding=True)
model.tokenizer.encode(" an")

an_tokens = [" An", "An", " an", "an"] # The prompt was explicitly designed to contain these tokens only as articles
an_token_ids = [i[0] for i in model.tokenizer(an_tokens).input_ids]
an_token_ids

token_after_article_mask = t.zeros_like(tokenized_prompts['input_ids'])
for token_id in an_token_ids:
    token_after_article_mask += (tokenized_prompts['input_ids'] == token_id).int()
token_after_article_mask



tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
         0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
        [1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [27]:
# Metric for the normal forward pass of the model
with model.trace(tokenized_prompts, **tracer_kwargs) as trace:
    clean_logits = model.output.save()

clean_logits[0].shape

torch.Size([13, 39, 50304])

torch.Size([13, 39, 50304])

In [29]:
def perplexity_on_masked_tokens(logits, token_after_article_mask):
    # Calculate perplexity on tokens that follow the article token
    # Mask the tokens that follow the article token
    log_probs = t.log(F.softmax(logits, dim=-1))
    return log_probs[token_after_article_mask].sum()
    
perplexity_on_masked_tokens(clean_logits[0], token_after_article_mask)

tensor(-inf, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
# Which other features have a high attribution score for my metric