# Inspect SAE activations on OthelloGPT

Rico Angell trained a set of sparse autoencoders (SAEs) on the layer 5 residual stream output (`resid_post`) of OthelloGPT. Use this notebook to retrieve SAE feature activations.

Running this notebook requires cloning **THE COLLAB BRANCH** of Sam Marks' dictionary learning repo: https://github.com/saprmarks/dictionary_learning

In [None]:
import os

from huggingface_hub import hf_hub_download
from datasets import load_dataset
from transformer_lens import HookedTransformer
from nnsight import NNsight

import sys
sys.path.append('your-path-to-dictionary-learning-repo')
from dictionary_learning.buffer import NNsightActivationBuffer
from dictionary_learning.dictionary import AutoEncoder, AutoEncoderNew, GatedAutoEncoder

# Placeholder: set this to the root of your own experiments repo. The cells
# below read and write SAE checkpoints under f'{repo_dir}/autoencoders'.
repo_dir = 'your-path-to-own-experiments-repo'
# Torch device string; passed to the model and the activation buffer below.
device = 'cuda:0'

In [None]:
# Download the pretrained SAE archive from the Hugging Face Hub if it has not
# been extracted yet.
import zipfile

autoencoders_dir = f'{repo_dir}/autoencoders'
if not os.path.exists(f'{autoencoders_dir}/group-2024-05-17_othello'):
    # hf_hub_download returns the local path of the downloaded file.
    zip_path = hf_hub_download(
        repo_id='adamkarvonen/othello_saes',
        filename='group-2024-05-17_othello.zip',
        local_dir=autoencoders_dir,
    )
    # Extract into the same directory the existence check (and the SAE loading
    # cell) looks at, rather than an `autoencoders` folder relative to the
    # current working directory. Using zipfile also avoids depending on an
    # external `unzip` binary and surfaces extraction errors as exceptions.
    # NOTE(review): assumes the archive's top-level folder is
    # `group-2024-05-17_othello` -- confirm against the archive contents.
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(autoencoders_dir)

In [None]:
# Load a pretrained SAE checkpoint.
# `ae_type` selects the dictionary class; `trainer_id` selects the
# `trainer{trainer_id}` sub-directory of that type's checkpoint folder.
ae_type = 'standard_new'
trainer_id = 0

# Supported SAE variants and the class used to load each one.
ae_classes = {
    'standard': AutoEncoder,
    'gated': GatedAutoEncoder,
    'standard_new': AutoEncoderNew,
}
if ae_type not in ae_classes:
    raise ValueError('Invalid ae_type')

ae_path = f'{repo_dir}/autoencoders/group-2024-05-17_othello/group-2024-05-17_othello-{ae_type}/trainer{trainer_id}'
# Load onto the notebook-wide `device` instead of a hard-coded 'cuda:0', so
# changing the device setting in the first cell also moves the SAE.
ae = ae_classes[ae_type].from_pretrained(os.path.join(ae_path, 'ae.pt'), device=device)

In [None]:
def othello_hf_dataset_to_generator(
    dataset_name: str, context_length: int = 59, split="train", streaming=True
):
    """Return a generator over truncated token sequences from a HF dataset.

    The dataset is opened immediately via ``load_dataset``; examples are only
    read as the returned generator is consumed.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        context_length: Maximum number of leading tokens kept per example.
        split: Dataset split passed to ``load_dataset``.
        streaming: Forwarded to ``load_dataset``.

    Returns:
        A generator yielding ``example["tokens"][:context_length]`` for each
        example in the dataset.
    """
    examples = load_dataset(dataset_name, split=split, streaming=streaming)
    window = slice(context_length)

    def token_stream():
        for example in examples:
            yield example["tokens"][window]

    return token_stream()

In [None]:
# Load the model and wrap it with NNsight so activations can be read at the
# chosen submodule.
model_name = "Baidicoot/Othello-GPT-Transformer-Lens"
layer = 5

# Use `model_name` rather than repeating the literal, so the checkpoint only
# needs to be changed in one place.
tf_model = HookedTransformer.from_pretrained(model_name)
model = NNsight(tf_model).to(device)
# Activations are read at the residual-stream output hook of block `layer`.
submodule = model.blocks[layer].hook_resid_post

# Load the data as a generator of token sequences truncated to `context_length`.
context_length = 59
activation_dim = 512  # output dimension of the layer
dataset_name = "taufeeque/othellogpt"
data = othello_hf_dataset_to_generator(
    dataset_name, context_length=context_length, split="train", streaming=True
)
# Buffer that runs the model over `data` and serves activations taken from
# `submodule` (io="out").
# NOTE(review): n_ctxs is passed as the float 8e3 -- confirm the buffer does
# not require an int here.
buffer = NNsightActivationBuffer(
    data,
    model,
    submodule,
    n_ctxs=8e3,
    ctx_len=context_length,
    refresh_batch_size=128,
    io="out",
    d_submodule=activation_dim,
    device=device,
)

In [None]:
# Pull the next batch of model activations from the buffer.
# NOTE(review): presumably shaped (batch, activation_dim) -- confirm against
# NNsightActivationBuffer.
acts = next(buffer)
# Encode the activations with the SAE. As the last expression in the cell, the
# result is what the notebook displays.
ae.encode(acts)