In [None]:
import os
import zipfile

import torch as t
from huggingface_hub import hf_hub_download

from circuits.dictionary_learning.buffer import NNsightActivationBuffer
from circuits.dictionary_learning.dictionary import AutoEncoder, GatedAutoEncoder, AutoEncoderNew, IdentityDict
from circuits.utils import (
    othello_hf_dataset_to_generator,
    get_model,
)

In [None]:
# Run configuration shared by every cell below.
layer = 5  # index into `model.blocks` selecting which transformer block is hooked
transcoder = False  # True: hook the MLP module with io="in"; False: hook `hook_mlp_out` with io="out"
device = 'cuda:0'
# Root directory that holds the `autoencoders/` folder. Set the OTHELLO_REPO_DIR
# environment variable to point at a different checkout; the original path is
# kept as the default so existing setups behave exactly as before.
repo_dir = os.environ.get('OTHELLO_REPO_DIR', '/share/u/can/OthelloUnderstanding')

In [None]:
# --- Model and dataset settings ---
model_name = "Baidicoot/Othello-GPT-Transformer-Lens"
dataset_name = "taufeeque/othellogpt"
context_length = 59
activation_dim = 512  # passed to the buffer as `d_submodule`

# --- Buffer and batching settings ---
buffer_size = 30_000 // 4  # number of contexts held in the buffer (7500)
llm_batch_size = 128  # 256 for A100 GPU, 64 for 1080ti
sae_batch_size = 8192
num_tokens = 200_000_000

# --- Build the model and the streaming training data ---
model = get_model(model_name, device)
data = othello_hf_dataset_to_generator(
    dataset_name,
    context_length=context_length,
    split="train",
    streaming=True,
)

# --- Choose the hook point and which side of it to read ---
if transcoder:
    # Transcoder setting: read the input side of the MLP module.
    io, submodule = "in", model.blocks[layer].mlp
else:
    # Default setting: read the output side of the MLP-out hook.
    io, submodule = "out", model.blocks[layer].hook_mlp_out
    # Other hook points that can be swapped in here:
    #   model.blocks[layer].hook_resid_post
    #   model.blocks[layer].mlp.hook_post

# --- Activation buffer that feeds batches of activations to the evaluation ---
activation_buffer = NNsightActivationBuffer(
    data,
    model,
    submodule,
    io=io,
    d_submodule=activation_dim,
    n_ctxs=buffer_size,
    ctx_len=context_length,
    refresh_batch_size=llm_batch_size,
    out_batch_size=sae_batch_size,
    device=device,
)

In [None]:
# Select, fetch (if missing) and load the dictionary that will be evaluated.
# `node_type` picks which pretrained autoencoder group is used; switch by
# (un)commenting the assignment below.
node_type = "sae_feature"
# node_type = "mlp_neuron"

if node_type == "sae_feature":
    ae_group_name = 'othello_mlp_out_all_layers_panneal_0628'
    ae_type = 'p_anneal'
    trainer_id = 0
    ae_path = f'{repo_dir}/autoencoders/{ae_group_name}/layer_{layer}/trainer{trainer_id}'
elif node_type == "mlp_neuron":
    ae_group_name = 'othello_mlp_acts_identity_aes_lines' # with_lines
    ae_type = 'identity'
    ae_path = f'{repo_dir}/autoencoders/{ae_group_name}/layer_{layer}'
else:
    raise ValueError('Invalid node_type')

# Download the autoencoder group from the Hugging Face Hub if it is not on disk yet.
autoencoders_dir = f'{repo_dir}/autoencoders'
if not os.path.exists(f'{autoencoders_dir}/{ae_group_name}'):
    zip_path = hf_hub_download(
        repo_id='adamkarvonen/othello_saes',
        filename=f'{ae_group_name}.zip',
        local_dir=autoencoders_dir,
    )
    # Extract in-process instead of shelling out to `unzip`: this works with
    # paths containing spaces, needs no external binary, and raises on a
    # corrupt archive instead of failing silently.
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(autoencoders_dir)

# Map each supported `ae_type` to the class whose checkpoint format it uses.
pretrained_classes = {
    'standard': AutoEncoder,
    'p_anneal': AutoEncoder,
    'gated': GatedAutoEncoder,
    'gated_anneal': GatedAutoEncoder,
    'standard_new': AutoEncoderNew,
}

# Initialize the autoencoder on the notebook-wide `device` (previously a
# hard-coded 'cuda:0', which would silently diverge from the model's device
# whenever `device` was changed).
if ae_type in pretrained_classes:
    ae = pretrained_classes[ae_type].from_pretrained(os.path.join(ae_path, 'ae.pt'), device=device)
elif ae_type == 'identity':
    # The identity dictionary has no checkpoint to load.
    ae = IdentityDict()
else:
    raise ValueError('Invalid ae_type')

In [None]:
from circuits.dictionary_learning.evaluation import evaluate

# Run the evaluation of the loaded dictionary `ae` on activations served by
# `activation_buffer`. The keyword options are gathered first so the call
# itself stays short; `tracer_args` turns off both scanning and validation.
eval_options = {
    "max_len": context_length,
    "batch_size": sae_batch_size,
    "io": io,
    "tracer_args": {'scan': False, 'validate': False},
    "device": device,
}
eval_result = evaluate(ae, activation_buffer, **eval_options)

In [None]:
# Display the value returned by `evaluate(...)` as this cell's output.
eval_result

- learning rate
- (expansion factor)
- sparsity_penalty
