## Find out the L0 distribution for a sweep of SAEs

In [None]:
import os
import json
import torch as t
import pandas as pd
import matplotlib.pyplot as plt

from circuits.utils import (
    othello_hf_dataset_to_generator,
    get_model,
    get_submodule,
)

import sys
# NOTE(review): hard-coded, machine-specific base directory. It is also the
# prefix of the repo and sweep paths built later in this file.
home_path = '/share/u/can'
# The search path is extended *before* the dictionary_learning imports below;
# presumably that package is checked out under home_path — TODO confirm.
sys.path.append(home_path)
from dictionary_learning.dictionary import AutoEncoder, AutoEncoderNew, GatedAutoEncoder
from dictionary_learning.evaluation import evaluate

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs (more slowly) on machines without a GPU instead of
# failing as soon as something is moved to 'cuda:0'.
device = 'cuda:0' if t.cuda.is_available() else 'cpu'

In [None]:
# --- Model / dataset configuration ---
model_name = "Baidicoot/Othello-GPT-Transformer-Lens"
dataset_name = "taufeeque/othellogpt"
layer = 5
context_length = 59
activation_dim = 512  # output dimension of the layer
game_batch_size = 1000

# --- Model and the submodule selected by `layer` ---
model = get_model(model_name, device)
submodule = get_submodule(model_name, layer, model)

# --- Evaluation batch ---
# Stream the train split and take the first `game_batch_size` items from the
# generator, then place them on `device` as a single tensor.
data = othello_hf_dataset_to_generator(
    dataset_name,
    context_length=context_length,
    split="train",
    streaming=True,
)
game_batch = t.tensor(
    [next(data) for _ in range(game_batch_size)],
    device=device,
)
print(f'game_batch: {len(game_batch)}')

In [None]:
# SAE loading configuration.

# Locations derived from home_path: the project checkout and the directory
# that holds the sweep of trained SAEs.
ae_group_path = f'{home_path}/sweep_othello'
repo_dir = f'{home_path}/chess-gpt-circuits'

# Which autoencoder class load_ae() should instantiate:
# 'standard', 'gated' or 'standard_new'.
ae_type = 'standard'

def load_ae(ae_group_path, ae_type):
    """Load the pretrained autoencoder stored as ``ae.pt`` in a directory.

    Args:
        ae_group_path: Directory containing the ``ae.pt`` checkpoint.
        ae_type: One of ``'standard'``, ``'gated'`` or ``'standard_new'``;
            selects ``AutoEncoder``, ``GatedAutoEncoder`` or
            ``AutoEncoderNew`` respectively.

    Returns:
        Whatever the selected class's ``from_pretrained`` returns, loaded
        with the module-level ``device``.

    Raises:
        ValueError: If ``ae_type`` is not one of the supported values.
    """
    supported_types = ('standard', 'gated', 'standard_new')
    if ae_type not in supported_types:
        raise ValueError('Invalid ae_type')

    # Pick the class first so the checkpoint is loaded in exactly one place.
    if ae_type == 'standard':
        ae_class = AutoEncoder
    elif ae_type == 'gated':
        ae_class = GatedAutoEncoder
    else:
        ae_class = AutoEncoderNew

    checkpoint_path = f'{ae_group_path}/ae.pt'
    return ae_class.from_pretrained(checkpoint_path, device=device)


In [None]:
def compute_mean_l0(ae, submodule, game_batch):
    """Return the mean count of non-zero encoded features for a batch.

    Traces the module-level ``model`` on ``game_batch`` with gradients
    disabled, encodes ``submodule.output`` with ``ae`` and saves the result.
    Non-zero entries are then counted along the last dimension and the counts
    are averaged over all remaining dimensions.

    Args:
        ae: Autoencoder exposing ``encode``.
        submodule: Module whose ``output`` is read inside the trace.
        game_batch: Input passed to ``model.trace``.

    Returns:
        The result of ``.mean()`` over the per-entry non-zero counts
        (presumably a 0-d tensor — TODO confirm).
    """
    with t.no_grad(), model.trace(game_batch, scan=False, validate=False):
        activations = submodule.output
        features = ae.encode(activations).save()

    # L0 = number of active (non-zero) features along the last dimension.
    is_active = features != 0
    active_counts = is_active.float().sum(dim=-1)
    return active_counts.mean()

In [None]:
# Find all trainer folders in ae_group_path.
# Only real directories are kept, so a stray file whose name happens to
# contain 'trainer' is not mistaken for an SAE folder. The names are sorted
# because os.listdir returns entries in arbitrary order, which would otherwise
# make the row order of the saved CSV vary between runs.
ae_folders = sorted(
    name
    for name in os.listdir(ae_group_path)
    if 'trainer' in name and os.path.isdir(os.path.join(ae_group_path, name))
)
l0s = t.zeros(len(ae_folders))
sparsity_coefficients = t.zeros(len(ae_folders))

for i, ae_folder in enumerate(ae_folders):
    ae_dir = os.path.join(ae_group_path, ae_folder)
    ae = load_ae(ae_dir, ae_type)
    l0s[i] = compute_mean_l0(ae, submodule, game_batch)
    # Read the sparsity penalty recorded in this folder's config.json.
    with open(f'{ae_dir}/config.json', 'r') as config_file:
        config = json.load(config_file)
    sparsity_coefficients[i] = config['trainer']['sparsity_penalty']

# Pass NumPy arrays to pandas instead of relying on its implicit handling of
# torch tensors. Both tensors were allocated on the CPU by t.zeros above.
df = pd.DataFrame({
    'ae_folder': ae_folders,
    'l0': l0s.numpy(),
    'sparsity_coefficient': sparsity_coefficients.numpy(),
})
df.to_csv(f'{ae_group_path}/l0.csv')
df

In [None]:
# Order the sweep by sparsity coefficient and keep only the plotted columns.
xy = df.sort_values('sparsity_coefficient')[['sparsity_coefficient', 'l0']]

# Scatter L0 against the sparsity coefficient, save the figure next to the
# sweep, then display it.
fig, ax = plt.subplots()
ax.scatter(xy['sparsity_coefficient'], xy['l0'])
ax.set_xlabel('sparsity_coefficient')
ax.set_ylabel('L0')
ax.set_title('L0 distribution in sweep')
fig.savefig(f'{ae_group_path}/sparsitycoeff_vs_l0.png')
plt.show()