# Benchmarks

Benchmarks to help with design/architecture decisions of the lib.

## Setup

In [None]:
# Load IPython's autoreload extension; mode 2 picks up edits to imported modules live,
# so changes to the library under test are visible without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import gzip
import os
import shutil
import tempfile
import random
import numpy as np
import tqdm

import pandas as pd
import torch
from torch import Tensor
import time
from datasets import load_dataset
from transformer_lens import HookedTransformer

from sparse_autoencoder.autoencoder.model import SparseAutoencoder
from jaxtyping import Float
from sparse_autoencoder.activation_store.list_store import ListActivationStore
from sparse_autoencoder.activation_store.tensor_store import TensorActivationStore
from sparse_autoencoder.activation_store.disk_store import DiskActivationStore
from sparse_autoencoder.train import pipeline

## Activation Tensor Sizes

It's useful to know both the size of the activation tensors and how much they can be compressed.

In [None]:
# Stream the tokenized dataset and keep only the leading prompts as a single batch
dataset = load_dataset(
    "NeelNanda/c4-code-tokenized-2b",
    split="train",
    streaming=True,
)
last_prompt_idx = 24  # inclusive upper bound, i.e. the first 25 prompts are kept
first_batch = []
for idx, example in enumerate(dataset):
    if idx > last_prompt_idx:
        break
    first_batch.append(example["tokens"])

# Stack the per-prompt token lists into one tensor
first_batch = torch.tensor(first_batch)
f"Number of activations to store in this benchmark test: {first_batch.numel()}"

In [None]:
# Create the activations by running the source model once over the batch of prompts
src_model = HookedTransformer.from_pretrained("NeelNanda/GELU_1L512W_C4_Code")

# Inference only: disabling autograd avoids building a graph for the forward pass,
# which would otherwise hold on to extra memory for no benefit in this benchmark.
with torch.no_grad():
    logits, cache = src_model.run_with_cache(first_batch)

# Cast the cached MLP activations to half precision before measuring their size
activations = cache["blocks.0.mlp.hook_post"].half()
number_activations = activations.numel()

# Take the per-element size from the tensor itself rather than hard-coding 2 bytes, so
# the estimate stays correct if the dtype above is ever changed.
size_bytes_activations = number_activations * activations.element_size()
size_mb_activations = f"{size_bytes_activations / (10**6):.2f} MB"
f"With {number_activations} features at half precision, the features take up {size_mb_activations} of memory"

Next we try compressing on the disk (and find the impact is small so probably not worth it):

In [None]:
# Save to a scratch directory that is removed automatically when the block exits, so the
# (large) benchmark files don't accumulate in - or collide within - the shared system temp dir.
# Note: `temp_dir`, `temp_file` and `temp_file_gz` no longer exist on disk after this cell.
with tempfile.TemporaryDirectory() as temp_dir:
    temp_file = os.path.join(temp_dir, "temp.pt")
    temp_file_gz = temp_file + ".gz"
    torch.save(activations, temp_file)

    # Zip it
    with open(temp_file, "rb") as f_in, gzip.open(temp_file_gz, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Get the file size back (must happen before the directory is cleaned up)
    fs_bytes = os.path.getsize(temp_file_gz)

f"Compressed file size is {fs_bytes / (10**6):.2f} MB"

Now let's calculate assuming 8 billion activations:

In [None]:
# Inputs for the back-of-envelope storage estimate (see the message below for how
# each quantity is interpreted).
assumed_n_activation_batches = 8 * 10**9
assumed_n_activations_per_batch = 2048
uncompressed_size_per_activation = 2  # bytes per value at float16

# Total number of stored values, then total bytes
total_values = assumed_n_activation_batches * assumed_n_activations_per_batch
estimated_size = total_values * uncompressed_size_per_activation
f"With {assumed_n_activation_batches/10**9}B activations with {assumed_n_activations_per_batch} features, \
the estimated size is {estimated_size / (10**12):.2f} TB"

In [None]:
# Work out how many activations fit into a range of storage budgets
sizes_gb = [10, 50, 100, 300, 500, 1000]
activations_per_size = []
for size_gb in sizes_gb:
    size_bytes = size_gb * (10**9)
    activations_per_size.append(
        size_bytes / uncompressed_size_per_activation / assumed_n_activations_per_batch
    )

# Present the counts in millions, with thousands separators and an "M" suffix
table = pd.DataFrame({"Size (GB)": sizes_gb, "Activations": activations_per_size})
table["Activations"] = table["Activations"].apply(lambda n: f"{n / 10**6:,.0f}M")
table

VastAI systems often have quite a lot of HD space (e.g. 300GB) but available RAM is often smaller
(e.g. 50GB, and we need a reasonable amount left over for moving tensors around etc). This means that
we can store c. 5-10M activations on a typical instance in CPU RAM (sometimes 25M+), or 50-100M on
disk. Both seem like plenty!

Note that replenishing a buffer of cached activations once it is half used during training seems
like a lot of pain, considering that the improvement is likely marginal. In particular, if we also
randomly shuffle the prompts for the forward pass of the source model, the chance of two tokens
coming from the same/nearby prompts will be very small.

The conclusion is therefore that we do need some sort of buffer, as we can't store 40TB on disk
easily, and this buffer can be on disk or in RAM. It needs to store asynchronously (so it doesn't
block the forward pass), and it needs to be able to handle multiple simultaneous writes from e.g.
distributed GPUs. The best approaches here are probably (a) pre-allocating CPU RAM space with
`torch.empty`, or (b) writing asynchronously to disk.

## Dataset Fetching

## Getting Activations (Forward Pass)

## Activations Store

### Storage methods

#### Test Setup

In [None]:
# NOTE(review): this whole cell is commented out, so `num_items`, `num_neurons` and
# `dummy_activations` are not defined when the notebook is run top to bottom.

# num_items: int = 1_000_000
# num_neurons: int = 2048


# def create_dummy_activations(
#     n_items: int, n_neurons: int
# ) -> list[Float[Tensor, "batch neurons"]]:
#     """Create Dummy Activations for Benchmarks."""
#     batch_size = 1_000
#     n_batches = int(n_items // batch_size)
#     activations = [torch.rand(batch_size, n_neurons) for _ in range(n_batches)]
#     return activations


# dummy_activations = create_dummy_activations(num_items, num_neurons)
# dummy_activations[0].shape

In [None]:
# NOTE(review): disabled cell - `benchmarks_to_run` is never defined at runtime. If re-enabled,
# the "GPU Tensor" entry requests torch.device("cuda"), so it presumably needs a CUDA device
# to be available - confirm before running.

# benchmarks_to_run = {
#     "GPU Tensor": TensorActivationStore(
#         max_items=num_items, num_neurons=num_neurons, device=torch.device("cuda")
#     ),
#     "CPU Tensor": TensorActivationStore(
#         max_items=num_items, num_neurons=num_neurons, device=torch.device("cpu")
#     ),
#     "CPU List, No Multiprocessing": ListActivationStore(),
#     "CPU List, Multiprocessing (multiple GPUs)": ListActivationStore(
#         multiprocessing_enabled=True
#     ),
#     "Disk": DiskActivationStore(empty_dir=True, max_cache_size=100_000),
# }

#### Write Tests

Note that where multiprocessing is enabled, this incurs a large time cost with no significant
benefit realised here. With multiple GPUs, however, it may be faster.

In [None]:
# NOTE(review): disabled cell. As written it would, for each store in `benchmarks_to_run`,
# empty the store, time `store.extend(batch)` over every batch in `dummy_activations`
# (waiting for pending writes where the store exposes `wait_for_writes_to_complete`),
# and then scale the measured time to 10**10 items, expressed in hours.

# results = {}

# for name, store in tqdm.tqdm(benchmarks_to_run.items()):
#     store.empty()
#     start_time = time.time()
#     for batch in dummy_activations:
#         store.extend(batch)
#     if hasattr(store, "wait_for_writes_to_complete"):
#         store.wait_for_writes_to_complete()
#     end_time = time.time()
#     results[name] = end_time - start_time

# df = pd.DataFrame(results, index=["Time (s)"]).T
# df["Time 10B (h estimate)"] = df["Time (s)"] * 10**10 / num_items / 3600
# df

## Learning