In [None]:
from nnsight import LanguageModel
import torch
import matplotlib.pyplot as plt
import chess
import json
from tqdm import tqdm
from typing import Callable

from dictionary_learning import ActivationBuffer
from nanogpt_to_hf_transformers import NanogptTokenizer, convert_nanogpt_model
from dictionary_learning.utils import hf_dataset_to_generator
from dictionary_learning import AutoEncoder

from circuits.utils import get_ae_bundle, AutoEncoderBundle, get_feature

import chess_utils

I run this notebook on my laptop, which has an RTX 3050 with 4 GB of VRAM, and 64 GB of RAM.

I believe this notebook is actually more CPU bound than GPU bound, as my laptop runs this notebook in less time than a vast.ai RTX 3090.

Step 1: Load the model, dictionary, data, and activation buffers.

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): MODEL_PATH is not referenced by any other cell visible in this notebook;
# get_ae_bundle below is given the "../models/" directory instead.
MODEL_PATH = "../models/lichess_8layers_ckpt_no_optimizer.pt"
# Batch size handed to get_ae_bundle and used to slice the feature matrix in the next cell.
batch_size = 25

# Directories of the two sparse autoencoders that this notebook compares.
autoencoder1_path = "../autoencoders/group0/ef=4_lr=1e-03_l1=1e-01_layer=5/"
autoencoder2_path = "../autoencoders/group0/ef=8_lr=1e-04_l1=1e-03_layer=5/"

# chess_sae_test is 100MB of data, so no big deal to download it
data = hf_dataset_to_generator("adamkarvonen/chess_sae_test", streaming=False)

# Bundle for autoencoder 1; later cells read its .buffer, .ae and .dictionary_size attributes.
ae_bundle1 = get_ae_bundle(autoencoder1_path, DEVICE, data, batch_size, model_path="../models/")

Collect feature activations on total_inputs inputs.

In [None]:
# Fill a (total_inputs, dictionary_size) matrix with feature activations, one batch at a time.
total_inputs = 8000
assert total_inputs % batch_size == 0
num_iters = total_inputs // batch_size

features = torch.zeros((total_inputs, ae_bundle1.dictionary_size), device=DEVICE)
batch_starts = range(0, total_inputs, batch_size)
for batch_start in tqdm(batch_starts, total=num_iters, desc="Extracting features"):
    # Each call yields one (batch_size, dictionary_size) block of activations.
    batch_features = get_feature(ae_bundle1.buffer, ae_bundle1.ae, DEVICE)
    features[batch_start : batch_start + batch_size, :] = batch_features

A few plots about various statistics.

In [None]:
# Fraction of the collected inputs on which each feature is non-zero.
firing_rate_per_feature = (features != 0).float().sum(dim=0).cpu() / total_inputs

# Histogram of those per-feature firing rates.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(firing_rate_per_feature, bins=50, alpha=0.75, color="blue")
ax.set_title("Histogram of firing rates for features")
ax.set_xlabel("Probability")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
# Fraction of the dictionary's features that are non-zero on each input.
# Summing over the last axis counts the active features of one input, so the count must be
# normalised by the number of features (the size of that axis), not by the number of inputs.
num_features = features.shape[-1]
firing_rate_per_input = (features != 0).float().sum(dim=-1).cpu() / num_features

# Creating the histogram
plt.figure(figsize=(10, 6))
plt.hist(firing_rate_per_input, bins=50, alpha=0.75, color="blue")
plt.title("Percentage of features firing per input")
plt.xlabel("Probability")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

I got this from: https://colab.research.google.com/drive/19Qo9wj5rGLjb6KsB9NkKNJkMiHcQhLqo?usp=sharing#scrollTo=WZMhAzLTvw-u

In [None]:
# Per-feature firing probability: the fraction of inputs on which the feature is non-zero.
# Averaging the raw activations instead would give a mean activation magnitude, which is
# not a probability and does not match the labels of the plot below.
feat_prob = (features != 0).float().mean(0)
print(feat_prob.shape)
# The small epsilon keeps features that never fire finite on the log scale (they land at -10).
log_freq = (feat_prob + 1e-10).log10()
print(log_freq.shape)

log_freq_np = log_freq.cpu().numpy()

# Creating the histogram
plt.figure(figsize=(10, 6))
plt.hist(log_freq_np, bins=50, alpha=0.75, color="blue")
plt.title("Histogram of log10 of Feature Probabilities")
plt.xlabel("log10(Probability)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

Get the L0 statistic. Then, get a list of indices for features that fire between 0 and 50% of the time.

In [None]:
print(features.shape)
# L0: the average number of non-zero features per input.
l0 = (features != 0).float().sum(dim=-1).mean()
print(f"l0: {l0}")

# Fraction of inputs on which each feature is non-zero (kept on the same device as `features`).
firing_rate_per_feature = (features != 0).float().sum(dim=0) / total_inputs

assert firing_rate_per_feature.shape[0] == ae_bundle1.dictionary_size

# Keep features that fire at least once but on fewer than half of the inputs.
mask = (firing_rate_per_feature > 0) & (firing_rate_per_feature < 0.5)
# torch.nonzero returns one row per match with a single coordinate column. squeeze(-1)
# removes only that column, so idx stays 1-D even when exactly one feature matches
# (a bare squeeze() would collapse it to a 0-dim tensor and break idx.shape[0] / idx[:10]).
idx = torch.nonzero(mask, as_tuple=False).squeeze(-1)
print(idx.shape)
print(f"\n\nWe have {idx.shape[0]} features that fire between 0 and 50% of the time.")
print(idx[:10])

Next, we collect per dim stats, which include the top tokens it fires on, and the top k inputs and activations per input token.

Rough ballpark times on my RTX 3050: 

2000 dims, 1500 inputs, batch size 50 = 23 seconds

Note that I perform the activation processing on my CPU. This is comparable speed, but much lower VRAM usage.

The current state of the code uses around 30-40GB of RAM for n_dims == 6000, n_inputs == 5000. If we want to scale to more dims and / or inputs, the code should be refactored to "stream" rather than save intermediate results. I have TODOs for this in the code.

If you run out of RAM or VRAM, reduce n_inputs and max_dims.

In [None]:
import importlib
import chess_interp

# Re-import chess_interp so edits to the module are picked up without restarting the kernel.
importlib.reload(chess_interp)

# At most this many of the selected feature indices are examined (see idx[:max_dims] below).
max_dims = 2000
# Number of top inputs per feature that the analysis cells further down work with.
top_k = 30
# Passed to examine_dimension_chess as n_inputs.
n_inputs = 1500
# NOTE: this re-assigns the batch_size that was set in the configuration cell.
batch_size = 25


# k is top_k + 1. The analysis functions defined later in this notebook index
# activations[minimum_number_of_activations], and the cells below pass top_k for that
# argument, so presumably the extra entry exists to keep that index valid.
# Activation processing is requested on the CPU.
per_dim_stats1 = chess_interp.examine_dimension_chess(
    ae_bundle1,
    dims=idx[:max_dims],
    n_inputs=n_inputs,
    k=top_k + 1,
    batch_size=batch_size,
    processing_device="cpu",
)

In [None]:
# For every feature, take the final-position activation of its top input (max_acts) and of
# each of its top_k inputs (mean_acts), then print the range and the average of both lists.
max_acts = [stats["activations"][0][-1] for stats in per_dim_stats1.values()]
mean_acts = [
    stats["activations"][rank][-1]
    for stats in per_dim_stats1.values()
    for rank in range(top_k)
]

print(max(max_acts), min(max_acts))
print(max(mean_acts), min(mean_acts))
print(sum(max_acts) / len(max_acts))
print(sum(mean_acts) / len(mean_acts))

All of the above steps can be performed with `get_ae_stats()`, which we will do for the second autoencoder.

In [None]:
import circuits.sae_stats_collection
# Reload so edits to the module are picked up without restarting the kernel.
importlib.reload(circuits.sae_stats_collection)
from circuits.sae_stats_collection import get_ae_stats

# TODO getting the eval_results is broken. I think it's because I switched to SAEs on the residual stream, not the mlp output
# The residual stream returns a tuple of parameters, not a single parameter. This is just a guess though.
# Reuses max_dims, n_inputs, top_k and batch_size from the cells above, this time for autoencoder 2.
per_dim_stats2, eval_result2 = get_ae_stats(autoencoder2_path, max_dims, n_inputs, top_k, batch_size, DEVICE, "../models/")

As we can see below, autoencoder 2 has a terrible L0 of 1911. It has around 1500 features that fire between 0 and 50% of the time. It is expansion factor 8 on d_model == 512.

In [None]:
# Report how many features were kept for autoencoder 2, then each entry of its eval results.
num_kept_features = len(per_dim_stats2)
print(f"Number of features firing between 0 and 50% of the time: {num_kept_features}\n")

for metric_name, metric_value in eval_result2.items():
    print(f"{metric_name}: {metric_value}")

This cell looks at syntax related features. Specifically, it looks for features that always fire on a PGN "counting number". In this PGN, I've wrapped the "counting numbers" in brackets.

;<1.>e4 e5 <2.>Nf3 ...

We can easily analyze different syntax related attributes by just passing in a different syntax function, such as one that just finds space indices.

In [None]:
importlib.reload(chess_utils)
from pydantic import BaseModel
from chess_interp import initialize_feature_dictionary
from typing import Optional


class SyntaxResultsConfig(BaseModel):
    """Counters filled in by `syntax_analysis` for a single syntax function."""

    # Number of dimensions counted by the loop over per_dim_stats.
    dim_count: int = 0
    # Dimensions that passed the minimum-activation check.
    nonzero_count: int = 0
    # Dimensions whose top_k inputs all end on an index returned by the syntax function.
    syntax_match_idx_count: int = 0
    # Mean input length over the matching dimensions (accumulated, then divided by the match count).
    average_input_length: float = 0.0

# Originally copy pasted from chess_interp.py; the max_dims cut-off has been corrected here.
def syntax_analysis(
    per_dim_stats: dict,
    minimum_number_of_activations: int,
    top_k: int,
    max_dims: int,
    syntax_function: Callable,
    feature_dict: Optional[dict[int, list[dict]]] = None,
    notebook_usage: bool = False,
    verbose: bool = False,
) -> tuple[SyntaxResultsConfig, dict[int, list[dict]]]:
    """Count the features whose top activating inputs all end on a syntax position.

    A feature matches when, for each of its first ``top_k`` inputs, the index of the
    input's last character is among the indices returned by ``syntax_function``.

    Args:
        per_dim_stats: Mapping from feature dim to a dict holding at least the
            "decoded_tokens" and "activations" entries.
        minimum_number_of_activations: Index into a dim's "activations". The dim is
            skipped when the last value of that entry is 0.
        top_k: Number of top inputs per dim that must all match.
        max_dims: Maximum number of dims to examine, in iteration order.
        syntax_function: Called with the joined input string; must return a container
            of character indices that supports ``in``.
        feature_dict: Mapping from dim to a list of matched-filter records. Built with
            ``initialize_feature_dictionary(per_dim_stats)`` when omitted. It is
            mutated in place.
        notebook_usage: When True, print the inputs of every matching dim.
        verbose: When True, print a summary of the counts.

    Returns:
        The populated ``SyntaxResultsConfig`` and the updated ``feature_dict``.
    """
    if feature_dict is None:
        feature_dict = initialize_feature_dictionary(per_dim_stats)

    results = SyntaxResultsConfig()

    for dim in per_dim_stats:
        # Test the limit before counting, so that exactly max_dims dims are examined and
        # dim_count reports how many were actually looked at. (Counting first stopped one
        # dim early while still reporting max_dims.)
        if results.dim_count >= max_dims:
            break
        results.dim_count += 1

        decoded_tokens = per_dim_stats[dim]["decoded_tokens"]
        activations = per_dim_stats[dim]["activations"]
        # If the dim doesn't have at least min_num firing activations, skip it
        if activations[minimum_number_of_activations][-1].item() == 0:
            continue
        results.nonzero_count += 1

        inputs = ["".join(string) for string in decoded_tokens][:top_k]

        # The last token of each input carries the max activation for that context, so an
        # input counts as a match when its final character index is a syntax index.
        count = sum(1 for pgn in inputs if (len(pgn) - 1) in syntax_function(pgn))

        if count == top_k:
            if notebook_usage:
                for pgn in inputs:
                    print(pgn)
                print(f"All top {top_k} activations in dim: {dim} are on num indices")
            results.syntax_match_idx_count += 1
            results.average_input_length += sum(len(pgn) for pgn in inputs) / len(inputs)
            feature_dict[dim].append({"name": syntax_function.__name__})

    # Turn the accumulated per-dim averages into a mean over the matching dims.
    if results.syntax_match_idx_count > 0:
        results.average_input_length /= results.syntax_match_idx_count

    if verbose:
        print(
            f"Out of {results.dim_count} features, {results.nonzero_count} had at least {minimum_number_of_activations} activations."
        )
        print(
            f"{results.syntax_match_idx_count} features matched on all top {top_k} inputs for our syntax function {syntax_function.__name__}"
        )
        print(
            f"The average length of inputs of pattern matching features was {results.average_input_length:.2f}"
        )

    return results, feature_dict

The first 5 features in the second autoencoder are all syntax features. We can see them below.

Example:

;1.d4 d5 2.h3 c5 3.a3 Nc6 4.e3 e5 5.dxc5 Bxc5 6.b4 Bb6 7

;1.e4 e5 2.c3 d5 3.exd5 Qxd5 4.d4 exd4 5.cxd4 Bb4+ 6

;1.Nh3 e5 2.Na3 d5 3.e3 Bxa3 4.bxa3 Bxh3 5.gxh3 c5 6

;1.e4 e6 2.c4 d5 3.d3 dxc4 4.dxc4 Qxd1+ 5.Kxd1 Nc6 6

;1.d4 d6 2.c4 Nd7 3.Nc3 e5 4.d5 Ne7 5.e4 Ng6 6.Bd3 Be7 7

All top 10 activations in dim: 3 are on num indices

In [None]:
# Run the syntax analysis with a temporary top_k of 10. The restore sits in `finally` so the
# notebook-wide top_k is put back even if the analysis raises or the cell is interrupted;
# otherwise every later cell would silently run with top_k == 10.
prev_top_k = top_k
top_k = 10
try:
    syntax_analysis(per_dim_stats2, top_k, top_k, max_dims=5, syntax_function=chess_utils.find_num_indices, notebook_usage=True, verbose=True)
finally:
    top_k = prev_top_k

In contrast, on the first autoencoder, which has an L0 of 25, only three of the first 25 features match our `find_num_indices` pattern. But they are probably common opening features, as the top k inputs for each feature are identical, so it's debatable whether they are actually syntax matches. This is one of the challenges of automatic evaluations. Maybe enough heuristics would make this acceptable.

In [None]:
# Run the syntax analysis with a temporary top_k of 10. The restore sits in `finally` so the
# notebook-wide top_k is put back even if the analysis raises or the cell is interrupted;
# otherwise every later cell would silently run with top_k == 10.
prev_top_k = top_k
top_k = 10
try:
    syntax_analysis(per_dim_stats1, top_k, top_k, max_dims=25, syntax_function=chess_utils.find_num_indices, notebook_usage=True, verbose=True)
finally:
    top_k = prev_top_k

Now, we can also do programmatic analysis of board states at max activating input tokens. The procedure is the following:

At each max activating token in an input pgn string, convert the pgn string to a chess board object. Then we can find common board states, such as all inputs have this piece on this square, or all inputs have a pinned piece on the board.

In the case of common board states, we convert every chess board object to a one hot tensor of shape (8, 8, 13) or (rows, cols, num_options). Num options is (blank, white / black pawn, knight, bishop, rook, queen, king). Now, we have the chess boards tensor of shape (top_k, rows, cols, num_options).

So, we just look at every square in (rows, cols, num_options) and see if 100% of squares are 1. Any square with a 100% match is added to common_indices. For example, if every e4 contains a white pawn and every a2 contains a white rook, then we would have (3, 4, 6) and (1, 0, 2) or (4, e, white pawn) and (2, a, white rook).

Note that we also make a one hot tensor for the initial board state, and mask off all initial board state squares from every chess board tensor. If we didn't, any short pgn string would have many activations.

Note that this `board_analysis()` function takes a `Config`, which contains a function to convert a chess board object to a tensor. All of the above steps can be repeated for any Config object. For example, we have a `threat_config`, which makes a one hot tensor of shape (8, 8, 2) or (rows, cols, is_threatened), where is_threatened is if the square is threatened by an opponent. Or we can have a `pin_config`, which makes a one hot tensor of shape (1, 1, 2), which is active if a piece on the board is pinned to its king.

The masking also applies to shape configs like `pin_config`. If we didn't mask off the initial board state (with no pins), the vast majority of features would activate for `no pins on board`.

In [None]:
importlib.reload(chess_utils)
from chess_utils import Config, get_num_classes

from pydantic import BaseModel


class BoardResultsConfig(BaseModel):
    """Tallies that `board_analysis` keeps for one board-state config."""

    # Number of dimensions counted by the loop over per_dim_stats.
    dim_count: int = 0
    # Dimensions that passed the minimum-activation check.
    nonzero_count: int = 0
    # Dimensions with at least one common board state for this config.
    pattern_match_count: int = 0
    # Mean input length over matching dimensions (accumulated, then divided by the match count).
    total_average_length: float = 0.0
    # Common states found per matching dimension (accumulated, then divided by the match count).
    average_matches_per_dim: float = 0.0
    # Number of common states found for each class index.
    per_class_dict: dict[int, int]
    # Number of common states found on each square.
    board_tracker: list[list[int]]  # shape: (num_rows, num_cols)


# Originally copy pasted from chess_interp.py; the max_dims cut-off has been corrected here.
def board_analysis(
    per_dim_stats: dict,
    minimum_number_of_activations: int,
    top_k: int,
    max_dims: int,
    threshold: float,
    configs: list[Config],
    feature_dict: Optional[dict[int, list[dict]]] = None,
    device: str = "cpu",
    notebook_usage: bool = False,
    verbose: bool = False,
) -> tuple[dict[str, BoardResultsConfig], dict[int, list[dict]]]:
    """Find the features whose top activating inputs share board states.

    For each examined dim, the first ``top_k`` inputs are converted to chess boards once.
    For every config, the boards are then turned into a state stack, the initial board
    state is masked off, the states are averaged, and ``chess_utils.find_common_states``
    is applied with ``threshold``.

    Args:
        per_dim_stats: Mapping from feature dim to a dict holding at least the
            "decoded_tokens" and "activations" entries.
        minimum_number_of_activations: Index into a dim's "activations". The dim is
            skipped when the last value of that entry is 0.
        top_k: Number of top inputs per dim to convert into boards.
        max_dims: Maximum number of dims to examine, in iteration order.
        threshold: Forwarded to ``chess_utils.find_common_states``.
        configs: Board-state configs to evaluate. Results are keyed by the ``__name__``
            of each config's ``custom_board_state_function``.
        feature_dict: Mapping from dim to a list of matched-filter records. Built with
            ``initialize_feature_dictionary(per_dim_stats)`` when omitted. It is
            mutated in place.
        device: Device passed to the chess_utils helpers and used for the stored
            common-board-state tensor.
        notebook_usage: When True, print up to ten inputs and every common index of
            each matching dim.
        verbose: When True, print a per-config summary.

    Returns:
        A dict mapping config name to its ``BoardResultsConfig``, and the updated
        ``feature_dict``.
    """
    if feature_dict is None:
        feature_dict = initialize_feature_dictionary(per_dim_stats)

    nonzero_count = 0
    dim_count = 0

    results: dict[str, BoardResultsConfig] = {}

    # One zeroed tally object per config.
    for config in configs:
        board_tracker = torch.zeros(config.num_rows, config.num_cols).tolist()
        per_class_dict = {key: 0 for key in range(0, get_num_classes(config))}

        results[config.custom_board_state_function.__name__] = BoardResultsConfig(
            per_class_dict=per_class_dict,
            board_tracker=board_tracker,
        )

    for dim in tqdm(per_dim_stats, total=len(per_dim_stats), desc="Processing chess pgn strings"):
        # Test the limit before counting, so that exactly max_dims dims are examined and
        # dim_count reports how many were actually looked at. (Counting first stopped one
        # dim early while still reporting max_dims.)
        if dim_count >= max_dims:
            break
        dim_count += 1

        decoded_tokens = per_dim_stats[dim]["decoded_tokens"]
        activations = per_dim_stats[dim]["activations"]
        # If the dim doesn't have at least minimum_number_of_activations firing activations, skip it
        if activations[minimum_number_of_activations][-1].item() == 0:
            continue
        nonzero_count += 1

        inputs = ["".join(string) for string in decoded_tokens][:top_k]

        # The boards are built once per dim and shared by all configs.
        chess_boards = [
            chess_utils.pgn_string_to_board(pgn, allow_exception=True) for pgn in inputs
        ]

        for config in configs:
            config_name = config.custom_board_state_function.__name__
            config_results = results[config_name]

            # See function definitions for jaxtyped shapes
            one_hot_list = chess_utils.chess_boards_to_state_stack(chess_boards, device, config)
            one_hot_list = chess_utils.mask_initial_board_states(one_hot_list, device, config)
            averaged_one_hot = chess_utils.get_averaged_states(one_hot_list)
            common_indices = chess_utils.find_common_states(averaged_one_hot, threshold)

            # Nothing to record unless at least one square matches.
            if not any(len(axis_indices) > 0 for axis_indices in common_indices):
                continue

            config_results.pattern_match_count += 1
            config_results.total_average_length += sum(len(pgn) for pgn in inputs) / len(inputs)

            if notebook_usage:
                for pgn in inputs[:10]:
                    print(pgn)

            common_board_state = torch.zeros(
                config.num_rows,
                config.num_cols,
                get_num_classes(config),
                device=device,
                dtype=torch.int8,
            )

            # Each zipped entry is used as a (row, col, class) index triple.
            for state_idx in zip(*common_indices):
                row, col, cls = state_idx[0], state_idx[1], state_idx[2]
                config_results.board_tracker[row][col] += 1
                config_results.per_class_dict[cls.item()] += 1
                config_results.average_matches_per_dim += 1

                common_board_state[row, col, cls] = 1
                if notebook_usage:
                    print(f"Dim: {dim}, Index: {state_idx}")

            feature_dict[dim].append(
                {
                    "name": config_name,
                    "max_activation": activations[0][-1].item(),
                    "board_state": common_board_state,
                }
            )

    # Store the shared counters and turn the accumulated sums into per-match averages.
    for config in configs:
        config_results = results[config.custom_board_state_function.__name__]
        match_count = config_results.pattern_match_count
        config_results.dim_count = dim_count
        config_results.nonzero_count = nonzero_count
        if match_count > 0:
            config_results.total_average_length /= match_count
            config_results.average_matches_per_dim /= match_count

    if verbose:
        for config in configs:
            config_name = config.custom_board_state_function.__name__
            config_results = results[config_name]
            print(f"\n{config_name} Results:")
            print(
                f"Out of {dim_count} features, {nonzero_count} had at least {minimum_number_of_activations} activations."
            )
            print(
                f"{config_results.pattern_match_count} features matched on all top {top_k} inputs for our board to state function {config_name}"
            )
            print(
                f"The average length of inputs of pattern matching features was {config_results.total_average_length}"
            )

            # The per-class and per-square breakdowns are only printed for 8-row configs.
            if config.num_rows == 8:
                print("\nThe following square states had the following number of occurances:")
                for key, count in config_results.per_class_dict.items():
                    print(f"Index: {key}, Count: {count}")

                print("\nHere are the most common squares:")
                # The rows are flipped before printing.
                board_tracker = torch.tensor(config_results.board_tracker).flip(0)
                print(board_tracker)  # torch.tensor has a cleaner printout

    return results, feature_dict

In our first SAE, which has L0 of 25, we have many features that contain these shared board states. For example, in the PGN strings for Dim 18, all pgn strings contain the following 8 identical board squares:

;1.e4 e5 2.Bc4 Nf6 3.d3 c6 4.Nf3 d5 5.exd5 cxd5 6.Bb3 e4 7.dxe4 Nxe4 8.O-O Bc5 9.Qxd5 Qxd5 10.Bxd5 Nf6 11.Bb3 O-O 12.

;1.e4 c5 2.Nf3 Nc6 3.d4 cxd4 4.Nxd4 e6 5.Nc3 a6 6.Nb3 Bb4 7.Bd2 d5 8.exd5 exd5 9.Bd3 Nf6 10.O-O O-O 11.

;1.e4 c5 2.Nf3 d6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 a6 6.Bd3 g6 7.O-O Bg7 8.Nf3 O-O 9.Bg5 Nc6 10.Qd2 

;1.e4 e5 2.Nf3 Nc6 3.Bb5 Nge7 4.Bxc6 Nxc6 5.d3 d6 6.O-O Be7 7.h3 O-O 8.b3 a6 9.Bb2 f5 10.exf5 Bxf5 11.Nc3 

;1.e4 e5 2.Nf3 Nc6 3.Bb5 Nge7 4.Bxc6 Nxc6 5.d3 d6 6.O-O Be7 7.h3 O-O 8.b3 a6 9.Bb2 f5 10.exf5 Bxf5 11.Nc3 Bg5 12.Qe2 

;1.e4 c5 2.c3 Nc6 3.Nf3 g6 4.Bb5 Bg7 5.O-O d6 6.d4 cxd4 7.cxd4 Nf6 8.d5 a6 9.Ba4 b5 10.dxc6 bxa4 11.Qxa4 O-O 12.Nc3 B

;1.e4 e5 2.Nf3 Nc6 3.d4 exd4 4.Nxd4 Bc5 5.Be3 Nf6 6.Nxc6 bxc6 7.Bxc5 Nxe4 8.Be3 Rb8 9.Bd3 Qe7 10.O-O O-O 11.Nc3 Nf6 12.Bg

;1.e4 c5 2.Nf3 d6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 a6 6.Bd3 g6 7.O-O Bg7 8.Nf3 O-O 9.Bg5 Nc6 10.Qd2 B

;1.e4 e5 2.Nf3 Nf6 3.Nxe5 d6 4.Nf3 Nxe4 5.d4 d5 6.Bd3 Be7 7.O-O O-O 8.c4 c6 9.Nc3 Nxc3 10.bxc3 Bg

;1.e4 e5 2.Nf3 Nc6 3.Bb5 Nge7 4.Bxc6 Nxc6 5.d3 d6 6.O-O Be7 7.h3 O-O 8.b3 a6 9.Bb2 f5 10.exf5 Bxf5 11.Nc3 Bg5 12.Qe2 Bh6 13.Ne4 

Dim: 18, Index: (tensor(0), tensor(4), tensor(6))

Dim: 18, Index: (tensor(0), tensor(5), tensor(10))

Dim: 18, Index: (tensor(0), tensor(6), tensor(12))

Dim: 18, Index: (tensor(0), tensor(7), tensor(6))

Dim: 18, Index: (tensor(7), tensor(4), tensor(6))

Dim: 18, Index: (tensor(7), tensor(5), tensor(2))

Dim: 18, Index: (tensor(7), tensor(6), tensor(0))

Dim: 18, Index: (tensor(7), tensor(7), tensor(6))

In [None]:
# Run the board analysis with an explicit top_k of 30 on the first 50 dims. The restore sits
# in `finally` so the notebook-wide top_k is put back even if the analysis raises or the
# cell is interrupted.
prev_top_k = top_k
top_k = 30
try:
    results, feature_dict1 = board_analysis(
        per_dim_stats1, top_k, top_k, 50, 0.99, [chess_utils.piece_config], device="cpu", notebook_usage=True, verbose=True
    )
finally:
    top_k = prev_top_k

In [None]:
# Dump the whole feature dictionary returned by the board analysis in the previous cell.
# NOTE(review): this prints every entry, including the stored board-state tensors.
print(feature_dict1)

In [None]:
# Run the board analysis with a temporary top_k of 10 on the first 5 dims. The restore sits
# in `finally` so the notebook-wide top_k is put back even if the analysis raises or the
# cell is interrupted; otherwise every later cell would silently run with top_k == 10.
prev_top_k = top_k
top_k = 10
try:
    board_analysis(
        per_dim_stats1, top_k, top_k, 5, 0.99, [chess_utils.piece_config], device="cpu", notebook_usage=True, verbose=True
    )
finally:
    top_k = prev_top_k

In [None]:
# Same five-dim board analysis as the previous cell, but at the notebook-wide top_k.
# The former trailing `top_k = prev_top_k` was dropped: this cell never changes top_k, and
# restoring from a variable that only an earlier cell sets made it depend on hidden state.
# The trailing semicolon keeps the returned (results, feature_dict) tuple from being echoed.
board_analysis(
    per_dim_stats1, top_k, top_k, 5, 0.99, [chess_utils.piece_config], device="cpu", notebook_usage=True, verbose=True
);

For our first SAE, with L0 of 25, we have around 10% of features matching one of our syntax match filters (~200 out of 2048).

In [None]:
# Run the three syntax filters (move numbers, spaces, dots) over autoencoder 1's features.
# No feature_dict is passed, so each call builds its own; the return value of the last call
# is echoed as the cell output.
syntax_analysis(per_dim_stats1, top_k, top_k, max_dims, chess_utils.find_num_indices, verbose=True)
print()
syntax_analysis(per_dim_stats1, top_k, top_k, max_dims, chess_utils.find_spaces_indices, verbose=True)
print()
syntax_analysis(per_dim_stats1, top_k, top_k, max_dims, chess_utils.find_dots_indices, verbose=True)

In contrast, for the second SAE with an L0 of ~2000, 75% of 1561 features match the `find_num_indices` filter!

In [None]:
# Run the same three syntax filters over autoencoder 2's features.
# No feature_dict is passed, so each call builds its own; the return value of the last call
# is echoed as the cell output.
syntax_analysis(per_dim_stats2, top_k, top_k, max_dims, chess_utils.find_num_indices, verbose=True)
print()
syntax_analysis(per_dim_stats2, top_k, top_k, max_dims, chess_utils.find_spaces_indices, verbose=True)
print()
syntax_analysis(per_dim_stats2, top_k, top_k, max_dims, chess_utils.find_dots_indices, verbose=True)

For board analysis on our first autoencoder, we find that 50% of 2000 features match our `piece_config` filter. We also print the number of matches per class:

The following square states had the following number of occurances:

Index: 0, Count: 27

Index: 1, Count: 11

Index: 2, Count: 10

Index: 3, Count: 12

Index: 4, Count: 126

Index: 5, Count: 344

Index: 6, Count: 1781

Index: 7, Count: 439

Index: 8, Count: 178

Index: 9, Count: 44

Index: 10, Count: 24

Index: 11, Count: 6

Index: 12, Count: 37

Unsurprisingly, most matches are for idx 6, the blank class. Many of these are potentially false positives. Understanding this further is probably important. We could potentially compare to common board state statistics in real chess games (which we could gather using this repo).

We also print the match count per square:

Here are the most common squares:

        [  0, 126,  19,  31,  12,  17,  69,  28],

        [  8,   2,  90, 194, 160,   1,   9,   3],

        [  6,   2,  63,  35,  54,  73,  11,   2],

        [  4,  13,  26, 125,  89,   0,   0,   4],

        [  2,   0,  41, 166, 203,   9,   1,   0],

        [  1,   6,  31,   9,  15, 136,  11,   1],

        [  4,   8,  45, 272, 349,  10,  10,   2],

        [  1, 120,  26,  22,  27,  56, 137,  42]])

One potential measure of quality could be a more even spread of pattern matches per square or over classes.

In [None]:
# piece_config board analysis over autoencoder 1's features, up to max_dims dims, with a
# 0.99 threshold. notebook_usage is left at its default, so only the verbose summary prints.
board_analysis(
    per_dim_stats1, top_k, top_k, max_dims, 0.99, [chess_utils.piece_config], device="cpu", verbose=True
)

In contrast, only 436 out of 1561 features in our second autoencoder match a `piece_config` filter. This seems concerning! 75% of features are syntax features, rather than true underlying semantic features.

Obviously, this second autoencoder has a pretty terrible L0. It would be interesting to do more analysis to see what this distribution looks like for more reasonable L0s.

In [None]:
# piece_config board analysis over autoencoder 2's features, up to max_dims dims, with a
# 0.99 threshold. notebook_usage is left at its default, so only the verbose summary prints.
board_analysis(
    per_dim_stats2, top_k, top_k, max_dims, 0.99, [chess_utils.piece_config], device="cpu", verbose=True
)

Our first autoencoder finds 11 features matching `pin_config`, while the second only has 1 feature match for `pin_config`.

In [None]:
# pin_config board analysis for autoencoder 1, on the default device ("cpu").
# The 2500 limit is above the max_dims used when the stats were collected, so presumably
# every collected dim is examined.
board_analysis(per_dim_stats1, top_k, top_k, 2500, 0.99, [chess_utils.pin_config], verbose=True)

In [None]:
# pin_config board analysis for autoencoder 2, on the default device ("cpu").
# The 2500 limit is above the max_dims used when the stats were collected, so presumably
# every collected dim is examined.
board_analysis(per_dim_stats2, top_k, top_k, 2500, 0.99, [chess_utils.pin_config], verbose=True)

We can also pass in a list of Configs for around a 2x speedup.

In [None]:
# Evaluate two configs in one call: board_analysis converts each dim's inputs to chess
# boards once and reuses those boards for every config in the list.
board_analysis(per_dim_stats1, top_k, top_k, 2500, 0.99, [chess_utils.threat_config, chess_utils.check_config], verbose=True)

We can use this to build a dictionary that maps every feature in an SAE's dictionary to the filters it matches.

In [None]:
# Start from an empty list of matched filters for every feature of both autoencoders.
feature_dict1 = initialize_feature_dictionary(per_dim_stats1)
feature_dict2 = initialize_feature_dictionary(per_dim_stats2)

# Run all four board configs in one pass. feature_dict1 is passed in explicitly so the
# matches are appended to the dictionary initialised above; previously that dictionary was
# built and then ignored, because board_analysis creates its own when none is given.
board_results, feature_dict1 = board_analysis(
    per_dim_stats1,
    top_k,
    top_k,
    2500,
    0.99,
    [
        chess_utils.pin_config,
        chess_utils.piece_config,
        chess_utils.threat_config,
        chess_utils.check_config,
    ],
    feature_dict=feature_dict1,
    verbose=True,
)

In [None]:
# Preview the first seven features together with the filter records each one collected.
preview_keys = list(feature_dict1)[:7]
for key in preview_keys:
    print(key)
    print(feature_dict1[key])