#Set-up

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# set seeds
import random
import numpy as np
import torch

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

set_seed(42)

# Load NT model

In [3]:
"loading smallest nucleotide transformer (50m params)"


from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

num_params = 50 ## default 50

# Import the tokenizer and the model
tokenizer_nt = AutoTokenizer.from_pretrained(f"InstaDeepAI/nucleotide-transformer-v2-{num_params}m-multi-species", trust_remote_code=True)
model_nt = AutoModelForMaskedLM.from_pretrained(f"InstaDeepAI/nucleotide-transformer-v2-{num_params}m-multi-species", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Load and preprocess addgene dataset

In [4]:
import pandas as pd


# Constants
TEST_DATA_PATH = '/content/drive/MyDrive/NOO_paper/Datasets/WorldWide/BLAST_geac_ext_169k_val_random.csv'
TRAIN_DATA_PATH = '/content/drive/MyDrive/NOO_paper/Datasets/WorldWide/BLAST_geac_ext_169k_train_random.csv'
INFREQUENT_THRESHOLD = 10

def split_test_data(test_data):
    """Split test data into input and target variables."""
    y_test = test_data['nations']
    x_test = test_data[['sequence']]
    return x_test, y_test

def replace_infrequent_labels(labels, threshold=INFREQUENT_THRESHOLD):
    """Identify and replace infrequent labels."""
    label_counts = labels.value_counts()
    infrequent_labels = label_counts[label_counts < threshold].index
    return labels.replace(infrequent_labels, 'infrequent')

def map_labels_to_integers(labels):
    """Map labels to integers."""
    unique_labels = labels.unique()
    return {label: int(i) for i, label in enumerate(unique_labels)}

def without_US(data):
    """Filter out rows where the nation is 'UNITED STATES'."""
    data_wo_US = data[data['nations'] != 'UNITED STATES']
    data_wo_US.reset_index(drop=True, inplace=True)

    data_w_US = data[data['nations'] == 'UNITED STATES']
    data_w_US.reset_index(drop=True, inplace=True)
    return data_wo_US, data_w_US

def US_vs_them(labels):
    """Categorize labels into 'UNITED STATES' and 'NON US'."""
    return labels.apply(lambda x: x if x == 'UNITED STATES' else 'NON US')

def pad_sequence(seq, length, pad_char='N'):
    """Pad sequences to the specified length with the given character."""
    return seq.ljust(length, pad_char)[:length]

# Load data
train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

print(f'test_data shape: {test_data.shape}')

# Remove US
# train_data, train_data_US = without_US(train_data)
# test_data, test_data_US = without_US(test_data)

print(f'test_data shape: {test_data.shape}')

# Split data
x_train, y_train = train_data[['sequence']], train_data['nations']
x_test, y_test = split_test_data(test_data)

print(f'test_data shape: {y_test.shape}')
print(f'x_train shape: {x_train.shape}')
print(f'y_train shape: {y_train.shape}')

# Combine labels from train and test datasets
processed_labels = pd.concat([y_train, y_test], axis=0, ignore_index=True)
label_to_int = map_labels_to_integers(processed_labels)


# map labels to integers
y_train = y_train.map(label_to_int)
y_test = y_test.map(label_to_int)

print(f'y_test shape: {y_test.shape}')


# reset indices before concat
x_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
x_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

df_train = pd.concat([x_train, y_train], axis=1)
df_val = pd.concat([x_test, y_test], axis=1)

print(f'test_data shape: {test_data.shape}')


# Filter out sequences shorter than min_length and clean them
min_length = 0
df_train = df_train[df_train['sequence'].str.len() > min_length]
df_val = df_val[df_val['sequence'].str.len() > min_length]

print(f'test_data shape: {test_data.shape}')


# Ensure indices are reset correctly
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)

# Display the split data
print("Train Data Shape:", df_train.shape)
print("Validation Data Shape:", df_val.shape)


test_data shape: (15551, 4)
test_data shape: (15551, 4)
test_data shape: (15551,)
x_train shape: (93306, 1)
y_train shape: (93306,)
y_test shape: (15551,)
test_data shape: (15551, 4)
test_data shape: (15551, 4)
Train Data Shape: (93306, 2)
Validation Data Shape: (15551, 2)


# Set-up & Load SAE

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

cfg = {
    "seed": 49,
    "batch_size": 4096*6,
    "buffer_mult": 384,
    "lr": 5e-5,
    "num_tokens": tokenizer_nt.vocab_size,
    "d_model": 512,
    "l1_coeff": 1e-1,
    "beta1": 0.9,
    "beta2": 0.999,
    "dict_mult": 8, # hidden_d = d_model * dict_mult
    "seq_len": 512,
    "d_mlp": 512,
    "enc_dtype":"fp32",
    "remove_rare_dir": False,
    "total_training_steps": 10000,
    "lr_warm_up_steps": 1000,
    "device": "cuda"
}
cfg["model_batch_size"] = 64
cfg["buffer_size"] = cfg["batch_size"] * cfg["buffer_mult"]
cfg["buffer_batches"] = cfg["buffer_size"] // cfg["seq_len"]

DTYPES = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}

class AutoEncoder(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # HP-choices
        d_hidden = cfg["d_mlp"] * cfg["dict_mult"]
        d_mlp = cfg["d_mlp"]
        self.l0_coeff = cfg.get("l0_coeff", 5)
        self.threshold = cfg.get("activation_threshold", 0.3)
        # Temperature for sigmoid approximation
        self.temperature = cfg.get("temperature", 1.0)
        dtype = DTYPES[cfg["enc_dtype"]]
        torch.manual_seed(cfg["seed"])

        self.W_enc = nn.Parameter(torch.nn.init.kaiming_uniform_(torch.empty(d_mlp, d_hidden, dtype=dtype)))
        self.W_dec = nn.Parameter(torch.nn.init.kaiming_uniform_(torch.empty(d_hidden, d_mlp, dtype=dtype)))
        self.b_enc = nn.Parameter(torch.zeros(d_hidden, dtype=dtype))
        self.b_dec = nn.Parameter(torch.zeros(d_mlp, dtype=dtype))
        self.W_dec.data[:] = self.W_dec / self.W_dec.norm(dim=-1, keepdim=True)

        self.d_hidden = d_hidden
        self.to("cuda") if torch.cuda.is_available() else self.to("cpu")

    def get_continuous_l0(self, x):
        """
        Compute continuous relaxation of L0 norm using sigmoid
        This provides useful gradients unlike the discrete L0
        """
        # Shifted sigmoid to approximate step function
        return torch.sigmoid((x.abs() - self.threshold) / self.temperature)

    def forward(self, x):
        # encoding and decoding of input vec
        x_cent = x - self.b_dec
        pre_acts = x_cent @ self.W_enc + self.b_enc
        acts = F.relu(pre_acts)

        # Compute continuous L0 approximation before thresholding
        l0_proxy = self.get_continuous_l0(acts)

        # Apply hard threshold for forward pass --- This is actually jumprelu (I think!)
        acts_sparse = (acts.abs() > self.threshold).float() * acts
        x_reconstruct = acts_sparse @ self.W_dec + self.b_dec

        # L2 Loss (Reconstruction Loss)
        l2_loss = F.mse_loss(x_reconstruct.float(), x.float(), reduction='none')
        l2_loss = l2_loss.sum(-1)
        l2_loss = l2_loss.mean()

        # Normalized MSE for reporting
        nmse = torch.norm(x - x_reconstruct, p=2) / torch.norm(x, p=2)

        # Continuous L0 loss (using sigmoid approximation)
        l0_loss = l0_proxy.sum(dim=1).mean()

        # Total Loss: reconstruction + sparsity
        loss = l2_loss + self.l0_coeff * l0_loss

        # For monitoring: true L0 count (not used in optimization)
        true_l0 = (acts_sparse.float().abs() > 0).float().sum(dim=1).mean()

        # For monitoring: L1 loss
        l1_loss = acts_sparse.float().abs().sum(-1).mean()

        return loss, x_reconstruct, acts_sparse, l2_loss, nmse, l1_loss, true_l0

    @torch.no_grad()
    def remove_parallel_component_of_grads(self):
        W_dec_normed = self.W_dec / self.W_dec.norm(dim=-1, keepdim=True)
        W_dec_grad_proj = (self.W_dec.grad * W_dec_normed).sum(-1, keepdim=True) * W_dec_normed
        self.W_dec.grad -= W_dec_grad_proj



sae_model = AutoEncoder(cfg)
sae_res = AutoEncoder(cfg)
sae_l10 = AutoEncoder(cfg)


## v3: closer to JumpRelu paper

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

cfg = {
    "seed": 49,
    "batch_size": 4096*6,
    "buffer_mult": 384,
    "lr": 5e-5,
    "num_tokens": tokenizer_nt.vocab_size,
    "d_model": 512,
    "l1_coeff": 1e-1,
    "beta1": 0.9,
    "beta2": 0.999,
    "dict_mult": 8, # hidden_d = d_model * dict_mult
    "seq_len": 512,
    "d_mlp": 512,
    "enc_dtype":"fp32",
    "remove_rare_dir": False,
    "total_training_steps": 10000,
    "lr_warm_up_steps": 1000,
    "device": "cuda"
}


import torch
import torch.nn as nn
import torch.nn.functional as F

DTYPES = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}

class JumpReLUSAE(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # HP-choices
        d_hidden = cfg["d_mlp"] * cfg["dict_mult"]
        d_mlp = cfg["d_mlp"]
        self.l0_coeff = cfg.get("l0_coeff", 5)
        initial_threshold = cfg.get("activation_threshold", 0.001)
        self.epsilon = cfg.get("epsilon", 0.001)  # kernel width for STE
        dtype = DTYPES[cfg["enc_dtype"]]
        torch.manual_seed(cfg["seed"])

        # Initialize W_enc first
        self.W_enc = nn.Parameter(torch.nn.init.kaiming_uniform_(torch.empty(d_mlp, d_hidden, dtype=dtype)))
        # Initialize W_dec as transpose of W_enc, but as a separate parameter
        self.W_dec = nn.Parameter(self.W_enc.transpose(0, 1).clone())  # clone() is important to make it independent
        self.b_enc = nn.Parameter(torch.zeros(d_hidden, dtype=dtype))
        self.b_dec = nn.Parameter(torch.zeros(d_mlp, dtype=dtype))

        # Per-feature learnable thresholds (θᵢ)
        self.thresholds = nn.Parameter(torch.ones(d_hidden, dtype=dtype) * initial_threshold)

        # Normalize decoder weights as mentioned in the paper
        self.W_dec.data[:] = self.W_dec / self.W_dec.norm(dim=-1, keepdim=True)

        self.d_hidden = d_hidden
        self.to("cuda")

    def rectangle_kernel(self, z):
        """Rectangle kernel function K(z) as described in the paper"""
        return (z.abs() <= 0.5).float()

    def jumprelu_forward(self, pre_acts):
        """Forward pass of JumpReLU with per-feature thresholds"""
        # Create mask for values above threshold
        mask = (pre_acts.abs() > self.thresholds.unsqueeze(0)).float()
        return pre_acts * mask

    def jumprelu_backward(self, pre_acts):
        """
        Compute pseudo-derivatives for the backward pass using rectangle kernel
        as described in equations 11 and 12 in the paper
        """
        z_normalized = (pre_acts.unsqueeze(-1) - self.thresholds) / self.epsilon
        kernel_vals = self.rectangle_kernel(z_normalized) / self.epsilon
        return kernel_vals

    def compute_l0_penalty(self, pre_acts):
        """Compute L0 penalty term using Heaviside step function"""
        return torch.sum(pre_acts.abs() > self.thresholds.unsqueeze(0), dim=1).float().mean()

    def forward(self, x):
        # Encoding
        pre_acts = x @ self.W_enc + self.b_enc

        # Apply JumpReLU activation
        acts = self.jumprelu_forward(pre_acts)

        # Decoding
        x_reconstruct = acts @ self.W_dec + self.b_dec

        # Reconstruction Loss (L2)
        l2_loss = F.mse_loss(x_reconstruct, x, reduction='none').sum(-1).mean()

        # L0 sparsity penalty
        l0_loss = self.compute_l0_penalty(pre_acts)

        # Total loss as per equation 9 in the paper
        loss = l2_loss + self.l0_coeff * l0_loss

        # Monitoring metrics
        with torch.no_grad():
            nmse = torch.norm(x - x_reconstruct) / torch.norm(x)
            l1_loss = acts.abs().sum(-1).mean()
            true_l0 = (acts != 0).float().sum(dim=1).mean()

        # Store pre_activations for potential use in backward pass
        self.saved_pre_acts = pre_acts

        return loss, x_reconstruct, acts, l2_loss, nmse, l1_loss, true_l0

    def backward_hook(self, grad):
        """Custom backward hook to implement STE for threshold gradients"""
        kernel_grads = self.jumprelu_backward(self.saved_pre_acts)
        return grad * kernel_grads

    @torch.no_grad()
    def remove_parallel_component_of_grads(self):
        """Maintain decoder weight normalization during training"""
        W_dec_normed = self.W_dec / self.W_dec.norm(dim=-1, keepdim=True)
        W_dec_grad_proj = (self.W_dec.grad * W_dec_normed).sum(-1, keepdim=True) * W_dec_normed
        self.W_dec.grad -= W_dec_grad_proj

    def register_backward_hooks(self):
        """Register backward hooks for STE implementation"""
        self.thresholds.register_hook(self.backward_hook)



sae_model = JumpReLUSAE(cfg)
sae_model.register_backward_hooks()  # Important: Register hooks before training


## Load already-trained SAE

In [None]:
#weights_path = "/content/drive/MyDrive/SAEs_for_Genomics/Weights/nt50m_sae_+40mtokens.pt"

weights_path = "/content/drive/MyDrive/SAEs_for_Genomics/Weights/SAE_NT50_L11.mlp_JumpReLU_v2_MULTISP_150mtokens_E3.pt"
state_dict = torch.load(weights_path, weights_only=True)
sae_model.load_state_dict(state_dict)

<All keys matched successfully>

# Using trained SAE to interpret the NuclTrans

In [6]:
## load custom functions from utils.py

import sys
sys.path.append('//content/drive/MyDrive/SAEs_for_Genomics')

import importlib
import utils
importlib.reload(utils)

<module 'utils' from '//content/drive/MyDrive/SAEs_for_Genomics/utils.py'>

In [None]:
val_seqs = df_val['sequence'].tolist()
val_tokens = tokenizer_nt(val_seqs, max_length=512, padding='max_length', truncation=True, return_tensors="pt")

## Analysing Rare Features (copied & adapted)

For each feature we can get the frequency at which it's non-zero (per token, averaged across a bunch of batches), and plot a histogram

In [None]:
@torch.no_grad()
def get_freqs(num_batches=20, local_encoder=None):
    if local_encoder is None:
        local_encoder = encoder

    # initialise frequency counters to 0 for all hidden neurons
    act_freq_scores = torch.zeros(4096, dtype=torch.float32).cuda()
    total = 0

    for i in range(num_batches):
        # prepare batch of tokens to input to the model
        tokens = val_tokens[i*cfg['model_batch_size']:(i+1)*cfg['model_batch_size']]

        # run model on batch of tokens
        #_, cache = model.run_with_cache(tokens, stop_at_layer=1, names_filter=utils.get_act_name("post", 0))
        mlp_act = utils.get_layer_activations(model_nt, tokens['input_ids'].cuda(), tokens['attention_mask'].cuda())
        mlp_act = mlp_act[0] # unnest

        # extract mlp activations and reshape for SAE
        mlp_act = mlp_act.reshape(-1, d_mlp)

        # normalise using same approach as for traning (optional)



        # input the acts into an SAE, get the SAEs hidden acts
        loss, x_reconstruct, hidden, l2_loss, nmse, l1_loss, true_l0 = local_encoder(mlp_act) ## acts is the second/third output -> 1/2
        act_freq_scores += (hidden > 0).sum(0) # increase counter if act > 0
        total+=hidden.shape[0]

    act_freq_scores /= total # turn counts into frequencies

    # calc and print number of never activated SAE units
    num_dead = (act_freq_scores==0).float().mean()
    print("Num dead", num_dead)

    return act_freq_scores


In [None]:
d_model = cfg["d_model"]
d_mlp = cfg["d_mlp"]
model = model_nt.cuda()

sae_model.cuda()
sae_model.eval()

freqs = get_freqs(num_batches = 20,
                  local_encoder = sae_model) # what % of time is a hidden unit activated > 0?

NameError: name 'val_tokens' is not defined

In [None]:
# how many sae units are rarely activated?

rare_T = 1e-4 #

print(f'Of {d_model*cfg["dict_mult"]} hidden SAE units, {sum(freqs < rare_T).item()} are very rarely activated')	#


# how many features are dense i.e. activate very often?

often_T = 0.3
print(f'Of {d_model*cfg["dict_mult"]} hidden SAE units, {sum(freqs > often_T).item()} are activated very often')	#

In [None]:
import plotly.express as px

# Add 1e-9 so that dead features show up as log_freq -9
log_freq = (freqs + 10**-9).log10()
log_freq = log_freq.cpu().detach().numpy()

px.histogram(log_freq, title="Log Frequency of Features", histnorm='percent')

In [None]:
import numpy as np

# Get indices where freqs is not 0
mask = freqs != 0
indices = torch.where(mask)[0]
print(indices)

# Save indices to a file
np.save('non_rare_feature_indices.npy', indices.cpu().numpy())

In [None]:
encoder = sae_model # just renaming for simplicity


# Q: why encoder (as opposed to decoder) matrix?

is_rare = freqs < 1e-4 # get bool mask
rare_enc = encoder.W_enc[:, is_rare] # get cols from enc matrix
rare_mean = rare_enc.mean(-1) # average these cols

# cosine similarity of rare features to average rare feature and plot
cosine_sim = rare_mean @ encoder.W_enc / rare_mean.norm() / encoder.W_enc.norm(dim=0)

# move to cpu
cosine_sim = cosine_sim.cpu().detach().numpy()
is_rare = is_rare.cpu().detach().numpy()

px.histogram(cosine_sim,
             title="Cosine Sim with Average Rare Feature",
             color=is_rare,
             labels={"color": "is_rare", "count": "percent", "value": "cosine_sim"},
             marginal="box", histnorm="percent", barmode='overlay')

## Loading test-sequence w annotations

In [7]:
import pandas as pd
import torch
from transformers import AutoTokenizer

# Load annotations of val sequences produced by pLannotate
folder_path = '/content/drive/MyDrive/SAEs_for_Genomics/Annotated_seqs/annotations_of_second_ind_rand_3000_seqs.csv'
df_annotated_new = pd.read_csv(folder_path)

# Add 'valseq' to seq_id col
df_annotated_new['seq_id'] = 'valseq_' + df_annotated_new['seq_id'].astype(str)

# Get the sequences whose annotations we just loaded
seq_ids = list(set(df_annotated_new['seq_id']))
# Seq ids are strings of the form 'valseq_{int}'- how can I extract just int?
seq_ids = [int(seq_id.split('_')[1]) for seq_id in seq_ids]
seq_ids_new = sorted(seq_ids)

seq_new = df_val['sequence'].iloc[seq_ids].tolist()

# Tokenize the sequences
tokens_new = tokenizer_nt(seq_new, max_length=512, padding='max_length', truncation=True, return_tensors="pt")

Load another eval dataset

In [8]:
## compare annotations with larger df_annotated
path = '/content/drive/MyDrive/SAEs_for_Genomics/Annotated_seqs/annotat_val_seq0to3000.csv'
df_annotated_3k_old = pd.read_csv(path)

# Get the sequences whose annotations we just loaded
seq_ids = list(set(df_annotated_3k_old['seq_id']))
# Seq ids are strings of the form 'valseq_{int}'- how can I extract just int?
seq_ids = [int(seq_id.split('_')[1]) for seq_id in seq_ids]
seq_ids = sorted(seq_ids)

seq = df_val['sequence'].iloc[seq_ids].tolist()

# Tokenize the sequences
tokens_old = tokenizer_nt(seq, max_length=512, padding='max_length', truncation=True, return_tensors="pt")

Load a third eval dataset

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer

## compare annotations with larger df_annotated
path = '/content/drive/MyDrive/SAEs_for_Genomics/Annotated_seqs/annotations_of_third_3000_seqs.csv'
df_annotated_3k_3rd = pd.read_csv(path)

df_annotated_3k_3rd['seq_id'] = 'valseq_' + df_annotated_3k_3rd['seq_id'].astype(str)

# Get the sequences whose annotations we just loaded
seq_ids = list(set(df_annotated_3k_3rd['seq_id']))

# Seq ids are strings of the form 'valseq_{int}'- how can I extract just int?
seq_ids = [int(seq_id.split('_')[1]) for seq_id in seq_ids]
seq_ids = sorted(seq_ids)

seq = df_val['sequence'].iloc[seq_ids].tolist()

# Tokenize the sequences
tokens_3rd = tokenizer_nt(seq, max_length=512, padding='max_length', truncation=True, return_tensors="pt")

## Create df of all tokens with annotation

### skip for N >= 1000

In [None]:
# Create a table that lists each token in the sequences alongside its annotation(s)

token_df = utils.make_token_df_new(
                      tokens = tokens_3rd['input_ids'].squeeze(),
                      tokenizer = tokenizer_nt,
                      df_annotated = df_annotated_3k_3rd,
                      seq_ids = seq_ids,
                      len_prefix = 2, ## choice: what should these be?
                      len_suffix = 2,
                      nucleotides_per_token = 6, # particular to this model
                      descriptor_col = 'Feature' # values: Feature, Type, Description
)
token_df

# save token_df
token_df.to_csv(f'/content/drive/MyDrive/SAEs_for_Genomics/Annotated_seqs/token_df_val_3k_3rd.csv', index=False)

### and load directly

In [9]:
# load token_df for >= 1000 seqs
token_df_new = pd.read_csv('/content/drive/MyDrive/SAEs_for_Genomics/Annotated_seqs/token_df_val_3000randseqs.csv')
token_df_old = pd.read_csv('/content/drive/MyDrive/SAEs_for_Genomics/Annotated_seqs/token_df_val_seq0to3000.csv')

### Running SAE

Let's go and investigate a non rare feature.

We start by getting the SAE activations for (all) token in our dataset

In [10]:
d_model = cfg["d_model"]
d_mlp = cfg["d_mlp"]
num_layer = 11
batch_size = 64

tokens = tokens_new

# Calculate batch information
total_tokens = tokens['input_ids'].shape[0] * tokens['input_ids'].shape[1]
num_batches = (total_tokens + batch_size - 1) // batch_size

all_latents = []

# Ensure models are in eval mode
sae_model.eval()
model_nt.eval()

for i in range(num_batches):
   start_idx = i * batch_size
   end_idx = min((i + 1) * batch_size, total_tokens)

   # Reshape tokens for current batch
   batch_input_ids = tokens['input_ids'][start_idx:end_idx].cuda()
   batch_attention_mask = tokens['attention_mask'][start_idx:end_idx].cuda()

   with torch.no_grad():
         # Get MLP activations
          mlp_act = utils.get_layer_activations(model_nt.cuda(),
                                                batch_input_ids,
                                                batch_attention_mask,
                                                layer_N=num_layer)
          mlp_act = mlp_act[0].reshape(-1, d_mlp)

          # Forward pass through SAE
          loss, x_reconstruct, latents, l2_loss, nmse, l1_loss, true_l0 = sae_model(mlp_act)
          all_latents.append(latents)

# Combine results, move to cpu before
all_latents = [x.cpu() for x in all_latents]
combined_latents_actually_old = torch.cat(all_latents, dim=0).cpu()
torch.cuda.empty_cache()

In [11]:
latent_id = 989 # or set particular int value in range 0, 4095

# we avoid modifying token_df directly as its very time-consuming to reload if we mess it up
token_df_copy = token_df_new.copy()

# get the activation value for the N-th unit in the SAE for each input in batch
hidden_act_feature_id = combined_latents_actually_old[:, latent_id] # N = feature_id

# add this to the dataframe
token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()

# sort to show the most activating tokens on top, add colours
token_df_copy.sort_values(f"latent-{latent_id}-act", ascending=False).head(20
                                                                           ).style.background_gradient("coolwarm")


Unnamed: 0,seq_id,token_pos,tokens,context,token_annotations,context_annotations,e-value annotation,percentage match,latent-989-act
659273,1288,329,AACAGC,GATCCCCGGCAA |AACAGC| ATTCCAGGTATT,['PuroR'],['PuroR'],[0.],[99.334],8.415592
753091,1471,451,GCTTTT,GCCCGGTACCCA |GCTTTT| GTTCCCTTTAGT,['LSU rRNA bacteria'],['LSU rRNA bacteria'],[0.],[100.],6.22356
289688,565,408,CAAGGA,CGATAAGGACTA |CAAGGA| CGACGATGACAA,['AmpR'],['AmpR'],[0.],[99.768],6.110253
368120,718,504,,||,['special token: '],['dCas9'],,,6.085686
1080959,2114,127,CAAGAT,ACACTTGATAAT |CAAGAT| TTAAATGGTAAA,[],['mEmerald'],[],[],5.900992
659070,1288,126,TGCTAT,CTAATACTGAAA |TGCTAT| ACCCAGCAGATG,['EF-1α promoter'],['EF-1α promoter'],[0.],[99.915],5.754336
659378,1288,434,ATCTGC,CGCCTCGACATC |ATCTGC| CCAGATGCGAAG,"['WPRE', 'P']","['WPRE', 'P']",[0.00e+000 7.12e-149],[100. 97.4],5.540457
973306,1902,506,AGTTTG,TCGCCAGTTAAT |AGTTTG| CGCAACGTTGTT,[],[],[],[],5.514151
772058,1508,474,,||,['special token: '],"['RRE', 'env']",,,5.44254
278207,543,191,CGAACC,TAAACCTTGGTA |CGAACC| CATCTACCTCGG,[],['MRL1'],[],[],5.385663


try shufflign

In [None]:
d_model = cfg["d_model"]
d_mlp = cfg["d_mlp"]
num_layer = 11
batch_size = 64

tokens = tokens_new

# Calculate total number of sequences and tokens
num_sequences = tokens['input_ids'].shape[0]
seq_length = tokens['input_ids'].shape[1]
total_tokens = num_sequences * seq_length

# Create random permutation for shuffling
shuffle_idx = torch.randperm(num_sequences)

# Shuffle both input ids and attention mask
shuffled_input_ids = tokens['input_ids'][shuffle_idx]
shuffled_attention_mask = tokens['attention_mask'][shuffle_idx]

# Calculate number of batches
num_batches = (total_tokens + batch_size - 1) // batch_size

all_latents = []
all_acts = []

# Ensure models are in eval mode
sae_model.eval()
model_nt.eval()

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, total_tokens)

    # Get current batch
    batch_input_ids = shuffled_input_ids[start_idx:end_idx].cuda()
    batch_attention_mask = shuffled_attention_mask[start_idx:end_idx].cuda()

    with torch.no_grad():
        # Get MLP activations
        mlp_act = utils.get_layer_activations(model_nt.cuda(),
                                            batch_input_ids,
                                            batch_attention_mask,
                                            layer_N=num_layer)
        mlp_act = mlp_act[0].reshape(-1, d_mlp)
        all_acts.append(mlp_act)

        # Forward pass through SAE
        loss, x_reconstruct, latents, l2_loss, nmse, l1_loss, true_l0 = sae_model(mlp_act)
        all_latents.append(latents)

# Combine results, move to cpu
all_latents = [x.cpu() for x in all_latents]
combined_latents = torch.cat(all_latents, dim=0).cpu()

# Unshuffle the results to maintain original order
unshuffled_indices = torch.argsort(shuffle_idx)
combined_latents = combined_latents.view(num_sequences, seq_length, -1)[unshuffled_indices]
combined_latents = combined_latents.reshape(-1, combined_latents.size(-1))

torch.cuda.empty_cache()

In [None]:
latent_id = 989 # or set particular int value in range 0, 4095

# we avoid modifying token_df directly as its very time-consuming to reload if we mess it up
token_df_copy_2 = token_df_new.copy()

# get the activation value for the N-th unit in the SAE for each input in batch
hidden_act_feature_id = combined_latents[:, latent_id] # N = feature_id

# add this to the dataframe
token_df_copy_2[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()

# sort to show the most activating tokens on top, add colours
token_df_copy_2.sort_values(f"latent-{latent_id}-act", ascending=False).head(20

                                                                             ).style.background_gradient("coolwarm")

We can now sort and display the top tokens that activate the hidden SAE unit


## Auto-searching monosemantic latents

1. Searching *functional* SAE Latents

In [None]:
def safe_get_annotations(ann_entry):
    if isinstance(ann_entry, str):
        try:
            return eval(ann_entry)
        except:
            return []
    return ann_entry  # already a list

N_latents = 4096
latent_dict = {}
# Then modify the analysis:
for latent_id in range(N_latents):
    hidden_act_feature_id = combined_latents_old[:, latent_id]
    token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()

    most_activating_tokens = token_df_copy.sort_values(f"latent-{latent_id}-act", ascending=False).head(20)

    # Skip if any activations are 0
    if (most_activating_tokens[f"latent-{latent_id}-act"] == 0).any():
        continue

    annotations = [safe_get_annotations(ann) for ann in most_activating_tokens['token_annotations']]

    if annotations:
        annotation_counts = {}
        for ann_list in annotations:
            for ann in ann_list:
                annotation_counts[ann] = annotation_counts.get(ann, 0) + 1

        common_annotations = {ann for ann, count in annotation_counts.items()
                            if count >= 10 and ann not in {'special token: <cls>', 'special token: <pad>'}}

        if common_annotations:
            latent_dict[latent_id] = common_annotations
            print(f"\nLatent {latent_id} appears to detect: {common_annotations}")
            print("Top 20 activating tokens and their annotations:")
            for _, row in most_activating_tokens.iterrows():
                print(f"Token: {row['tokens']}, Annotations: {safe_get_annotations(row['token_annotations'])}, "
                      f"Activation: {row[f'latent-{latent_id}-act']:.3f}")


Latent 22 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: CTCCGC, Annotations: ['C9orf85'], Activation: 18.726
Token: CTCCGC, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 18.590
Token: CTCCGC, Annotations: ['AR'], Activation: 18.153
Token: CTCCGC, Annotations: ['C9orf85', 'large T antigen'], Activation: 17.767
Token: TCCGCC, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 17.682
Token: GACCTC, Annotations: [], Activation: 17.680
Token: TGCAGC, Annotations: ['mvaS', 'C9orf85'], Activation: 16.847
Token: TCTCCG, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 16.507
Token: TCCGCC, Annotations: ['cat', 'PDK intron'], Activation: 16.250
Token: TCCGCC, Annotations: ['BRL1'], Activation: 16.158
Token: TTTACC, Annotations: ['C9orf85'], Activation: 15.866
Token: TCTCCG, Annotations: ['SV40 ori', 'C9orf85', 'SELP_SV40'], Activation: 15.640
Token: TCTCCG, Annotations: ['hGH poly(A) signal'], Activation: 15.124
Token: T

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 276 appears to detect: {'AmpR'}
Top 20 activating tokens and their annotations:
Token: CTTACC, Annotations: ['AmpR', 'f1 ori'], Activation: 10.255
Token: CTTACC, Annotations: ['AmpR'], Activation: 10.095
Token: CTTACC, Annotations: ['Dendra2', 'AmpR'], Activation: 10.010
Token: CTTACC, Annotations: [], Activation: 9.988
Token: CTTACC, Annotations: ['NeoR/KanR'], Activation: 9.987
Token: CTTACC, Annotations: ['AmpR'], Activation: 9.979
Token: CTTACC, Annotations: ['AmpR'], Activation: 9.928
Token: CTTACC, Annotations: ['AmpR'], Activation: 9.912
Token: CTTACC, Annotations: [], Activation: 9.881
Token: CTTACC, Annotations: ['AmpR', 'FBXW5'], Activation: 9.859
Token: CTTACC, Annotations: [], Activation: 9.844
Token: CTTACC, Annotations: ['f1 ori', 'AmpR'], Activation: 9.836
Token: CTTACC, Annotations: [], Activation: 9.796
Token: CTTACC, Annotations: [], Activation: 9.783
Token: CTTACC, Annotations: [], Activation: 9.779
Token: CTTACC, Annotations: [], Activation: 9.761
Token: CTT

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 316 appears to detect: {'ori'}
Top 20 activating tokens and their annotations:
Token: GCCACC, Annotations: ['ori'], Activation: 14.685
Token: GCCACC, Annotations: [], Activation: 14.663
Token: GCCACC, Annotations: [], Activation: 14.396
Token: GCCACC, Annotations: ['P', 'ori', 'WPRE'], Activation: 14.366
Token: GCCACC, Annotations: [], Activation: 14.282
Token: GCCACC, Annotations: [], Activation: 14.190
Token: GCCACC, Annotations: [], Activation: 14.089
Token: GCCACC, Annotations: ['FLPo'], Activation: 13.959
Token: GCCACC, Annotations: ['gag'], Activation: 13.926
Token: GCCACC, Annotations: ['ori'], Activation: 13.902
Token: GCCACC, Annotations: ['ori'], Activation: 13.893
Token: GCCACC, Annotations: ['ori'], Activation: 13.883
Token: GCCACC, Annotations: ['KRT18'], Activation: 13.871
Token: GCCACC, Annotations: ['LEU2', 'ori'], Activation: 13.865
Token: GCCACC, Annotations: ['ori'], Activation: 13.813
Token: GCCACC, Annotations: ['ori'], Activation: 13.590
Token: GCCACC, Ann

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 361 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: CGATTC, Annotations: ['hGH poly(A) signal'], Activation: 20.952
Token: CGTTTC, Annotations: ['RNF8'], Activation: 20.134
Token: CGATTC, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 19.683
Token: CGATTC, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 19.496
Token: GGTTCA, Annotations: ['AR'], Activation: 11.039
Token: TTCTCC, Annotations: ['hGH poly(A) signal'], Activation: 10.756
Token: GGTTCA, Annotations: ['C9orf85', 'large T antigen'], Activation: 10.584
Token: TTCTCC, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 10.527
Token: TTCTCC, Annotations: ['SV40 ori', 'C9orf85', 'SELP_SV40'], Activation: 10.131
Token: GGTTCA, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 10.087
Token: GGTTCA, Annotations: ['C9orf85'], Activation: 10.020
Token: GATTCT, Annotations: [], Activation: 10.009
Token: TGATCT, Annotations: ['hGH poly(A) signal'], 

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 393 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: GCTCAC, Annotations: ['C9orf85'], Activation: 33.275
Token: GCTCAC, Annotations: ['C9orf85'], Activation: 31.454
Token: GCTCAC, Annotations: ['AR'], Activation: 31.158
Token: GCTCAC, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 30.670
Token: GCTCAC, Annotations: ['mvaS', 'C9orf85'], Activation: 29.197
Token: CTCACT, Annotations: ['BRL1'], Activation: 24.464
Token: CTCACT, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 24.375
Token: CTCACT, Annotations: ['cat', 'PDK intron'], Activation: 23.958
Token: CTCACT, Annotations: ['C9orf85', 'int'], Activation: 21.832
Token: CTCACT, Annotations: [], Activation: 20.034
Token: CTCACT, Annotations: ['ori'], Activation: 18.682
Token: CTTACT, Annotations: ['C9orf85'], Activation: 17.025
Token: GGCTCA, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 12.379
Token: TCACTG, Annotations: ['C9orf85', 'hGH poly(A) signal

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 411 appears to detect: {'PuroR'}
Top 20 activating tokens and their annotations:
Token: CGCGCA, Annotations: ['PuroR'], Activation: 11.539
Token: CGCGCA, Annotations: [], Activation: 11.528
Token: CGCGCA, Annotations: ['PuroR'], Activation: 11.527
Token: CGCGCA, Annotations: ['PuroR'], Activation: 11.524
Token: CGCGCA, Annotations: ['CMV intron', 'PuroR', 'T7 promoter'], Activation: 11.523
Token: CGCGCA, Annotations: [], Activation: 11.522
Token: CGCGCA, Annotations: [], Activation: 11.522
Token: CGCGCA, Annotations: ['PuroR'], Activation: 11.522
Token: CGCGCA, Annotations: ['TIAM1', 'PuroR'], Activation: 11.520
Token: CGCGCA, Annotations: ['PuroR'], Activation: 11.517
Token: CGCGCA, Annotations: ['GFPmut3'], Activation: 11.514
Token: CGCGCA, Annotations: ['λ tL3 terminator'], Activation: 11.505
Token: CGCGCA, Annotations: ['PuroR', 'ori'], Activation: 11.504
Token: CGCGCA, Annotations: [], Activation: 11.501
Token: CGCGCA, Annotations: ['PuroR'], Activation: 11.479
Token: CGCG

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 429 appears to detect: {'AmpR'}
Top 20 activating tokens and their annotations:
Token: ATAGTT, Annotations: ['DCLK1'], Activation: 11.840
Token: ATAGTT, Annotations: ['AmpR', 'R1A_SARS2'], Activation: 11.668
Token: ATAGTT, Annotations: [], Activation: 11.378
Token: ATAGTT, Annotations: ['AmpR', 'PuroR'], Activation: 11.332
Token: ATAGTT, Annotations: [], Activation: 11.247
Token: ATAGTT, Annotations: [], Activation: 11.199
Token: ATAGTT, Annotations: ['AmpR'], Activation: 11.173
Token: ATAGTT, Annotations: ['AmpR'], Activation: 11.136
Token: ATAGTT, Annotations: ['bom'], Activation: 11.076
Token: ATAGTT, Annotations: [], Activation: 11.057
Token: ATAGTT, Annotations: ['cyp102A1', 'AmpR'], Activation: 10.992
Token: ATAGTT, Annotations: ['AmpR', 'PuroR'], Activation: 10.971
Token: ATAGTT, Annotations: ['AmpR'], Activation: 10.969
Token: ATAGTT, Annotations: ['AmpR'], Activation: 10.861
Token: ATAGTT, Annotations: ['bla(M)'], Activation: 10.852
Token: ATAGTT, Annotations: ['AmpR']

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 454 appears to detect: {'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: AGGGAC, Annotations: ['CMV enhancer', 'CMV promoter'], Activation: 11.041
Token: AGGGAC, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 11.004
Token: AGGGAC, Annotations: ['CMV enhancer', 'CMV promoter'], Activation: 10.404
Token: AGGGAC, Annotations: ['CMV promoter'], Activation: 10.136
Token: AGGGAC, Annotations: ['CMV enhancer'], Activation: 10.053
Token: AGGGAC, Annotations: ['CMV promoter', 'SRα promoter', "5' LTR (truncated)", 'CMV enhancer'], Activation: 10.039
Token: AGGGAC, Annotations: ['CMV enhancer'], Activation: 9.988
Token: AGGGAC, Annotations: [], Activation: 9.948
Token: AGGGAC, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 9.934
Token: AGGGAC, Annotations: ['CMV promoter'], Activation: 9.929
Token: AGGGAC, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 9.852
Token: AGGGAC, Annotations: ['CMV enhancer'], Activation: 9.829
Token: AGG

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 500 appears to detect: {'AmpR'}
Top 20 activating tokens and their annotations:
Token: <cls>, Annotations: ['special token: <cls>'], Activation: 30.305
Token: <cls>, Annotations: ['special token: <cls>'], Activation: 28.695
Token: CTCTTG, Annotations: ['AmpR'], Activation: 23.564
Token: GTTTAC, Annotations: [], Activation: 22.006
Token: <cls>, Annotations: ['special token: <cls>'], Activation: 21.923
Token: CTTTTC, Annotations: ['AmpR'], Activation: 21.514
Token: ACGGGA, Annotations: ['AmpR'], Activation: 17.947
Token: GCTGTT, Annotations: ['AmpR'], Activation: 17.824
Token: TTCGGG, Annotations: ['AmpR'], Activation: 17.594
Token: CTGAGA, Annotations: ['AmpR'], Activation: 17.370
Token: GCTAAG, Annotations: ['bGH poly(A) signal'], Activation: 17.322
Token: CATTAG, Annotations: ['CHRM4'], Activation: 17.143
Token: <pad>, Annotations: ['special token: <pad>'], Activation: 16.998
Token: GATATA, Annotations: ['CFP'], Activation: 16.979
Token: GCGACC, Annotations: ['AmpR'], Activati

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 506 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: TGCAAT, Annotations: ['C9orf85'], Activation: 18.589
Token: TGCAAT, Annotations: ['AR'], Activation: 17.683
Token: TGCAAT, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 16.740
Token: TGCAAT, Annotations: ['C9orf85', 'large T antigen'], Activation: 16.563
Token: TGCAGC, Annotations: ['mvaS', 'C9orf85'], Activation: 13.327
Token: CTGGAC, Annotations: ['mvaS', 'C9orf85'], Activation: 3.391
Token: GCAACC, Annotations: [], Activation: 2.913
Token: TGCAAT, Annotations: ['ori', 'C9orf85'], Activation: 2.786
Token: CGTAAT, Annotations: ['RNF8'], Activation: 2.687
Token: CACAAT, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 1.950
Token: TCTCCG, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 1.636
Token: CACAAT, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 1.569
Token: CTGCAA, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 1.285
To

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 806 appears to detect: {'SV40 promoter'}
Top 20 activating tokens and their annotations:
Token: TATGCA, Annotations: ['SV40 promoter', 'AR'], Activation: 14.370
Token: ATGCAA, Annotations: ['SV40 promoter'], Activation: 13.521
Token: TATGCA, Annotations: [], Activation: 13.411
Token: ATGCAA, Annotations: ['SV40 promoter', 'PDK intron'], Activation: 13.347
Token: ATGCAA, Annotations: ['SV40 promoter'], Activation: 13.342
Token: ATGCAA, Annotations: ['WPRE', 'SV40 promoter', 'P'], Activation: 13.137
Token: ATGCAA, Annotations: [], Activation: 13.131
Token: TATGCA, Annotations: ['SV40 promoter', 'ZNF571'], Activation: 13.055
Token: TATGCA, Annotations: [], Activation: 13.024
Token: ATGCAA, Annotations: [], Activation: 12.998
Token: TATGCA, Annotations: ['SV40 promoter'], Activation: 12.887
Token: TATGCA, Annotations: ['SV40 promoter', 'AmpR promoter'], Activation: 12.867
Token: ATGCAA, Annotations: ['WPRE', 'SV40 promoter', 'P'], Activation: 12.853
Token: ATGCAA, Annotations: ['SV

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 829 appears to detect: {'AR'}
Top 20 activating tokens and their annotations:
Token: CATCAT, Annotations: [], Activation: 21.132
Token: CATCAT, Annotations: [], Activation: 20.720
Token: CCACCA, Annotations: [], Activation: 20.591
Token: ATCATC, Annotations: [], Activation: 20.589
Token: CAGCAG, Annotations: ['AR'], Activation: 19.951
Token: GCAGCA, Annotations: ['AR'], Activation: 19.941
Token: GCAGCA, Annotations: ['AR'], Activation: 19.822
Token: GCAGCA, Annotations: ['AR'], Activation: 19.773
Token: CAGCAG, Annotations: ['AR'], Activation: 19.750
Token: CATCAT, Annotations: [], Activation: 19.700
Token: GCAGCA, Annotations: ['AR', 'tdTomato'], Activation: 19.694
Token: CATCAT, Annotations: ['capTEV™'], Activation: 19.675
Token: GCAGCA, Annotations: ['AR', 'tdTomato'], Activation: 19.632
Token: CAGCAG, Annotations: ['AR'], Activation: 19.619
Token: CAGCAG, Annotations: ['AR'], Activation: 19.596
Token: GCAGCA, Annotations: ['AR'], Activation: 19.566
Token: GCAGCA, Annotation

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 856 appears to detect: {'TERF2'}
Top 20 activating tokens and their annotations:
Token: AAAAAA, Annotations: [], Activation: 22.745
Token: AAAAAA, Annotations: [], Activation: 21.794
Token: AAAAAA, Annotations: ['purD'], Activation: 19.611
Token: AAAAAA, Annotations: [], Activation: 19.396
Token: AAAAAA, Annotations: [], Activation: 18.815
Token: AAAAAA, Annotations: [], Activation: 18.685
Token: AAAAAA, Annotations: [], Activation: 18.574
Token: AAAAAA, Annotations: [], Activation: 18.460
Token: AAAAAA, Annotations: ['TERF2'], Activation: 18.399
Token: AAAAAA, Annotations: ['purD'], Activation: 18.296
Token: AAAAAA, Annotations: ['TERF2'], Activation: 18.229
Token: AAAAAA, Annotations: ['TERF2'], Activation: 17.744
Token: AAAAAA, Annotations: ['TERF2'], Activation: 17.597
Token: AAAAAA, Annotations: ['TERF2'], Activation: 17.443
Token: AAAAAA, Annotations: ['TERF2'], Activation: 17.339
Token: AAAAAA, Annotations: [], Activation: 17.208
Token: AAAAAA, Annotations: ['TERF2'], Ac

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 877 appears to detect: {'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: GGCAGT, Annotations: ['CMV enhancer'], Activation: 19.949
Token: GGCAGT, Annotations: ['CMV enhancer'], Activation: 19.835
Token: GGCAGT, Annotations: ['pheT', 'CMV enhancer'], Activation: 19.607
Token: GGCAGT, Annotations: ['CMV enhancer'], Activation: 19.438
Token: GGCAGT, Annotations: ['TLX3'], Activation: 19.413
Token: GGCAGT, Annotations: [], Activation: 19.334
Token: GGCAGT, Annotations: ['CMV enhancer'], Activation: 19.312
Token: GGCAGT, Annotations: ['CMV enhancer'], Activation: 19.266
Token: GGCAGT, Annotations: [], Activation: 19.244
Token: GGCAGT, Annotations: ['CMV enhancer'], Activation: 19.206
Token: GGCAGT, Annotations: ['CMV enhancer'], Activation: 19.185
Token: GGCAGT, Annotations: [], Activation: 19.178
Token: GGCAGT, Annotations: ['int'], Activation: 19.176
Token: GGCAGT, Annotations: ['kanMX', 'CMV enhancer'], Activation: 19.166
Token: GGCAGT, Annotations: ['CMV en

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 928 appears to detect: {'hGH poly(A) signal'}
Top 20 activating tokens and their annotations:
Token: CATGCA, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 21.722
Token: CATGCA, Annotations: ['hGH poly(A) signal'], Activation: 21.270
Token: CATGCA, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 20.905
Token: CGTGAA, Annotations: ['hGH poly(A) signal'], Activation: 16.952
Token: CGTGAA, Annotations: ['hGH poly(A) signal'], Activation: 16.521
Token: CGTGAA, Annotations: [], Activation: 16.411
Token: CATGAG, Annotations: [], Activation: 15.051
Token: CGCCTG, Annotations: ['RNF8'], Activation: 14.627
Token: TGCATG, Annotations: ['S'], Activation: 12.007
Token: TGTGAG, Annotations: [], Activation: 9.384
Token: GTTTCA, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 6.140
Token: CACAAT, Annotations: ['hGH poly(A) signal'], Activation: 5.713
Token: GTTTCA, Annotations: ['hGH poly(A) signal'], Activation: 5.351
Token: CGTAAT, Annotations: ['RNF8'],

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 947 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: AGCGAT, Annotations: ['AR'], Activation: 28.338
Token: AGCGAT, Annotations: ['C9orf85', 'large T antigen'], Activation: 28.171
Token: AGCGAT, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 27.591
Token: AGCGAT, Annotations: ['C9orf85'], Activation: 26.930
Token: AGCCAT, Annotations: ['mvaS', 'C9orf85'], Activation: 25.139
Token: AAGCGA, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 18.282
Token: AAGCGA, Annotations: ['SV40 ori', 'C9orf85', 'SELP_SV40'], Activation: 18.093
Token: AAATGA, Annotations: ['S'], Activation: 18.075
Token: AAGCGA, Annotations: ['hGH poly(A) signal'], Activation: 17.432
Token: AGTGAT, Annotations: [], Activation: 16.049
Token: GCGTTT, Annotations: ['C9orf85', 'int'], Activation: 12.545
Token: GCGATT, Annotations: [], Activation: 11.692
Token: CGTTTC, Annotations: ['RNF8'], Activation: 11.636
Token: CGATTC, Annotations: ['hGH poly(A) signa

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 957 appears to detect: {'SmR'}
Top 20 activating tokens and their annotations:
Token: ACATCA, Annotations: ['SmR'], Activation: 21.902
Token: ACATCA, Annotations: ['SmR'], Activation: 20.891
Token: ACATCA, Annotations: [], Activation: 20.084
Token: ACATCA, Annotations: ['SmR'], Activation: 19.871
Token: ACATCA, Annotations: [], Activation: 19.649
Token: ACATCA, Annotations: ['SmR'], Activation: 19.639
Token: ACATCA, Annotations: [], Activation: 19.484
Token: ACATCA, Annotations: [], Activation: 19.204
Token: ACATCA, Annotations: ['SmR'], Activation: 17.735
Token: ACATCA, Annotations: ['aadA', 'RT86_ECOLX'], Activation: 16.330
Token: ACATCA, Annotations: [], Activation: 16.312
Token: ACATCA, Annotations: ['SmR'], Activation: 15.441
Token: AAGTCA, Annotations: ['SmR'], Activation: 13.295
Token: AAGTCA, Annotations: ['SmR'], Activation: 13.064
Token: AAGTCA, Annotations: [], Activation: 12.631
Token: AAGTCA, Annotations: ['SmR'], Activation: 12.411
Token: AAGTCA, Annotations: ['Sm

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 958 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: AGCCAT, Annotations: ['mvaS', 'C9orf85'], Activation: 16.547
Token: GCGATT, Annotations: ['C9orf85'], Activation: 16.460
Token: GCGATT, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 15.153
Token: GCGATT, Annotations: [], Activation: 14.820
Token: GCGATT, Annotations: ['BRL1'], Activation: 14.430
Token: AGCGAT, Annotations: ['C9orf85', 'large T antigen'], Activation: 14.204
Token: AGCGAT, Annotations: ['AR'], Activation: 13.783
Token: GCGATT, Annotations: ['cat', 'PDK intron'], Activation: 13.763
Token: AGCGAT, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 13.479
Token: GCCATT, Annotations: ['ori'], Activation: 13.300
Token: AGCGAT, Annotations: ['C9orf85'], Activation: 13.247
Token: GCGTTT, Annotations: ['C9orf85', 'int'], Activation: 12.816
Token: TCCTGC, Annotations: ['hGH poly(A) signal'], Activation: 12.579
Token: TCCTGC, Annotations: ['RNF8'], Activation: 1

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 967 appears to detect: {'CMV IE94 promoter'}
Top 20 activating tokens and their annotations:
Token: CGTCAA, Annotations: ['DNApol-gamma35', 'CMV enhancer', 'CMV IE94 promoter'], Activation: 18.904
Token: CGTCAA, Annotations: [], Activation: 18.764
Token: CGTCAA, Annotations: ['lldR'], Activation: 18.715
Token: CGTCAA, Annotations: ['Act5C', 'CMV IE94 promoter'], Activation: 18.459
Token: CGTCAA, Annotations: ['CMV IE94 promoter'], Activation: 18.389
Token: CGTCAA, Annotations: ['Superfolder GFP'], Activation: 18.286
Token: CGTCAA, Annotations: ['CMV IE94 promoter', 'SV2C'], Activation: 18.161
Token: CGTCAA, Annotations: ['CMV IE94 promoter', 'CMV enhancer'], Activation: 18.120
Token: CGTCAA, Annotations: [], Activation: 18.085
Token: CGTCAA, Annotations: ['lacZ'], Activation: 18.084
Token: CGTCAA, Annotations: ['ZNF470'], Activation: 18.018
Token: CGTCAA, Annotations: ['CMV IE94 promoter', 'EGFP'], Activation: 17.996
Token: CGTCAA, Annotations: [], Activation: 17.974
Token: CGT

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1025 appears to detect: {'CMV IE94 promoter', 'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: CCCTAT, Annotations: [], Activation: 12.529
Token: CCCTAT, Annotations: [], Activation: 12.265
Token: CCCTAT, Annotations: ['CMV IE94 promoter', 'CMV enhancer'], Activation: 12.194
Token: CCCTAT, Annotations: ['mClover3'], Activation: 12.127
Token: CCCTAT, Annotations: [], Activation: 12.123
Token: CCCTAT, Annotations: ['CMV enhancer'], Activation: 12.062
Token: CCCTAT, Annotations: ['SUP35', 'CMV IE94 promoter'], Activation: 11.948
Token: CCCTAT, Annotations: ['E', 'CMV enhancer'], Activation: 11.860
Token: CCCTAT, Annotations: ['CMV enhancer', 'CMV IE94 promoter'], Activation: 11.843
Token: CCCTAT, Annotations: ['CMV enhancer', 'gag-pol'], Activation: 11.802
Token: CCCTAT, Annotations: ['avrXa10'], Activation: 11.797
Token: CCCTAT, Annotations: ['CMV enhancer'], Activation: 11.607
Token: CCCTAT, Annotations: ['CMV IE94 promoter'], Activation: 11.568
Token: CCCT

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1094 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: GGCACA, Annotations: ['C9orf85'], Activation: 21.211
Token: GGCACA, Annotations: ['C9orf85'], Activation: 20.257
Token: GGCACA, Annotations: ['AR'], Activation: 18.951
Token: GGCACA, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 18.857
Token: GGCACA, Annotations: ['mvaS', 'C9orf85'], Activation: 16.686
Token: GCACAA, Annotations: ['BRL1'], Activation: 14.821
Token: GCACAA, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 14.625
Token: GCACAA, Annotations: ['cat', 'PDK intron'], Activation: 14.317
Token: GCGTAA, Annotations: ['C9orf85', 'int'], Activation: 13.207
Token: GCGCGA, Annotations: ['ori'], Activation: 11.780
Token: TTGCGA, Annotations: [], Activation: 10.483
Token: GGGCAA, Annotations: ['C9orf85'], Activation: 9.985
Token: GCTCAC, Annotations: ['C9orf85'], Activation: 9.713
Token: GCTCAC, Annotations: ['C9orf85'], Activation: 9.466
Token: GCTCAC, Annotati

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1126 appears to detect: {'MTOR'}
Top 20 activating tokens and their annotations:
Token: T, Annotations: ['P'], Activation: 13.204
Token: N, Annotations: ['MTOR'], Activation: 13.023
Token: N, Annotations: ['attL2'], Activation: 12.826
Token: N, Annotations: ['MTOR'], Activation: 12.692
Token: T, Annotations: [], Activation: 12.560
Token: T, Annotations: ['MTOR'], Activation: 12.529
Token: T, Annotations: ['MTOR'], Activation: 12.495
Token: T, Annotations: [], Activation: 12.361
Token: N, Annotations: ['MTOR', 'lacZ'], Activation: 12.320
Token: <cls>, Annotations: ['special token: <cls>'], Activation: 12.271
Token: N, Annotations: [], Activation: 12.152
Token: N, Annotations: ['MTOR'], Activation: 12.144
Token: G, Annotations: ['attL2'], Activation: 12.139
Token: T, Annotations: ['MTOR'], Activation: 12.106
Token: T, Annotations: ['MTOR', 'trpS'], Activation: 12.080
Token: N, Annotations: ['MTOR', 'trpS'], Activation: 12.037
Token: N, Annotations: ['P'], Activation: 12.013
Token

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1133 appears to detect: {'C9orf85'}
Top 20 activating tokens and their annotations:
Token: GTGCAG, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 15.567
Token: GCACAA, Annotations: ['hGH poly(A) signal', 'C9orf85'], Activation: 15.433
Token: GCACAA, Annotations: ['BRL1'], Activation: 14.746
Token: GCACAA, Annotations: ['cat', 'PDK intron'], Activation: 14.632
Token: GTGCAG, Annotations: ['hGH poly(A) signal'], Activation: 13.317
Token: GTGCAG, Annotations: ['SV40 ori', 'C9orf85'], Activation: 13.070
Token: GCGTAA, Annotations: ['C9orf85', 'int'], Activation: 12.093
Token: GTGCAG, Annotations: ['S'], Activation: 10.924
Token: CAGTGG, Annotations: ['RNF8'], Activation: 10.112
Token: CAGTGG, Annotations: ['hGH poly(A) signal'], Activation: 9.854
Token: CAGTGG, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 9.719
Token: CAGTGG, Annotations: ['C9orf85', 'hGH poly(A) signal'], Activation: 9.589
Token: CTGGAG, Annotations: ['AR'], Activation: 9.055
Token: CTG

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1136 appears to detect: {'AmpR promoter'}
Top 20 activating tokens and their annotations:
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 12.301
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 12.170
Token: GCGGAA, Annotations: [], Activation: 12.039
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 11.839
Token: GCGGAA, Annotations: ['AmpR promoter', 'GST26_SCHJA'], Activation: 11.746
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 11.716
Token: GCGGAA, Annotations: ['f1 ori'], Activation: 11.535
Token: GCGGAA, Annotations: ['AmpR promoter', 'gag (truncated)'], Activation: 11.504
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 11.484
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 11.467
Token: GCGGAA, Annotations: ['AmpR promoter', 'env'], Activation: 11.444
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 11.166
Token: GCGGAA, Annotations: ['AmpR promoter'], Activation: 11.147
Token: GCGGAA, Annotation

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1144 appears to detect: {'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: GATTAT, Annotations: ['MCS', 'CMV enhancer'], Activation: 14.661
Token: GATTAT, Annotations: ['penA', 'lacI', 'CMV enhancer'], Activation: 14.593
Token: ATTGAC, Annotations: ['mTurquoise2', 'CMV enhancer'], Activation: 14.027
Token: GATTAT, Annotations: ['CMV enhancer'], Activation: 13.693
Token: GATTAT, Annotations: ['CMV enhancer'], Activation: 13.496
Token: GATTAT, Annotations: ['CMV enhancer', 'PRCC'], Activation: 13.424
Token: GATTAT, Annotations: ['CMV enhancer', 'CMV intron'], Activation: 13.423
Token: GATTAT, Annotations: ['CMV enhancer'], Activation: 13.390
Token: GATTAT, Annotations: ['lacI'], Activation: 13.350
Token: GATTAT, Annotations: ['CMV enhancer'], Activation: 13.173
Token: GATTAT, Annotations: ['TEF1 promoter'], Activation: 13.037
Token: GATTAT, Annotations: ['3xHA'], Activation: 12.998
Token: GATTAT, Annotations: ['CMV enhancer', 'CAP binding site'], Activation: 

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1182 appears to detect: {'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: CATTGA, Annotations: ['YPet', 'CMV enhancer'], Activation: 22.274
Token: CATTGA, Annotations: ['CMV enhancer'], Activation: 22.221
Token: CATTGA, Annotations: ['CMV enhancer', 'Tln1'], Activation: 22.152
Token: CATTGA, Annotations: [], Activation: 22.078
Token: CATTGA, Annotations: ['CMV enhancer', 'CMV enhancer', 'CMV promoter'], Activation: 22.068
Token: CATTGA, Annotations: ['CMV enhancer'], Activation: 22.048
Token: CATTGA, Annotations: ['CMV enhancer'], Activation: 22.041
Token: CATTGA, Annotations: ['CMV enhancer'], Activation: 21.912
Token: CATTGA, Annotations: [], Activation: 21.896
Token: CATTGA, Annotations: ['CMV enhancer'], Activation: 21.883
Token: CATTGA, Annotations: ['CMV enhancer', 'CMV enhancer'], Activation: 21.861
Token: CATTGA, Annotations: ['mCherry', 'CMV enhancer'], Activation: 21.859
Token: CATTGA, Annotations: [], Activation: 21.828
Token: CATTGA, Annotation

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1193 appears to detect: {'Nfkb1'}
Top 20 activating tokens and their annotations:
Token: N, Annotations: ['Nfkb1'], Activation: 23.356
Token: N, Annotations: ['Nfkb1'], Activation: 23.267
Token: N, Annotations: ['TET1'], Activation: 23.185
Token: N, Annotations: ['Nfkb1'], Activation: 22.970
Token: N, Annotations: ['Nfkb1'], Activation: 22.848
Token: N, Annotations: ['Nfkb1'], Activation: 22.840
Token: N, Annotations: ['TET1'], Activation: 22.800
Token: N, Annotations: ['TET1'], Activation: 22.794
Token: N, Annotations: ['TET1'], Activation: 22.789
Token: N, Annotations: ['TET1'], Activation: 22.761
Token: N, Annotations: ['TET1'], Activation: 22.655
Token: N, Annotations: ['Nfkb1'], Activation: 22.559
Token: N, Annotations: ['Nfkb1'], Activation: 22.548
Token: N, Annotations: ['TET1'], Activation: 22.464
Token: N, Annotations: ['Nfkb1'], Activation: 22.441
Token: N, Annotations: ['TET1'], Activation: 22.301
Token: N, Annotations: ['TET1'], Activation: 22.232
Token: N, Annotati

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1238 appears to detect: {'LTR'}
Top 20 activating tokens and their annotations:
Token: GGAGAA, Annotations: ['UL126'], Activation: 25.111
Token: GCAGTT, Annotations: ['LTR'], Activation: 10.552
Token: AGCAGT, Annotations: [], Activation: 10.345
Token: AGCAGT, Annotations: ['LTR'], Activation: 9.572
Token: AGCAGT, Annotations: [], Activation: 9.559
Token: AGCAGT, Annotations: ['LTR'], Activation: 9.324
Token: GGCAGA, Annotations: [], Activation: 9.224
Token: AGCAGT, Annotations: ['LTR'], Activation: 9.224
Token: AGCAGT, Annotations: ['PDK intron'], Activation: 8.575
Token: AGCAGT, Annotations: ['Abd-B', 'LTR'], Activation: 8.196
Token: GCAGTT, Annotations: ['LTR'], Activation: 7.620
Token: GAATCA, Annotations: ['C9orf85'], Activation: 7.257
Token: GCAGTT, Annotations: ['kanMX'], Activation: 7.056
Token: GCAGTT, Annotations: ['LTR'], Activation: 6.789
Token: CAGTTC, Annotations: ['LTR', 'Zbtb44'], Activation: 6.610
Token: CAGTTC, Annotations: [], Activation: 6.610
Token: GCAGTT, 

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1240 appears to detect: {'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: TGACCT, Annotations: [], Activation: 19.563
Token: TGACCT, Annotations: ['CMV enhancer'], Activation: 19.524
Token: TGACCT, Annotations: ['CMV enhancer', 'CMV promoter'], Activation: 19.409
Token: TGACCT, Annotations: ['CMV enhancer', 'insA', 'IS1'], Activation: 19.296
Token: TGACCT, Annotations: [], Activation: 19.259
Token: TGACCT, Annotations: ['CMV enhancer'], Activation: 19.205
Token: TGACCT, Annotations: ['CMV enhancer'], Activation: 19.189
Token: TGACCT, Annotations: ['CMV enhancer'], Activation: 19.173
Token: TGACCT, Annotations: ['CMV enhancer'], Activation: 19.159
Token: TGACCT, Annotations: ['CMV enhancer'], Activation: 19.045
Token: TGACCT, Annotations: [], Activation: 19.013
Token: TGACCT, Annotations: [], Activation: 18.952
Token: TGACCT, Annotations: ['CMV enhancer'], Activation: 18.934
Token: TGACCT, Annotations: [], Activation: 18.917
Token: TGACCT, Annotations: ['or

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1251 appears to detect: {'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: CTTTCC, Annotations: [], Activation: 12.613
Token: CTTTCC, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 12.473
Token: <cls>, Annotations: ['special token: <cls>'], Activation: 12.386
Token: CTTTCC, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 12.326
Token: CTTTCC, Annotations: ['ori', 'CMV enhancer', 'CMV promoter'], Activation: 12.283
Token: CTTTCC, Annotations: ['GAL1 promoter', 'CMV enhancer'], Activation: 12.124
Token: CTTTCC, Annotations: ['CMV enhancer', 'CMV promoter'], Activation: 12.101
Token: CTTTCC, Annotations: ['CMV enhancer', 'NUDT22'], Activation: 12.015
Token: CTTTCC, Annotations: [], Activation: 12.014
Token: CTTTCC, Annotations: ['CMV enhancer'], Activation: 12.003
Token: CTTTCC, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 11.970
Token: CTTTCC, Annotations: ['CMV enhancer', 'ARAF'], Activation: 11.960
Token: CTTTCC, Annotat

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1265 appears to detect: {'SV40 promoter'}
Top 20 activating tokens and their annotations:
Token: CTCCCC, Annotations: ['SV40 promoter'], Activation: 16.143
Token: CTCCCC, Annotations: [], Activation: 15.914
Token: CTCCCC, Annotations: [], Activation: 15.828
Token: CTCCCC, Annotations: [], Activation: 15.564
Token: CTCCCC, Annotations: ['SV40 promoter'], Activation: 15.529
Token: CTCCCC, Annotations: [], Activation: 15.412
Token: CTCCCC, Annotations: ['SV40 promoter'], Activation: 15.336
Token: CTCCCC, Annotations: [], Activation: 15.272
Token: CTCCCC, Annotations: [], Activation: 15.084
Token: CTCCCC, Annotations: ['IGHG1', 'SV40 promoter'], Activation: 15.070
Token: CTCCCC, Annotations: ['SV40 promoter'], Activation: 14.917
Token: CTCCCC, Annotations: ['SV40 promoter'], Activation: 14.890
Token: CTCCCC, Annotations: ['SV40 promoter'], Activation: 14.879
Token: CTCCCC, Annotations: ['SV40 promoter', 'ZNF571'], Activation: 14.541
Token: CTCCCC, Annotations: [], Activation: 14.38

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1366 appears to detect: {'CMV promoter'}
Top 20 activating tokens and their annotations:
Token: GGAGTT, Annotations: ['WNT2B', 'CMV promoter'], Activation: 14.747
Token: GGAGTT, Annotations: ['UL126', 'NR1D2'], Activation: 14.694
Token: GGAGTT, Annotations: ['CMV promoter'], Activation: 14.686
Token: GGAGTT, Annotations: ['Txndc9', 'UL126'], Activation: 14.405
Token: GGAGTT, Annotations: ['CMV promoter'], Activation: 14.400
Token: GGAGTT, Annotations: ['UL126', 'CMV promoter'], Activation: 14.397
Token: GGAGTT, Annotations: [], Activation: 14.325
Token: GGAGTT, Annotations: ['icaR'], Activation: 14.323
Token: GGAGTT, Annotations: ['EGFP', 'UL126', 'CMV promoter'], Activation: 14.282
Token: GGAGTT, Annotations: ['UL126'], Activation: 14.257
Token: GGAGTT, Annotations: ['UL126', 'CMV promoter', 'rtTA3'], Activation: 14.243
Token: GGAGTT, Annotations: ['UL126'], Activation: 14.231
Token: GGAGTT, Annotations: ['UL126', 'CMV promoter'], Activation: 14.196
Token: GGAGTT, Annotations:

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1394 appears to detect: {'EF-1α promoter'}
Top 20 activating tokens and their annotations:
Token: GTTTCC, Annotations: ['kanMX', 'EF-1α promoter', 'EF-1α intron A'], Activation: 11.198
Token: GTTTCC, Annotations: ['EF-1α promoter'], Activation: 11.167
Token: GTTTCC, Annotations: ['TIAM1', 'EF-1α promoter', 'EF-1α intron A'], Activation: 11.146
Token: GTTTCC, Annotations: [], Activation: 10.885
Token: GTTTCC, Annotations: ['EF-1α intron A', 'EF-1α promoter'], Activation: 10.874
Token: GTTTCC, Annotations: ['EF-1α promoter', 'EF-1α intron A'], Activation: 10.865
Token: GTTTCC, Annotations: ['EF-1α promoter', 'EGFP'], Activation: 10.818
Token: GTTTCC, Annotations: ['EF-1α intron A', 'cat'], Activation: 10.800
Token: GTTTCC, Annotations: ['araBAD promoter', 'EF-1α promoter'], Activation: 10.792
Token: GTTTCC, Annotations: ['EF-1α intron A'], Activation: 10.786
Token: GTTTCC, Annotations: ['NXF3'], Activation: 10.773
Token: GTTTCC, Annotations: ['EF-1α promoter'], Activation: 10.772

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1411 appears to detect: {'SV40 promoter'}
Top 20 activating tokens and their annotations:
Token: ATTAAT, Annotations: ['AmpR', 'penA'], Activation: 11.106
Token: AATTAG, Annotations: ['SV40 promoter'], Activation: 10.683
Token: AATTAG, Annotations: ['SV40 promoter'], Activation: 10.669
Token: AATTAG, Annotations: ['SV40 promoter'], Activation: 10.538
Token: AATTAG, Annotations: ['SV40 promoter'], Activation: 10.354
Token: AATTAG, Annotations: ['SV40 promoter'], Activation: 10.307
Token: AATTAG, Annotations: ['SV40 promoter', 'PDK intron'], Activation: 10.260
Token: AATTAG, Annotations: ['SV40 promoter'], Activation: 10.044
Token: ATTAAT, Annotations: ['UL126'], Activation: 10.041
Token: AATTAG, Annotations: ['SV40 promoter'], Activation: 10.009
Token: ATTAAT, Annotations: [], Activation: 9.969
Token: AATTAG, Annotations: ['SV40 promoter', 'SEC31A'], Activation: 9.896
Token: AATTAG, Annotations: ['SV40 promoter', '5X UAS'], Activation: 9.892
Token: AATTAG, Annotations: ['bla'], 

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1413 appears to detect: {'CMV promoter'}
Top 20 activating tokens and their annotations:
Token: ACTTTC, Annotations: ['CMV promoter', 'UL126', 'M13 ori'], Activation: 15.515
Token: ACTTTC, Annotations: ['CMV promoter'], Activation: 14.837
Token: ACTTTC, Annotations: ['WNT2B', 'CMV promoter'], Activation: 14.760
Token: ACTTTC, Annotations: ['UL126'], Activation: 14.731
Token: ACTTTC, Annotations: ['UL126', 'CMV promoter'], Activation: 14.654
Token: ACTTTC, Annotations: ['CMV promoter', 'Gja1'], Activation: 14.647
Token: ACTTTC, Annotations: ['CMV promoter', 'PuroR'], Activation: 14.604
Token: CTTTCC, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 14.593
Token: ACTTTC, Annotations: ['CMV promoter', 'UL126', 'ori'], Activation: 14.583
Token: ACTTTC, Annotations: ['CMV enhancer'], Activation: 14.575
Token: ACTTTC, Annotations: ['CMV promoter', 'UL126'], Activation: 14.524
Token: ACTTTC, Annotations: ['UL126', 'NR1D2'], Activation: 14.522
Token: ACTTTC, Annotations: ['CM

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1445 appears to detect: {'CMV promoter', 'UL126'}
Top 20 activating tokens and their annotations:
Token: TGGGAG, Annotations: ['CMV promoter', 'UL126'], Activation: 21.727
Token: TGGGAG, Annotations: ['UL126', 'CMV promoter', 'AmpR'], Activation: 21.440
Token: TGGGAG, Annotations: ['CMV promoter', 'UL126'], Activation: 21.428
Token: TGGGAG, Annotations: ['cre', 'UL126', 'CMV promoter'], Activation: 21.358
Token: TGGGAG, Annotations: ['CMV promoter', 'lldR'], Activation: 21.337
Token: TGGGAG, Annotations: ['UL126', 'lacI'], Activation: 21.304
Token: TGGGAG, Annotations: [], Activation: 21.266
Token: TGGGAG, Annotations: ['f1 ori'], Activation: 21.264
Token: TGGGAG, Annotations: ['CMV promoter'], Activation: 21.259
Token: TGGGAG, Annotations: ['CMV promoter'], Activation: 21.210
Token: TGGGAG, Annotations: ['GPKOW', 'CMV promoter'], Activation: 21.189
Token: TGGGAG, Annotations: ['UL126'], Activation: 21.177
Token: TGGGAG, Annotations: ['CMV promoter', 'UL126'], Activation: 21.17

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1475 appears to detect: {'UL126'}
Top 20 activating tokens and their annotations:
Token: TTTGTT, Annotations: ['FBL'], Activation: 13.610
Token: TTTGTT, Annotations: [], Activation: 13.256
Token: TTTGTT, Annotations: ['UL126'], Activation: 13.219
Token: TTTGTT, Annotations: ['CMV promoter', 'UL126'], Activation: 13.091
Token: TTTGTT, Annotations: ['CMV promoter', 'UL126'], Activation: 12.957
Token: TTTGTT, Annotations: ['ybhC', 'CMV promoter', 'UL126'], Activation: 12.563
Token: TTTGTT, Annotations: ['cre', 'UL126', 'CMV promoter'], Activation: 12.538
Token: TTTGTT, Annotations: ['UL126', 'cpCitrine'], Activation: 12.500
Token: TTTGTT, Annotations: ['UL126', 'CMV promoter'], Activation: 12.153
Token: TTTGTT, Annotations: ['CMV promoter', 'UL126'], Activation: 12.110
Token: TTTGTT, Annotations: [], Activation: 12.090
Token: TTTGTT, Annotations: [], Activation: 12.027
Token: TTTGTT, Annotations: ['kanMX', 'UL126'], Activation: 12.022
Token: TTTGTT, Annotations: ['PHGDH'], Activat

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1499 appears to detect: {'AmpR'}
Top 20 activating tokens and their annotations:
Token: AGAGTA, Annotations: [], Activation: 12.982
Token: AGAGTA, Annotations: ['AmpR'], Activation: 12.903
Token: AGAGTA, Annotations: ['AmpR'], Activation: 12.652
Token: AGAGTA, Annotations: ['ori'], Activation: 12.574
Token: AGAGTA, Annotations: ['AmpR'], Activation: 12.529
Token: AGAGTA, Annotations: ['AmpR'], Activation: 12.478
Token: AGAGTA, Annotations: ['AmpR', 'CHEK1'], Activation: 12.249
Token: AGAGTA, Annotations: ['AmpR'], Activation: 11.990
Token: AGAGTA, Annotations: ['AmpR'], Activation: 11.983
Token: AGAGTA, Annotations: ['AmpR promoter', 'AmpR'], Activation: 11.970
Token: AGAGTA, Annotations: ['CMV enhancer'], Activation: 11.943
Token: AGAGTA, Annotations: [], Activation: 11.907
Token: AGAGTA, Annotations: [], Activation: 11.857
Token: AGAGTA, Annotations: ['TPSD2_PICSI'], Activation: 11.834
Token: AGAGTA, Annotations: ['lacI', 'AmpR'], Activation: 11.825
Token: AGAGTA, Annotations

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1510 appears to detect: {'ori'}
Top 20 activating tokens and their annotations:
Token: TGGCTT, Annotations: ['Htr6', 'ori'], Activation: 10.618
Token: TGGCTT, Annotations: ['CMV promoter', 'ori'], Activation: 10.312
Token: TGGCTT, Annotations: ['dauA', 'RNAI', 'ori'], Activation: 10.295
Token: TGGCTT, Annotations: ['RNAI', 'ori', 'attR1'], Activation: 10.194
Token: TGGCTT, Annotations: ['neo', 'ori'], Activation: 9.941
Token: TGGCTT, Annotations: ['RNAI'], Activation: 9.881
Token: TGGCTT, Annotations: ['RNAI', 'ori'], Activation: 9.879
Token: TGGCTT, Annotations: ['ori'], Activation: 9.859
Token: TGGCTT, Annotations: ['ori'], Activation: 9.845
Token: TGGCTT, Annotations: ['RNAI'], Activation: 9.820
Token: TGGCTT, Annotations: ['SV40 intron'], Activation: 9.783
Token: TGGCTT, Annotations: [], Activation: 9.781
Token: TGGCTT, Annotations: ['U6 promoter'], Activation: 9.749
Token: TGGCTT, Annotations: [], Activation: 9.719
Token: TGGCTT, Annotations: ['SHOX2'], Activation: 9.690
T

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1535 appears to detect: {'CMV enhancer'}
Top 20 activating tokens and their annotations:
Token: GTCAAT, Annotations: ['HNRNPA2B1'], Activation: 10.425
Token: GTCAAT, Annotations: [], Activation: 10.376
Token: GTCAAT, Annotations: ['CMV enhancer', 'AmpR promoter'], Activation: 10.317
Token: GTCAAT, Annotations: ['CMV promoter', 'int'], Activation: 10.016
Token: GTCAAT, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 9.954
Token: GTCAAT, Annotations: ['GPR156', 'CMV enhancer'], Activation: 9.869
Token: GTCAAT, Annotations: ['CMV enhancer', 'CMV IE94 promoter'], Activation: 9.865
Token: GTCAAT, Annotations: [], Activation: 9.838
Token: GTCAAT, Annotations: [], Activation: 9.748
Token: TGAATG, Annotations: ['Csf2'], Activation: 9.717
Token: GTCAAT, Annotations: ['kanMX', 'MRL1', 'CMV enhancer'], Activation: 9.634
Token: GTCAAT, Annotations: ['CMV enhancer', 'CMV promoter'], Activation: 9.618
Token: GTCAAT, Annotations: ['CMV promoter', 'CMV enhancer'], Activation: 9.574


  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df


Latent 1589 appears to detect: {'SV40 promoter'}
Top 20 activating tokens and their annotations:
Token: TCTCAA, Annotations: ['SV40 promoter', 'SV40 ori'], Activation: 17.358
Token: TCTCAA, Annotations: [], Activation: 16.277
Token: TCTCAA, Annotations: ['SV40 promoter'], Activation: 16.258
Token: TCTCAA, Annotations: [], Activation: 16.172
Token: TCTCAA, Annotations: ['SV40 promoter', 'AmpR'], Activation: 16.132
Token: TCTCAA, Annotations: ['SV40 promoter'], Activation: 16.116
Token: TCTCAA, Annotations: ['SV40 promoter'], Activation: 15.821
Token: TCAATT, Annotations: ['SV40 promoter'], Activation: 15.750
Token: TCTCAA, Annotations: [], Activation: 15.741
Token: TCTCAA, Annotations: [], Activation: 15.691
Token: TCAATT, Annotations: ['bGH poly(A) signal', 'SV40 promoter'], Activation: 15.622
Token: TCTCAA, Annotations: ['GST26_SCHJA'], Activation: 15.589
Token: TCTCAA, Annotations: ['GST26_SCHJA'], Activation: 15.554
Token: TCTCAA, Annotations: ['SV40 promoter', 'AmpR'], Activation:

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()



Latent 1593 appears to detect: {'TERF2'}
Top 20 activating tokens and their annotations:
Token: AAAAAA, Annotations: [], Activation: 23.724
Token: AAAAAA, Annotations: [], Activation: 23.721
Token: AAAAAA, Annotations: ['TERF2'], Activation: 23.378
Token: AAAAAA, Annotations: ['TERF2'], Activation: 23.355
Token: AAAAAA, Annotations: ['TERF2'], Activation: 23.287
Token: AAAAAA, Annotations: ['TERF2'], Activation: 23.278
Token: AAAAAA, Annotations: ['TERF2'], Activation: 23.240
Token: AAAAAA, Annotations: ['TERF2'], Activation: 22.879
Token: AAAAAA, Annotations: ['TERF2'], Activation: 22.879
Token: AAAAAA, Annotations: ['TERF2'], Activation: 22.878
Token: AAAAAA, Annotations: ['TERF2'], Activation: 22.633
Token: AAAAAA, Annotations: ['TERF2'], Activation: 22.615
Token: GGGGGG, Annotations: ['EF-1α intron A'], Activation: 22.566
Token: GGGGGG, Annotations: ['chicken β-actin promoter'], Activation: 22.250
Token: GGGGGG, Annotations: ['HDAC5', 'chicken β-actin promoter'], Activation: 22.09

  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
  token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()


2. Searching *syntactic* SAE latents

In [None]:
import pandas as pd
from tqdm import tqdm

## here we create a short list of candidate monosemantic latents for **kmers** by looking at the top-50 most activating tokens
## and asking: do at least half of them share a kmer?

def analyze_latent_features_fast(token_df, combined_latents, k=4, n_latents=4096, top_n=10):
    """Optimized version of latent feature analysis"""

    def get_kmers(token, k):
        if not isinstance(token, str) or k <= 0:
            return set()
        token = token.strip()
        if not token or len(token) < k:
            return set()
        return {token[i:i+k] for i in range(len(token) - k + 1)}

    # Convert combined_latents to numpy once
    activations_array = combined_latents.cpu().detach().numpy()

    # Pre-compute valid tokens and their k-mers
    valid_tokens = token_df['tokens'].dropna()
    token_to_kmers = {token: get_kmers(str(token), k) for token in valid_tokens}

    latent_dict = {}
    tokens_array = token_df['tokens'].values

    # Process in batches for better memory usage
    batch_size = 100
    for batch_start in tqdm(range(0, n_latents, batch_size)):
        batch_end = min(batch_start + batch_size, n_latents)

        # Process batch of latents
        batch_activations = activations_array[:, batch_start:batch_end]

        # Find top_n indices for each latent in batch
        top_indices = np.argpartition(-batch_activations, top_n, axis=0)[:top_n]

        # Process each latent in batch
        for i, latent_id in enumerate(range(batch_start, batch_end)):
            # Get tokens for top activations
            top_tokens = tokens_array[top_indices[:, i]]

            # Get k-mer sets for valid tokens
            kmer_sets = [token_to_kmers[token] for token in top_tokens
                        if pd.notna(token) and token in token_to_kmers]

            if kmer_sets:
                common_kmers = set.intersection(*kmer_sets)
                if common_kmers:
                    latent_dict[latent_id] = common_kmers
                    print(f"\nLatent {latent_id} appears to detect: {common_kmers}")

    return latent_dict

kmer_latent_dict_ = analyze_latent_features_fast(token_df, combined_latents, k=4, n_latents=4096, top_n=50) ## set n_latents to 100 to quickly test

In [None]:
#save latent dict as csv file
import pandas as pd

df = pd.DataFrame(list(latent_dict.items()), columns=['latent_id', 'annotation'])
df

# save
df.to_csv('/content/drive/MyDrive/SAEs_for_Genomics/Latent_dict_func_monosem_nt50m_sae_l10_+40mtokens.csv', index=False)

In [None]:
dict_values = latent_dict.values()

# turn list of sets into one large set
flat_set = set.union(*dict_values)
print(flat_set)

## Auto-Searching of MLP

1. Of the MLP neurons are there any that are somewhat monosemantic for a functional annotation?

In [None]:
for latent_id in range(512):
    # we avoid modifying token_df directly as its very time-consuming to reload if we mess it up
    token_df_copy = token_df.copy()

    # get the activation value for the N-th unit in the SAE for each input in batch
    hidden_act_feature_id = mlp_act[:, latent_id] # N = feature_id

    # add this to the dataframe
    token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()

    # print latent_id if the five most activating tokens share a token annotation
    most_activating_tokens = token_df_copy.sort_values(f"latent-{latent_id}-act", ascending=False).head(5)

    # Get annotations as lists
    annotations = most_activating_tokens['token_annotations'].tolist()

    # Check if there's any intersection between all annotation lists
    if annotations:
        # Convert all annotations to sets for intersection
        annotation_sets = [set(ann) for ann in annotations]
        common_annotations = set.intersection(*annotation_sets)
        filtered_annotations = common_annotations - {'special token: <cls>', 'special token: <pad>'}

        if filtered_annotations:  # If there are any shared annotations
            print(f"\nLatent {latent_id} appears to detect: {common_annotations}")
            print("Top 5 activating tokens and their annotations:")
            for _, row in most_activating_tokens.iterrows():
                print(f"Token: {row['tokens']}, Annotations: {row['token_annotations']}, "
                      f"Activation: {row[f'latent-{latent_id}-act']:.3f}")


2. Of the MLP neurons are there any that are somewhat monosemantic for some *syntactic* pattern?

In [None]:
k = 4 ## kmer length
latent_dict = {}

for latent_id in range(512):
    # we avoid modifying token_df directly as its very time-consuming to reload if we mess it up
    token_df_copy = token_df.copy()

    # get the activation value for the N-th unit in the SAE for each input in batch
    hidden_act_feature_id = combined_acts[:, latent_id] # N = feature_id

    # add this to the dataframe
    token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()

    # print latent_id if the five most activating tokens share a token annotation
    most_activating_tokens = token_df_copy.sort_values(f"latent-{latent_id}-act", ascending=False).head(10)


    # Get a set of all kmers for each most activating token
    def get_kmers(token, k):
        if not isinstance(token, str) or k <= 0:
            raise ValueError("Invalid input: token must be string and k must be positive")
        if len(token) < k:
            return set()
        return {token[i:i+k] for i in range(len(token) - k + 1)}

    kmer_sets = [get_kmers(token, k) for token in most_activating_tokens['tokens']]

    # Check if there's any intersection between all kmer sets stored
    if kmer_sets:
        common_kmers = set.intersection(*kmer_sets)
        if common_kmers:  # If there are any shared kmers
            latent_dict[latent_id] = common_kmers
            print(f"\nLatent {latent_id} appears to detect: {common_kmers}")
            print("Top 5 activating tokens and their annotations:")
            for _, row in most_activating_tokens.iterrows():
                print(f"Token: {row['tokens']}, Annotations: {row['token_annotations']}, "
                      f"Activation: {row[f'latent-{latent_id}-act']:.3f}")



In [None]:
latent_id = 188  #np.random.randint(0, 4096) # or set particular int value in this range


# we avoid modifying token_df directly as its very time-consuming to reload if we mess it up
token_df_copy = token_df.copy()

# get the activation value for the N-th unit in the SAE for each input in batch
hidden_act_feature_id = combined_acts[:, latent_id] # N = feature_id

# add this to the dataframe
token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()

# sort to show the most activating tokens on top, add colours
token_df_copy.sort_values(f"latent-{latent_id}-act", ascending=False).head(300).style.background_gradient("coolwarm")


## Calc sensitivity and specificity of SAE latent for Functional or Syntactic Feature

In [None]:
def contains_kmers(tokens: str, kmers: list) -> bool:
    """Check if a token sequence contains any of the kmers in the list"""
    if not isinstance(tokens, str):
        return False
    return any(k in tokens for k in kmers)

def contains_annotations(token_annotation: str, annotations: list) -> bool:
    """Check if a token sequence contains any of the given annotations"""
    if not isinstance(token_annotation, str):
        return False
    return any(annotation in token_annotation for annotation in annotations)

def calculate_stats(df, act_threshold, meaning, check: str):
    """Calculate various statistics about TAG tokens and activations"""

    # Create a function that's partially applied with the specific annotation
    if check == 'kmer':
      check_kmer = lambda x: contains_kmers(x, meaning)
      check_fn = check_kmer
      col = 'tokens'

    elif check == 'annotation':
      check_annotation = lambda x: contains_annotations(x, meaning)
      check_fn = check_annotation
      col = 'token_annotations'

    else: raise ValueError("check must be 'kmer' or 'annotation'")

    # Q1: Fraction of above-threshold activations containing TAG
    above_threshold = df[df[latent_column] > act_threshold]
    above_threshold_tag_fraction = above_threshold[col].apply(check_fn).mean()

    # Q2: Fraction of below-or-equal threshold activations containing TAG
    below_threshold = df[df[latent_column] <= act_threshold]
    below_threshold_tag_fraction = below_threshold[col].apply(check_fn).mean()

    # Q3: Overall fraction of rows containing TAG
    overall_tag_fraction = df[col].apply(check_fn).mean()

    # Q4: Fraction of tokens with positive activation
    positive_activation_fraction = (df[latent_column] > act_threshold).mean()

    # Q5: For rows containing TAG, fraction with positive activation
    tag_rows = df[df[col].apply(check_fn)]
    tag_positive_fraction = (tag_rows[latent_column] > act_threshold).mean()

    return {
        'above_threshold_tag': above_threshold_tag_fraction,
        'below_threshold_tag': below_threshold_tag_fraction,
        'overall_tag': overall_tag_fraction,
        'positive_activation': positive_activation_fraction,
        'tag_positive': tag_positive_fraction
    }

def find_largest_consecutive_tag_sequence(df):
    """Find largest N where top-N rows all contain TAG"""
    sorted_df = df.sort_values(latent_column, ascending=False)
    check_cmv = lambda x: contains_annotations(x, ['CMV enhancer', 'CMV promoter', 'CMV IE94 promoter'])

    for N in range(1, len(df) + 1):
        top_n = sorted_df.head(N)
        if not all(top_n['token_annotations'].apply(check_cmv)):
            return N - 1
    return len(df)

# create empty pd df with column for latent_id, annotation and evidence_for_act_from_tag
columns = ['latent_id', 'annotation', 'evidence_for_act_from_ann', 'evidence_for_ann_from_act', 'precision', 'recall']
df = pd.DataFrame(columns=columns)

using_kmer = False
using_annotation = not using_kmer

# Iterate over latent dict
for latent_id, meaning in latent_dict.items():

    # Calculate all statistics
    act_threshold = 0.0  # Activation threshold
    latent_column = f"latent-{latent_id}-act"


    # get most activating tokens for latent id
    token_df_copy = token_df.copy()
    hidden_act_feature_id = combined_latents_new[:, latent_id] # N = feature_id
    token_df_copy[f"latent-{latent_id}-act"] = hidden_act_feature_id.cpu().detach().numpy()
    token_df_copy.sort_values(f"latent-{latent_id}-act", ascending=False).head(300).style.background_gradient("coolwarm")

    ### input all to calc stats
    if using_annotation:
      annotation = list(meaning)
      stats = calculate_stats(token_df_copy, act_threshold, meaning = annotation, check = 'annotation')

    elif using_kmer:
      kmer_strings = [''.join(kmer) for kmer in meaning]
      stats = calculate_stats(token_df_copy, act_threshold, meaning = kmer_strings, check = 'kmer')



    ## Calculate the posterior to prior odds ratios
    evidence_for_act_from_tag = (stats['tag_positive']/(1-stats['tag_positive'])) / (stats['positive_activation']/(1-stats['positive_activation']))
    evidence_for_tag_from_act = (stats['above_threshold_tag']/(1-stats['above_threshold_tag'])) / (stats['overall_tag']/(1-stats['overall_tag']))


    # only print at least moderately-monosemantic latents
    if min(evidence_for_act_from_tag, evidence_for_tag_from_act) > 20 or max(evidence_for_act_from_tag, evidence_for_tag_from_act)>200:

        print(f"\nLatent {latent_id} appears to detect: {meaning}")

        print(f"Strength of evidence for act > {act_threshold} from {meaning} (as BayesF): {evidence_for_act_from_tag:.3f}")
        print(f"Strength of evidence for {meaning} from act > {act_threshold} (as BayesF): {evidence_for_tag_from_act:.3f}")



            # Print results in a formatted way
        print(f"\n {meaning} Token Analysis Results")
        print("=" * 50)
        print(f"Analysis for activation threshold: {act_threshold}")
        print("-" * 50)
        print(f" P(token annotated with {meaning}):                      {stats['overall_tag']:.3f}")
        print(f" P(token annotated with {meaning}|activation > {act_threshold}):     {stats['above_threshold_tag']:.3f}")
        print(f" P(activation > {act_threshold}):                        {stats['positive_activation']:.3f}")
        print(f" P(activation > {act_threshold}|token annotated with {meaning}):     {stats['tag_positive']:.3f}")


        # add to df
        df.loc[len(df)] = [latent_id, meaning, evidence_for_act_from_tag, evidence_for_tag_from_act, stats['above_threshold_tag'], stats['tag_positive']]

        # Find and print largest consecutive sequence0
        #largest_n = find_largest_consecutive_tag_sequence(token_df_copy)
        #print("-" * 50)
        #print(f"Largest N where top-N rows all contain {annotation}: {largest_n}")

# save df
df.to_csv('/content/drive/MyDrive/SAEs_for_Genomics/Latent_dict_4MER_monosem_nt50m_sae_+40mtokens.csv', index=False)

  evidence_for_tag_from_act = (stats['above_threshold_tag']/(1-stats['above_threshold_tag'])) / (stats['overall_tag']/(1-stats['overall_tag']))
  evidence_for_tag_from_act = (stats['above_threshold_tag']/(1-stats['above_threshold_tag'])) / (stats['overall_tag']/(1-stats['overall_tag']))
  evidence_for_tag_from_act = (stats['above_threshold_tag']/(1-stats['above_threshold_tag'])) / (stats['overall_tag']/(1-stats['overall_tag']))
  evidence_for_tag_from_act = (stats['above_threshold_tag']/(1-stats['above_threshold_tag'])) / (stats['overall_tag']/(1-stats['overall_tag']))
