# Transformer






In [87]:

# Remember to login to wandb!
import sys
import os 
import torch
import matplotlib.pyplot as plt
import copy 
import numpy as np
import gc
import itertools
# Location of the eigenestimation package relative to this notebook.
# NOTE(review): this path is only stored in `parent_dir`; nothing in the visible
# cells adds it to sys.path or otherwise reads it, so the imports below appear to
# rely on the package already being importable — confirm, then either append it
# to sys.path or drop the variable.
parent_dir = os.path.expanduser('../eigenestimation/eigenestimation')

from eigenestimation.evaluation.networks import DrawNeuralNetwork
from eigenestimation.eigenmodel.eigenmodel import EigenModel
from eigenestimation.utils.loss import MSELoss
from eigenestimation.utils.uniform_models import ZeroOutput
from eigenestimation.toy_models.data import GenerateTMSInputs
from eigenestimation.toy_models.parallel_serial_network import CustomMLP
from torch import Tensor
import einops

import figure_names
from datasets import load_dataset
from transformer_lens.utils import tokenize_and_concatenate
from torch.utils.data import DataLoader


## Set up

In [88]:
# Path of the saved eigenmodel checkpoint, relative to this notebook.
eigenmodel_path = "../outputs/eigenmodels/transformer.pt"

# Deserialize the checkpoint once and read both entries from the same dict;
# loading the file a second time just to fetch 'frac_activated' would
# materialize the whole pickled model again.
# NOTE(review): torch.load unpickles arbitrary Python objects — only point this
# at checkpoint files you trust.
checkpoint = torch.load(eigenmodel_path)
eigenmodel = checkpoint['model']
tokenizer = eigenmodel.model.tokenizer
# Indexed per feature further down to skip features below an activation threshold.
frac_activated = checkpoint['frac_activated']

In [89]:
# TinyStories validation split (first 1%), tokenized with the eigenmodel's tokenizer.
token_length = 8  # passed as max_length below; also used when decoding flat indices later
dataset = load_dataset('roneneldan/TinyStories', split="validation[:1%]")
X_transformer = tokenize_and_concatenate(
    dataset,
    tokenizer,
    max_length=token_length,
    add_bos_token=False,
)['tokens']


In [90]:
# Per-token feature magnitudes for the first 1000 token windows.
# After this cell:
#   circuit_vals : tensor (n_samples, tokens_per_sample, eigenmodel.n_features),
#                  the mean over `iters` passes of |eigenmodel(grads)|
#   X_ordered    : the token windows in the (shuffled) order they were processed,
#                  so row k of circuit_vals corresponds to row k of X_ordered
circuit_vals = []
X_ordered = []
iters = 5  # number of gradient passes averaged per batch

# Release cached GPU memory left over from earlier cells before the loop.
gc.collect()
torch.cuda.empty_cache()
for X_batch in DataLoader(X_transformer[:1000], batch_size=8, shuffle=True):
    X_ordered.append(X_batch)
    # Move the batch to the GPU once instead of on every inner iteration.
    X_batch_gpu = X_batch.to('cuda')
    n_samples, n_tokens = X_batch.shape[0], X_batch.shape[1]

    # Compute gradients many times and take the average
    # (presumably compute_gradients is stochastic — TODO confirm).
    each_circuit_val = torch.zeros(n_samples * n_tokens, eigenmodel.n_features, device='cuda')
    for _ in range(iters):
        grads = eigenmodel.compute_gradients(X_batch_gpu)
        # detach(): only the values are needed downstream; without it, any autograd
        # graph attached to the output would be kept alive for every batch stored
        # in circuit_vals.
        each_circuit_val = each_circuit_val + abs(eigenmodel(grads)).detach()
    circuit_vals.append(each_circuit_val.view(n_samples, n_tokens, eigenmodel.n_features))
circuit_vals = torch.concat(circuit_vals, dim=0)/iters
X_ordered = torch.concat(X_ordered, dim=0)

In [91]:
import torch

# For every sufficiently active feature, print the 5 token positions with the
# largest averaged magnitude, with the selected token wrapped in '***' markers.
bold_idx = tokenizer.encode('***')
# Build the marker as an integer tensor matching the token ids, so concatenation
# does not round-trip the ids through float32.
bold_tokens = torch.tensor(bold_idx, dtype=X_ordered.dtype)

for i in range(eigenmodel.n_features):
    # Skip features whose recorded activation fraction is below 5%.
    if frac_activated[i] < 0.05:
        continue

    # Magnitudes for feature i (abs was already applied when circuit_vals was accumulated)
    abs_vals = circuit_vals[..., i]

    # Find the top 5 (b, t) indices for feature i
    top_indices = abs_vals.flatten().argsort(descending=True)[:5]

    # Convert the flat indices back to (b, t) indices. The row length is read from
    # the tensor itself rather than from the global token_length.
    row_len = abs_vals.shape[-1]
    top_b = torch.div(top_indices, row_len, rounding_mode='floor')
    top_t = top_indices % row_len

    # Get the corresponding top values
    top_values = abs_vals[top_b, top_t]

    print(f'\n\n---- Feature {i} ---- Activation: {frac_activated[i]:.3f}')
    for j in range(len(top_indices)):
        sample_idx = top_b[j].item()
        token_idx = top_t[j].item()
        tokens = X_ordered[sample_idx]
        # Wrap exactly the selected token in markers. Splicing both markers in one
        # concatenation keeps the closing marker right after the token even if
        # '***' encodes to more than one token id.
        tokens = torch.cat([
            tokens[:token_idx],
            bold_tokens,
            tokens[token_idx:token_idx + 1],
            bold_tokens,
            tokens[token_idx + 1:],
        ])
        sentence = tokenizer.decode(tokens.long())
        # Keep each example on a single output line.
        sentence = sentence.replace('\n', '=newline=')
        print(sentence, '-->', f'{top_values[j].item():.2f}')




---- Feature 0 ---- Activation: 0.181
 named Tim went to play*** with*** his friend --> 47.54
. He loved to play*** with*** his red --> 44.67
 played*** with*** the orange ball. But then --> 42.98
 and Lily played*** with*** the skull every day --> 42.53
 score. Sam was happy and*** not*** angry --> 39.25


---- Feature 2 ---- Activation: 0.356
. Mia was*** so*** tired that she closed --> 72.02
 anymore.=newline==newline="***Good*** job, --> 69.46
Yes, I want*** to*** play, but --> 62.24
 was happy*** to*** help.=newline==newline=As --> 60.90
 shiny rock! Timmy*** was*** so happy --> 60.74


---- Feature 4 ---- Activation: 0.412
 a time***,*** there was a big, --> 71.37
Spot. Spot saw the shiny car*** and*** --> 69.25
 asked his friends to*** help*** him.=newline= --> 64.56
.=newline==newline=***One*** day, they found --> 63.89
Once upon a time, there*** was*** a --> 63.73


---- Feature 5 ---- Activation: 0.173
 there was a little girl named Lily***.*** --> 45.24
, they were happy t

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Load the TinyStories-1M checkpoint and list every parameter tensor with its
# name, shape and element count.
model = AutoModelForCausalLM.from_pretrained('roneneldan/TinyStories-1M')
for n, p in model.named_parameters():
    print(n, p.shape, p.numel())

  return self.fget.__get__(instance, owner)()


transformer.wte.weight torch.Size([50257, 64]) 3216448
transformer.wpe.weight torch.Size([2048, 64]) 131072
transformer.h.0.ln_1.weight torch.Size([64]) 64
transformer.h.0.ln_1.bias torch.Size([64]) 64
transformer.h.0.attn.attention.k_proj.weight torch.Size([64, 64]) 4096
transformer.h.0.attn.attention.v_proj.weight torch.Size([64, 64]) 4096
transformer.h.0.attn.attention.q_proj.weight torch.Size([64, 64]) 4096
transformer.h.0.attn.attention.out_proj.weight torch.Size([64, 64]) 4096
transformer.h.0.attn.attention.out_proj.bias torch.Size([64]) 64
transformer.h.0.ln_2.weight torch.Size([64]) 64
transformer.h.0.ln_2.bias torch.Size([64]) 64
transformer.h.0.mlp.c_fc.weight torch.Size([256, 64]) 16384
transformer.h.0.mlp.c_fc.bias torch.Size([256]) 256
transformer.h.0.mlp.c_proj.weight torch.Size([64, 256]) 16384
transformer.h.0.mlp.c_proj.bias torch.Size([64]) 64
transformer.h.1.ln_1.weight torch.Size([64]) 64
transformer.h.1.ln_1.bias torch.Size([64]) 64
transformer.h.1.attn.attention.k_