# Transformer






In [69]:
import sys
import os 
import gc

# Make the parent directory importable (presumably where the project packages
# imported further down in this cell live -- confirm against the repo layout).
# The path is resolved to an absolute one so that a later change of working
# directory cannot alter import resolution, and the append is guarded so that
# re-running this cell does not keep growing sys.path with duplicates.
parent_dir = os.path.abspath(os.pardir)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)



# Standard library imports
import importlib
import gc
import copy

# Third-party imports
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import einops
import matplotlib.pyplot as plt
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer
from datasets import load_dataset
from transformer_lens.utils import tokenize_and_concatenate



# Re-import the project modules so that the names imported from them below
# come from freshly reloaded modules. Order is kept as listed.
_PROJECT_MODULES = (
    'eigenestimation.eigenhora',
    'eigenestimation.loss',
    'eigenestimation.train',
    'evaluation.activating_examples',
    'toy_models.transformer_wrapper',
    'eigenestimation.utils',
    'evaluation.transformers',
)
for _module_name in _PROJECT_MODULES:
    importlib.reload(importlib.import_module(_module_name))



from eigenestimation.eigenhora import EigenHora
from eigenestimation import loss
from eigenestimation.train import Train
from evaluation.activating_examples import DrawNeuralNetwork
from evaluation.transformers import PrintTopActivatingTexts, PrintTopActivatingTextsAllFeatures
from toy_models import transformer_wrapper
from eigenestimation.utils import TransformDataLoader, DeleteParams

# Device used for the eigenmodel and training below. Prefer the GPU, but fall
# back to the CPU so the notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Set up

In [2]:
# @title Load pretrained TinyStories models
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2LMHeadModel
import transformer_lens

# Scaled-dot-product attention backends.
# Keep the plain math kernel switched on (it is True by default) ...
torch.backends.cuda.enable_math_sdp(True)
# ... and switch off the fused kernels (FlashAttention and memory-efficient
# attention): they have to be disabled to compute second-order gradients on
# transformer models.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

def _load_wrapped_model(model_name):
    """Load one pretrained model and wrap it together with its tokenizer.

    Args:
        model_name: Hugging Face model id passed to
            ``HookedTransformer.from_pretrained``.

    Returns:
        A ``(hooked_model, tokenizer, wrapper)`` tuple, where ``tokenizer`` is a
        fresh "EleutherAI/gpt-neo-125M" tokenizer whose pad token is set to its
        EOS token, and ``wrapper`` is the ``TransformerWrapper`` built from both.
    """
    hooked_model = transformer_lens.HookedTransformer.from_pretrained(model_name)
    tok = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
    tok.pad_token = tok.eos_token
    wrapper = transformer_wrapper.TransformerWrapper(hooked_model, tok)
    return hooked_model, tok, wrapper


# TinyStories-1M, wrapped as `transformer_model0`.
tinystories_1m, tokenizer, transformer_model0 = _load_wrapped_model("roneneldan/TinyStories-1M")

# TinyStories-33M, wrapped as `transformer_model`; `tokenizer` ends up bound to
# the tokenizer created for this second model.
tinystories_33m, tokenizer, transformer_model = _load_wrapped_model("roneneldan/TinyStories-33M")


# Report every parameter tensor of the wrapped 33M model with its element count.
print([(name, param.numel()) for name, param in transformer_model.named_parameters()])

# Make the eigenestimation a little smaller by only looking at a subset of the
# parameters: tensors whose name contains KEPT_PARAM_PATTERN are kept, and every
# other parameter name is handed to DeleteParams (per the original notes, the
# rest are turned into frozen buffers -- see eigenestimation.utils to confirm).
KEPT_PARAM_PATTERN = 'transformer.blocks.3.attn.W_K'
params_to_delete = [
    name
    for name, _ in transformer_model.named_parameters()
    if KEPT_PARAM_PATTERN not in name
]

DeleteParams(transformer_model, params_to_delete)

# Summarise what is left: the total element count, then one line per tensor.
remaining_numel = sum(p.numel() for p in transformer_model.parameters())
print(remaining_numel)
for n, p in transformer_model.named_parameters():
    print(n, p.shape, p.numel())

# Data: the first 1% of the TinyStories validation split, tokenized with the
# wrapped model's tokenizer (max_length=8, no BOS token added).
dataset = load_dataset('roneneldan/TinyStories', split="validation[:1%]")
tokenized_dataset = tokenize_and_concatenate(
    dataset,
    transformer_model.tokenizer,
    max_length=8,
    add_bos_token=False,
)
X_transformer = tokenized_dataset['tokens']
print(X_transformer.shape)

Loaded pretrained model roneneldan/TinyStories-1M into HookedTransformer


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loaded pretrained model roneneldan/TinyStories-33M into HookedTransformer
[('transformer.embed.W_E', 38597376), ('transformer.pos_embed.W_pos', 1572864), ('transformer.blocks.0.attn.W_Q', 589824), ('transformer.blocks.0.attn.W_O', 589824), ('transformer.blocks.0.attn.b_Q', 768), ('transformer.blocks.0.attn.b_O', 768), ('transformer.blocks.0.attn.W_K', 589824), ('transformer.blocks.0.attn.W_V', 589824), ('transformer.blocks.0.attn.b_K', 768), ('transformer.blocks.0.attn.b_V', 768), ('transformer.blocks.0.mlp.W_in', 2359296), ('transformer.blocks.0.mlp.b_in', 3072), ('transformer.blocks.0.mlp.W_out', 2359296), ('transformer.blocks.0.mlp.b_out', 768), ('transformer.blocks.1.attn.W_Q', 589824), ('transformer.blocks.1.attn.W_O', 589824), ('transformer.blocks.1.attn.b_Q', 768), ('transformer.blocks.1.attn.b_O', 768), ('transformer.blocks.1.attn.W_K', 589824), ('transformer.blocks.1.attn.W_V', 589824), ('transformer.blocks.1.attn.b_K', 768), ('transformer.blocks.1.attn.b_V', 768), ('transform

## Eigenestimation

In [72]:
# Free memory before building a new eigenmodel: first collect unreachable
# Python objects, then ask PyTorch to release its cached, unused CUDA memory.
# The order matters -- tensors must be garbage-collected before their cached
# blocks can be released.
gc.collect()
torch.cuda.empty_cache()

def transformer_model0(y):
    """Return a uniform distribution over the last dimension of ``y``.

    The values of ``y`` are ignored; only its shape/dtype are used. Every entry
    of the result equals ``1 / y.shape[-1]``.

    Note: this definition rebinds the name ``transformer_model0``, which an
    earlier cell assigned to the wrapped TinyStories-1M model.
    """
    constant_logits = torch.ones_like(y)
    return torch.softmax(constant_logits, dim=-1)
# Feature count and rank handed to EigenHora. The original assigned
# `shora_features` (typo) while the constructor call reads `hora_features`,
# which raised NameError on a fresh kernel.
hora_features = 100
hora_rank = 1

# Build the eigenmodel from the wrapped model, the `transformer_model0`
# reference function and a KL-divergence loss, then move it to `device`.
eigenmodel = EigenHora(transformer_model, transformer_model0, loss.KLDivergenceLoss(), hora_features, hora_rank, device=device).to(device)

# Train on every 10th row of X_transformer, keeping only the first 4 token
# positions; each batch is passed through eigenmodel.compute_jacobian.
dataloader = TransformDataLoader(X_transformer[::10,:4], batch_size=8, transform_fn=eigenmodel.compute_jacobian)
Train(eigenmodel, dataloader, lr=.01, n_epochs=100, L0_penalty=.001, device=device)

Epoch 0 : 36.538,  Reconstruction Loss: 36.538,  Sparsity Loss: 0.000
Epoch 1 : 36.957,  Reconstruction Loss: 36.957,  Sparsity Loss: 0.000
Epoch 2 : 37.029,  Reconstruction Loss: 37.029,  Sparsity Loss: 0.000
Epoch 3 : 36.684,  Reconstruction Loss: 36.684,  Sparsity Loss: 0.000
Epoch 4 : 36.863,  Reconstruction Loss: 36.863,  Sparsity Loss: 0.000
Epoch 5 : 37.081,  Reconstruction Loss: 37.081,  Sparsity Loss: 0.000
Epoch 6 : 36.595,  Reconstruction Loss: 36.595,  Sparsity Loss: 0.000
Epoch 7 : 36.586,  Reconstruction Loss: 36.586,  Sparsity Loss: 0.000
Epoch 8 : 36.655,  Reconstruction Loss: 36.655,  Sparsity Loss: 0.000
Epoch 9 : 36.779,  Reconstruction Loss: 36.779,  Sparsity Loss: 0.000
Epoch 10 : 36.750,  Reconstruction Loss: 36.750,  Sparsity Loss: 0.000
Epoch 11 : 36.463,  Reconstruction Loss: 36.463,  Sparsity Loss: 0.080
Epoch 12 : 36.269,  Reconstruction Loss: 36.269,  Sparsity Loss: 0.308
Epoch 13 : 36.171,  Reconstruction Loss: 36.170,  Sparsity Loss: 0.333
Epoch 14 : 36.14

KeyboardInterrupt: 

In [73]:
# For every feature of the trained eigenmodel, report the top_n=5 entries from
# the dataloader and keep the result in `top_texts`.
# NOTE(review): the printed format (tokens wrapped in *...* followed by
# "-> value") is produced inside evaluation.transformers -- confirm there what
# the highlighted token and the value represent.
top_texts = PrintTopActivatingTextsAllFeatures(eigenmodel, dataloader, top_n=5)

------feature 0-------
Tom tried* t*o -> 7.712070271281846e-08
 and tried*,* but -> 5.0858769640171886e-08
Tom* tried* to -> 3.895867450864898e-08
 lots of* be*akers -> 2.8452637579334805e-08
They* tried* to roll -> 2.6440616807121842e-08
------feature 1-------
 They knew that* playing* -> 1.3160922872346159e-11
 soap* on* his face -> 1.0703455483040969e-11
Tom tried* t*o -> 1.0185556391373307e-11
 soap on* his* face -> 9.219998028942022e-12
 soap on his* face* -> 7.127769728609845e-12
------feature 2-------
 has* long* ears and -> 3.179251653606663e-11
 still loved her*,* -> 2.8877197508214714e-11
 big tank that* could* -> 2.8161643156332516e-11
 has long ears* and* -> 2.126722409290238e-11
 I love* you*, -> 1.6956847384563822e-11
------feature 3-------
 "I promise* to* -> 1.4351831014574907e-22
 he liked to* draw* -> 1.2611533175720248e-22
 mov*in*g again -> 1.1582816903456253e-22
 takes it out* of* -> 1.038383983538532e-22
 put the* picture* in -> 1.0322939142413534e-22
------featur