In [1]:
import torch
import ipywidgets as widgets
import transformer_lens
import activation_analysis
import model_loader
import forward_pass
from forward_pass import list_activation_tensors, list_weight_matrices
from activation_analysis import activation_agg_sim, compare_activation_similarity, display_activation_similarity_tables, display_activation_similarity_plots
from model_loader import load_model, print_available_models, gpu_mem_check,list_available_models
import math
import re
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SELECT MODEL #
# Query the list of models once and reuse it for both the option list and the
# default value, so the two are guaranteed to come from the same result.
available_models = list_available_models()
model_dropdown = widgets.Dropdown(
    options=available_models,
    value=available_models[0],  # Default to the first listed model
    description='Select Model:',
    disabled=False,
)
display(model_dropdown)

Dropdown(description='Select Model:', options=('gpt2-small', 'gpt2-medium', 'Qwen 2-0.5B', 'Qwen 2-0.5B-Instru…

In [2]:
# Load Model #
# Take whichever model is currently selected in the dropdown widget and load it.
# NOTE(review): everything below therefore depends on interactive widget state;
# re-running with a different selection changes every downstream result.
model_name = model_dropdown.value
model = load_model(model_name)
# Presumably reports GPU memory usage after loading (see the printed totals) — confirm in model_loader.
gpu_mem_check()

Loading model gpt2-medium from gpt2-medium...




Loaded pretrained model gpt2-medium into HookedTransformer
Model loaded successfully on cuda.
Total GPU Memory: 11.99 GB
Allocated GPU Memory: 1.54 GB
Cached GPU Memory: 1.55 GB


In [9]:
prompt = "Hello world"
list_activation_tensors(model, prompt)

# Example tensor call: the residual stream after the final transformer block.
# The block index is derived from the model config instead of being hard-coded
# to 23, so the lookup also works for models that do not have exactly 24 layers.
logits, cache = model.run_with_cache(prompt)
last_layer = model.cfg.n_layers - 1
print("\nExample tensor call:")
cache[f'blocks.{last_layer}.hook_resid_post'][0]

Available activation tensors:
hook_embed                       torch.Size([3, 1024])
hook_pos_embed                   torch.Size([3, 1024])
blocks.n.hook_resid_pre          torch.Size([3, 1024])
blocks.n.ln1.hook_scale          torch.Size([3, 1])
blocks.n.ln1.hook_normalized     torch.Size([3, 1024])
blocks.n.attn.hook_q             torch.Size([3, 16, 64])
blocks.n.attn.hook_k             torch.Size([3, 16, 64])
blocks.n.attn.hook_v             torch.Size([3, 16, 64])
blocks.n.attn.hook_attn_scores   torch.Size([16, 3, 3])
blocks.n.attn.hook_pattern       torch.Size([16, 3, 3])
blocks.n.attn.hook_z             torch.Size([3, 16, 64])
blocks.n.hook_attn_out           torch.Size([3, 1024])
blocks.n.hook_resid_mid          torch.Size([3, 1024])
blocks.n.ln2.hook_scale          torch.Size([3, 1])
blocks.n.ln2.hook_normalized     torch.Size([3, 1024])
blocks.n.mlp.hook_pre            torch.Size([3, 4096])
blocks.n.mlp.hook_post           torch.Size([3, 4096])
blocks.n.hook_mlp_out          

tensor([  4.0842,   4.6890,   0.9476,  ...,  -1.1796, -10.8282,   3.6351],
       device='cuda:0')

In [7]:
list_weight_matrices(model)

# Example tensor call: the MLP output projection of the final transformer block.
# Negative indexing selects the last block whatever the model depth is, instead
# of assuming a 24-layer model via a hard-coded index of 23.
print("\nExample tensor call:")
model.blocks[-1].mlp.W_out

Model state_dict keys:
embed.W_E               torch.Size([50257, 1024])
pos_embed.W_pos         torch.Size([1024, 1024])
blocks[n].ln1.w         torch.Size([1024])
blocks[n].ln1.b         torch.Size([1024])
blocks[n].ln2.w         torch.Size([1024])
blocks[n].ln2.b         torch.Size([1024])
blocks[n].attn.W_Q      torch.Size([16, 1024, 64])
blocks[n].attn.W_O      torch.Size([16, 64, 1024])
blocks[n].attn.b_Q      torch.Size([16, 64])
blocks[n].attn.b_O      torch.Size([1024])
blocks[n].attn.W_K      torch.Size([16, 1024, 64])
blocks[n].attn.W_V      torch.Size([16, 1024, 64])
blocks[n].attn.b_K      torch.Size([16, 64])
blocks[n].attn.b_V      torch.Size([16, 64])
blocks[n].attn.mask     torch.Size([1024, 1024])
blocks[n].attn.IGNORE   torch.Size([])
blocks[n].mlp.W_in      torch.Size([1024, 4096])
blocks[n].mlp.b_in      torch.Size([4096])
blocks[n].mlp.W_out     torch.Size([4096, 1024])
blocks[n].mlp.b_out     torch.Size([1024])
ln_final.w              torch.Size([1024])
ln_final.

Parameter containing:
tensor([[ 0.3126, -0.1634,  0.1243,  ...,  0.0084,  0.0499,  0.2266],
        [ 0.2040, -0.0206,  0.0207,  ...,  0.1752,  0.2064, -0.0457],
        [-0.0872,  0.0126,  0.2485,  ...,  0.1453, -0.1659, -0.1803],
        ...,
        [ 0.0513, -0.0420, -0.2821,  ...,  0.3503,  0.2665, -0.0642],
        [ 0.1500,  0.0376,  0.0267,  ..., -0.0566, -0.1505,  0.1074],
        [-0.0872, -0.0176,  0.0117,  ...,  0.4246,  0.0556,  0.0542]],
       device='cuda:0', requires_grad=True)

In [78]:
# Example of testing cosine similarity between two activation vectors:
# the residual stream after block 5 versus after the final block, both taken
# at the last token position of the cached prompt.
last_layer = model.cfg.n_layers - 1   # derived from the config; 23 is only valid for 24-layer models
act1 = cache['blocks.5.hook_resid_post'][0]
act2 = cache[f'blocks.{last_layer}.hook_resid_post'][0]
final_row_act1 = act1[-1, :]
final_row_act2 = act2[-1, :]

# Calculate cosine similarity between final_row_act1 and final_row_act2.
# cosine_similarity reduces over dim=1 by default, so each 1-D vector is given
# a leading dimension of size 1; the result is a one-element tensor.
F.cosine_similarity(final_row_act1.unsqueeze(0), final_row_act2.unsqueeze(0))

tensor([0.2016], device='cuda:0')

#### 1.) We begin by specifying our prompt and retrieving the relevant tokens, subwords, and embeddings


In [10]:
prompt = "In the beginning, God created the"

# Run the prompt through the hooked transformer model #
logits, cache = model.run_with_cache(prompt)          # This stores the activations in a dictionary called cache
#######################################################

tokens = model.to_tokens(prompt)            # Extract token ids from input (leading batch dimension of size 1)
subwords = model.to_str_tokens(tokens)      # Convert token ids to subwords

print("TokenID\tSubword")
# Index the batch dimension explicitly. tokens.squeeze() would also drop the
# sequence dimension for a one-token input, making .tolist() return a bare int
# that zip() cannot iterate over.
for token, subword in zip(tokens[0].tolist(), subwords):
    print(f"{token}\t{subword}")

TokenID	Subword
50256	<|endoftext|>
818	In
262	 the
3726	 beginning
11	,
1793	 God
2727	 created
262	 the


In [11]:
# We pull the token embedding vectors from the embedding matrix by token id,
# then confirm that they match the cached hook_embed tensor.

# tokens[0] selects the single batch row and always keeps the sequence dimension;
# tokens.squeeze() would collapse a one-token prompt to a scalar index and
# return a single vector instead of a (1, d_model) matrix.
input_vectors = model.W_E[tokens[0]]
print(f"Shape of prompt word vectors: {input_vectors.shape}")

# Confirm that input_vectors is identical to cache['hook_embed']
print("\nIs input_vectors the same as hook_embed?", torch.allclose(input_vectors, cache['hook_embed'][0], atol=1e-6))

Shape of prompt word vectors: torch.Size([8, 1024])

Is input_vectors the same as hook_embed? True


#### 2.) Extract the first *n* position embeddings and **add** them to the respective token embeddings, producing *resid_pre*


In [12]:
# The positional embedding matrix is model.W_pos; its first n rows are added to
# the n token embeddings of the prompt.
# NOTE(review): model.W_E_pos appears to be W_E concatenated with W_pos, and
# cache['hook_pos_embed'] appears to equal the W_pos rows used here — purpose of
# both still to be confirmed against the TransformerLens documentation.

n_positions = len(tokens[0])                                     # prompt length in tokens
position_embeddings = model.W_pos[:n_positions]                  # one position vector per prompt token
input_vectors_with_pos = input_vectors + position_embeddings     # elementwise sum of token and position embeddings

# Confirm that input_vectors_with_pos is identical to cache['blocks.0.hook_resid_pre']
matches_resid_pre = torch.allclose(
    input_vectors_with_pos,
    cache['blocks.0.hook_resid_pre'][0],
    atol=1e-5,
)
print("\nIs input_vectors_with_pos the same as blocks.0.hook_resid_pre?", matches_resid_pre)


Is input_vectors_with_pos the same as blocks.0.hook_resid_pre? True


#### 3.) Apply the first layer normalization (LayerNorm 1) to *resid_pre* before attention

In [29]:
# 1. Normalize
# LayerNorm divides the centred activations by sqrt(biased variance + eps).
# Tensor.std() defaults to the unbiased (N-1) estimator and adds no eps, so it
# only approximates the model's scale; compute the scale explicitly instead.
mean = input_vectors_with_pos.mean(dim=-1, keepdim=True)
centered_resid_pre = input_vectors_with_pos - mean
std = torch.sqrt(centered_resid_pre.pow(2).mean(dim=-1, keepdim=True) + model.cfg.eps)

# Confirm that std is identical to cache['blocks.0.ln1.hook_scale']
# (only the scale is exposed as a hook; the activation listing shows no hook for the mean)
print("\nIs std the same as blocks.0.ln1.hook_scale?", torch.allclose(std, cache['blocks.0.ln1.hook_scale'][0], atol=1e-4))

# Divide by the manually computed scale (not the cached one) so that the check
# below validates the whole hand-written LayerNorm.
normalized_resid_pre = centered_resid_pre / std

# 2. Apply weight (gamma) and bias (beta) (Even after normalizing, we still need to apply the learned layernorm weights)
ln1_output = normalized_resid_pre * model.blocks[0].ln1.w + model.blocks[0].ln1.b

# Compare with the output of the first layer norm in the model
print("\nIs ln1_output the same as blocks.0.ln1.hook_normalized?", torch.allclose(ln1_output, cache['blocks.0.ln1.hook_normalized'][0], atol=1e-4))



Is std the same as blocks.0.ln1.hook_scale? True

Is ln1_output the same as blocks.0.ln1.hook_normalized? True


In [30]:
# Per-head query, key and value projections for block 0.
attn0 = model.blocks[0].attn
head_ids = range(model.cfg.n_heads)

# One entry per head: ln1_output @ W[h] + b[h]
Query_heads = [ln1_output @ attn0.W_Q[h] + attn0.b_Q[h] for h in head_ids]
Key_heads = [ln1_output @ attn0.W_K[h] + attn0.b_K[h] for h in head_ids]
Value_heads = [ln1_output @ attn0.W_V[h] + attn0.b_V[h] for h in head_ids]

# Stack the per-head results (head axis first), then swap the first two axes
# so the layout is (seq_len, num_heads, head_dim) like the cached hooks.
Q_stacked_heads = torch.stack(Query_heads)
K_stacked_heads = torch.stack(Key_heads)
V_stacked_heads = torch.stack(Value_heads)

Q_heads_aligned = Q_stacked_heads.transpose(0, 1)
K_heads_aligned = K_stacked_heads.transpose(0, 1)
V_heads_aligned = V_stacked_heads.transpose(0, 1)

q_matches = torch.allclose(Q_heads_aligned, cache['blocks.0.attn.hook_q'][0], atol=1e-4)
k_matches = torch.allclose(K_heads_aligned, cache['blocks.0.attn.hook_k'][0], atol=1e-4)
v_matches = torch.allclose(V_heads_aligned, cache['blocks.0.attn.hook_v'][0], atol=1e-4)

print("\nIs Q_heads_aligned the same as blocks.0.attn.hook_q?", q_matches)
print("\nIs K_heads_aligned the same as blocks.0.attn.hook_k?", k_matches)
print("\nIs V_heads_aligned the same as blocks.0.attn.hook_v?", v_matches)



Is Q_heads_aligned the same as blocks.0.attn.hook_q? True

Is K_heads_aligned the same as blocks.0.attn.hook_k? True

Is V_heads_aligned the same as blocks.0.attn.hook_v? True


In [31]:
# Raw attention scores: for every head, dot each query vector with every key vector.
attention_heads = [
    query @ key.transpose(0, 1)
    for query, key in zip(Query_heads, Key_heads)
]

# Stack heads and scale by sqrt(d_head)
attention_scores = torch.stack(attention_heads)
d_k = model.cfg.d_head
attention_scores_scaled = attention_scores / math.sqrt(d_k)

# Causal mask: True strictly above the diagonal, i.e. where a query position
# would attend to a later key position.
seq_len = attention_scores_scaled.shape[-1]
upper_ones = torch.ones(seq_len, seq_len, device=attention_scores_scaled.device)
causal_mask = torch.triu(upper_ones, diagonal=1).bool()

# Masked positions are set to -inf so that softmax assigns them zero weight
attention_scores_masked = attention_scores_scaled.masked_fill(causal_mask, float('-inf'))

scores_match = torch.allclose(
    attention_scores_masked,
    cache['blocks.0.attn.hook_attn_scores'][0],
    atol=1e-4,
)
print("\nShape of attention_scores_masked:", attention_scores_masked.shape)
print("\nIs attention_scores_masked the same as blocks.0.attn.hook_attn_scores?", scores_match)


Shape of attention_scores_masked: torch.Size([16, 8, 8])

Is attention_scores_masked the same as blocks.0.attn.hook_attn_scores? True


In [32]:
# Softmax over the key axis turns each query's masked scores into attention weights
attention_weights = attention_scores_masked.softmax(dim=-1)

pattern_matches = torch.allclose(
    attention_weights,
    cache['blocks.0.attn.hook_pattern'][0],
    atol=1e-4,
)
print("\nShape of attention_weights:", attention_weights.shape)
print("\nIs attention_weights the same as blocks.0.attn.hook_pattern?", pattern_matches)


Shape of attention_weights: torch.Size([16, 8, 8])

Is attention_weights the same as blocks.0.attn.hook_pattern? True


In [33]:
# For every head, mix the value vectors using that head's attention pattern.
weighted_values = [
    head_pattern @ head_values
    for head_pattern, head_values in zip(attention_weights, Value_heads)
]

# Stack with the head axis first, then swap the first two axes to match hook_z's layout
weighted_values_stacked = torch.stack(weighted_values)
weighted_values_final = weighted_values_stacked.transpose(0, 1)

z_matches = torch.allclose(weighted_values_final, cache['blocks.0.attn.hook_z'][0], atol=1e-4)
print("\nIs weighted_values_final the same as blocks.0.attn.hook_z?", z_matches)


Is weighted_values_final the same as blocks.0.attn.hook_z? True


In [34]:
# Project each head's weighted values back to the residual width with that
# head's slice of block 0's W_O.
W_O_block0 = model.W_O[0]
attention_output_heads = [
    head_z @ W_O_block0[head]
    for head, head_z in enumerate(weighted_values_stacked)
]

attention_outputs = torch.stack(attention_output_heads)             # One projected output per head
attention_out_summed = attention_outputs.sum(dim=0)                 # Heads are combined by summation
attention_outputs_final = attention_out_summed + model.b_O[0]       # The output bias is added once, after the sum

attn_out_matches = torch.allclose(attention_outputs_final, cache['blocks.0.hook_attn_out'][0], atol=1e-4)
print("\nIs attention_outputs_final the same as blocks.0.hook_attn_out?", attn_out_matches)


Is attention_outputs_final the same as blocks.0.hook_attn_out? True


In [35]:
# Residual connection: add the attention output back onto the block's input stream.
resid_pre_block0 = cache['blocks.0.hook_resid_pre'][0]
resid_mid = resid_pre_block0 + attention_outputs_final

resid_mid_matches = torch.allclose(resid_mid, cache['blocks.0.hook_resid_mid'][0], atol=1e-4)
print("\nIs resid_mid the same as blocks.0.hook_resid_mid?", resid_mid_matches)


Is resid_mid the same as blocks.0.hook_resid_mid? True


In [36]:
# 1. Normalize
# LayerNorm divides the centred activations by sqrt(biased variance + eps).
# Tensor.std() defaults to the unbiased (N-1) estimator and adds no eps, so it
# only approximates the model's scale; compute the scale explicitly instead.
mean = resid_mid.mean(dim=-1, keepdim=True)
centered_resid_mid = resid_mid - mean
std = torch.sqrt(centered_resid_mid.pow(2).mean(dim=-1, keepdim=True) + model.cfg.eps)

# Confirm that std is identical to cache['blocks.0.ln2.hook_scale']
# (only the scale is exposed as a hook; the activation listing shows no hook for the mean)
print("\nIs std the same as blocks.0.ln2.hook_scale?", torch.allclose(std, cache['blocks.0.ln2.hook_scale'][0], atol=1e-4))

# Divide by the manually computed scale (not the cached one) so that the check
# below validates the whole hand-written LayerNorm.
normalized_resid_mid = centered_resid_mid / std

# 2. Apply weight (gamma) and bias (beta) (Even after normalizing, we still need to apply the learned layernorm weights)
ln2_output = normalized_resid_mid * model.blocks[0].ln2.w + model.blocks[0].ln2.b

# Compare with the output of the second layer norm (ln2) in the model
print("\nIs ln2_output the same as blocks.0.ln2.hook_normalized?", torch.allclose(ln2_output, cache['blocks.0.ln2.hook_normalized'][0], atol=1e-4))


Is std the same as blocks.0.ln2.hook_scale? True

Is ln2_output the same as blocks.0.ln2.hook_normalized? True


In [37]:
# First MLP layer of block 0: project the normalized residual stream up to the
# hidden MLP width and add the input bias.
W_in_block0 = model.W_in[0]
b_in_block0 = model.b_in[0]
mlp_in = torch.matmul(ln2_output, W_in_block0) + b_in_block0

mlp_in_matches = torch.allclose(mlp_in, cache['blocks.0.mlp.hook_pre'][0], atol=1e-4)
print("\nIs mlp_in the same as blocks.0.mlp.hook_pre?", mlp_in_matches)


Is mlp_in the same as blocks.0.mlp.hook_pre? True


In [38]:
# Apply the GELU activation function.
# GPT-2 uses the tanh approximation of GELU (configured as "gelu_new"), not the
# exact erf-based GELU that F.gelu computes by default. Using the exact form
# only matched the cached activations at a loosened tolerance of 1e-3; selecting
# the matching variant lets the check use the same 1e-4 tolerance as every
# other step.
gelu_mode = 'tanh' if model.cfg.act_fn in ('gelu_new', 'gelu_fast') else 'none'
mlp_gelu = F.gelu(mlp_in, approximate=gelu_mode)

print("Is mlp_gelu the same as blocks.0.mlp.hook_post?", torch.allclose(mlp_gelu, cache['blocks.0.mlp.hook_post'][0], atol=1e-4))

Is mlp_gelu the same as blocks.0.mlp.hook_post? True


In [39]:
# Second MLP layer of block 0: project the cached post-activation values back
# down to the residual width and add the output bias.
mlp_post_block0 = cache['blocks.0.mlp.hook_post'][0]
mlp_out = mlp_post_block0 @ model.W_out[0] + model.b_out[0]

mlp_out_matches = torch.allclose(mlp_out, cache['blocks.0.hook_mlp_out'][0], atol=1e-4)
print("\nIs mlp_out the same as blocks.0.hook_mlp_out?", mlp_out_matches)


Is mlp_out the same as blocks.0.hook_mlp_out? True


In [40]:
# Second residual connection: add the MLP output onto the mid-block residual stream.
resid_post = resid_mid + mlp_out

resid_post_block0 = cache['blocks.0.hook_resid_post'][0]
resid_pre_block1 = cache['blocks.1.hook_resid_pre'][0]

# The manual result should match block 0's output, which in turn is block 1's input.
print("\nIs resid_post the same as blocks.0.hook_resid_post?", torch.allclose(resid_post, resid_post_block0, atol=1e-4))
print("\nIs resid_post_0 the same as resid_pre_1?", torch.allclose(resid_post_block0, resid_pre_block1, atol=1e-4))


Is resid_post the same as blocks.0.hook_resid_post? True

Is resid_post_0 the same as resid_pre_1? True


In [65]:
# Get the final residual stream output, i.e. the output of the last transformer block.
# The block index comes from the model config rather than a hard-coded 23, which
# is only valid for 24-layer models.
last_layer = model.cfg.n_layers - 1
final_residual = cache[f'blocks.{last_layer}.hook_resid_post'][0]

# Apply final layer normalization
final_ln_mean = final_residual.mean(dim=-1, keepdim=True)
final_ln_std = cache['ln_final.hook_scale'][0]  # cached scale (denominator) of the final layer norm
final_ln_normalized = (final_residual - final_ln_mean) / final_ln_std

# Apply learned scale and bias of the final layer norm
final_ln_output = final_ln_normalized * model.ln_final.w + model.ln_final.b

# Check if final_ln_output is equivalent to cache['ln_final.hook_normalized']
is_equivalent = torch.allclose(final_ln_output, cache['ln_final.hook_normalized'][0], atol=1e-4)
print("\nIs final_ln_output equivalent to cache['ln_final.hook_normalized']?", is_equivalent)

# Apply the unembedding (a linear map from the residual stream to vocabulary logits)
manual_logits = final_ln_output @ model.W_U + model.b_U   # for gpt2-med b_U is a vector of 0s
# logits carries a leading batch dimension; compare against its single batch row.
print("\nIs manual_logits equivalent to logits?", torch.allclose(manual_logits, logits[0], atol=1e-4))

# Softmax over the vocabulary axis gives next-token probabilities at every position
probs = F.softmax(manual_logits, dim=-1)


Is final_ln_output equivalent to cache['ln_final.hook_normalized']? True

Is manual_logits equivalent to logits? True


In [58]:
# Prediction for the token that follows the prompt: the last row of probs,
# which is a single distribution over the vocabulary.
next_token_probs = probs[-1]

# Five largest probabilities together with their token ids
top_5_probs, top_5_indices = next_token_probs.topk(5)

# Express the probabilities as percentages
top_5_percentages = top_5_probs * 100

# Pair each decoded token string with its percentage
top_5_words_and_probs = [
    (model.to_string(token_id), percentage)
    for token_id, percentage in zip(top_5_indices.tolist(), top_5_percentages.tolist())
]

# Print the table
rule = "-----------------------------"
print("Top 5 most likely next words:")
print(rule)
print("Word\t\tProbability")
print(rule)
for word, prob in top_5_words_and_probs:
    print(f"{word:<15}{prob:.2f}%")

Top 5 most likely next words:
-----------------------------
Word		Probability
-----------------------------
 heavens       28.98%
 world         13.31%
 earth         10.56%
 universe      5.70%
 heaven        4.76%
