In [21]:
from transformer_lens import HookedTransformer
from einops import einsum
import torch
import pandas as pd

In [2]:
model = HookedTransformer.from_pretrained("tiny-stories-1M")

  return self.fget.__get__(instance, owner)()


Loaded pretrained model tiny-stories-1M into HookedTransformer


# RUNNING MODEL WITH ACTIVATIONCACHE #
# running with activations - returns activation values collected durning inference #

In [3]:
logits, cache = model.run_with_cache("Why did the chicken cross the")

In [4]:
print(logits.argmax(dim=-1).squeeze())
[model.to_string(x) for x in logits.argmax(dim=-1).squeeze()]

tensor([ 628,  750,  345, 1310,  467,   30, 2323], device='cuda:0')


['\n\n', ' did', ' you', ' little', ' go', '?', ' ground']

In [5]:
""" ActivationCache - dictionary of activation tensors of different layers from different blocks """ 

print(list(cache.keys())[:10])    # total keys(hooks) = approx 140
# print(cache.values())
# print(cache.items())   # (key, value) tuples
# print(cache.cache_dict["hook_embed"])   # cache dictionary (keys: values) "access individual activation this way!"

['hook_embed', 'hook_pos_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern']


# DECOMPOSING THE RESIDUAL STREAM #
# Decomposes the residual stream - returns the layer outputs that are being added into the residual stream #

In [6]:
decomposed_stream, lables = cache.decompose_resid(return_labels=True, mode="all")

In [7]:
print(decomposed_stream.size())   # (n_lables, batch_size, seq_len, d_model)

torch.Size([18, 1, 7, 64])


In [8]:
print(lables)

['embed', 'pos_embed', '0_attn_out', '0_mlp_out', '1_attn_out', '1_mlp_out', '2_attn_out', '2_mlp_out', '3_attn_out', '3_mlp_out', '4_attn_out', '4_mlp_out', '5_attn_out', '5_mlp_out', '6_attn_out', '6_mlp_out', '7_attn_out', '7_mlp_out']


In [9]:
""" 
Takes a residual stack (typically the residual stream decomposed by components), and calculates how
much each item in the stack "contributes" to specific tokens.

    It does this by:
        1. Getting the residual directions of the tokens (i.e. reversing the unembed)
        2. Taking the dot product of each item in the residual stack (activations), with the token residual
            directions.
            
    <<< Algorithm >>>
    
        logit_directions = tokens_to_residual_directions(tokens)   # map tokens to a tensor with the unembedding vector for those tokens.
        scaled_directions = layer_norm(logit_directions)
        layer_attrs = (scaled_directions * logit_direction).sum(dim=-1)
    
"""

assumed_prediction = " road"   # our token which is a "close possibility"
logit_attributions = cache.logit_attrs(decomposed_stream, assumed_prediction)   # calculating logit attributions

In [10]:
logit_attributions.size()  # (n_layers, batch_size, seq_len)

torch.Size([18, 1, 7])

In [11]:
logit_attributions.squeeze().sum(dim=1)   # attribution of each layer towards the assumed_prediction

tensor([ 49.0849,   6.0892, -21.5946,  49.0395, -37.6695,   9.1405, -49.2915,
          8.4667,   1.7180,  10.5615, -10.4780,  20.0837, -19.1105,  23.8754,
        -33.9309,  13.6613, -13.0773,  13.5248], device='cuda:0',
       grad_fn=<SumBackward1>)

In [12]:
most_imp_component_idx = logit_attributions.argmax()
print(lables[most_imp_component_idx])

1_mlp_out


In [13]:
full_resid, labels = cache.get_full_resid_decomposition(return_labels=True)   # for more granular decomposition

Tried to stack head results when they weren't cached. Computing head results now


# ACCUMULATING THE RESIDUAL STREAM #
# accumulating residual stream - returns residual stream after every block #

In [25]:
acc_resid, acc_labels = cache.accumulated_resid(layer=8, incl_mid=False, apply_ln=True, mlp_input=False, return_labels=True)
# useful parameters - layer(int), incl_mid(bool), apply_ln(bool), mlp_input(bool), return_labels(bool)
# incl_mid ==> returns "resid_mid" for all previous layers.
# mlp_input ==> whether to include "resid_mid" for the current layer.

In [26]:
print(acc_resid.size())
print(acc_labels)

torch.Size([9, 1, 7, 64])
['0_pre', '1_pre', '2_pre', '3_pre', '4_pre', '5_pre', '6_pre', '7_pre', 'final_post']


In [32]:
last_token_accum = acc_resid[:, 0, -1, :]   # accumulated stream values for the last token

In [33]:
print(last_token_accum.size())
print(model.W_U.size())

torch.Size([9, 64])
torch.Size([64, 50257])


In [30]:
layers_unembedded = einsum(
    last_token_accum,
    model.W_U,
    "layer d_model, d_model d_vocab -> layer d_vocab",
)

In [43]:
sorted_indices = torch.argsort(layers_unembedded, dim=1, descending=True)
rank_answer = (sorted_indices == model.to_single_token(" ground")).nonzero(as_tuple=True)[1]

In [45]:
rank_answer   # rank of last token at each layer

tensor([1999,  297,  124,  177,    9,   11,    3,   17,    2], device='cuda:0')

In [None]:
"""
CLASS:
class transformer_lens.ActivationCache.ActivationCache(cache_dict: Dict[str, Tensor], model, has_batch_dim: bool = True)

METHODS:
accumulated_resid
apply_ln_to_stack
apply_slice_to_batch_dim
compute_head_results
decompose_resid
get_full_resid_decomposition
get_neuron_results
items
keys
values
logit_attrs
remove_batch_dim
stack_activation
stack_head_results
stack_neuron_results
to
toggle_autodiff

"""