In [8]:
import os
import sys
import plotly.express as px
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
from typing import List, Optional, Tuple
import functools
from tqdm import tqdm
from IPython.display import display
import webbrowser
import gdown
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import circuitsvis as cv

# Make sure exercises are in the path
chapter = r"chapter1_transformers"
exercises_dir = Path(f"{os.getcwd().split(chapter)[0]}/{chapter}/exercises").resolve()
section_dir = (exercises_dir / "part2_intro_to_mech_interp").resolve()
if str(exercises_dir) not in sys.path: sys.path.append(str(exercises_dir))

from plotly_utils import imshow, hist, plot_comp_scores, plot_logit_attribution, plot_loss_difference
from part1_transformer_from_scratch.solutions import get_log_probs
import part2_intro_to_mech_interp.tests as tests

# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

device = t.device("cuda" if t.cuda.is_available() else "cpu")

MAIN = __name__ == "__main__"

In [9]:
if MAIN:
    gpt2_small: HookedTransformer = HookedTransformer.from_pretrained("gpt2-small")

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-small into HookedTransformer


In [10]:
if MAIN:
    model_description_text = '''## Loading Models

    HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly. 

    For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!'''

    loss = gpt2_small(model_description_text, return_type="loss")
    print("Model loss:", loss)

Model loss: tensor(4.6055, device='cuda:0')


In [11]:
if MAIN:
    logits: Tensor = gpt2_small(model_description_text, return_type="logits")
    prediction = logits.argmax(dim=-1).squeeze()[:-1]
    idx = gpt2_small.to_tokens(model_description_text, prepend_bos=False) == prediction
    acc = t.sum(idx) / len(prediction)
    correct_tokens = gpt2_small.to_tokens(model_description_text, prepend_bos=False)[idx]
    correct_words = gpt2_small.to_string(correct_tokens)
    print(acc, correct_words)
    
    # YOUR CODE HERE - get the model's prediction on the text

tensor(0.2586, device='cuda:0')   with models. can ofTransformer._NAME model loaded theedTransformer to be and-.  atPT-,,'s the


In [12]:
    
if MAIN:
    gpt2_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
    gpt2_tokens = gpt2_small.to_tokens(gpt2_text)
    gpt2_logits, model = gpt2_small.run_with_cache(gpt2_tokens, remove_batch_dim=True)

In [13]:
if MAIN:
    layer0_pattern_from_cache = gpt2_cache["pattern", 0]
    keys = gpt2_cache["k",0]
    query = gpt2_cache["q", 0]
    pattern = gpt2_cache["pattern", 0]
    
    attn_scores = einops.einsum(keys, query, "... sk n h, ... sq n h -> ... n sq sk") / (gpt2_small.cfg.d_head **0.5)
    
    n, sq, sk = attn_scores.shape[-3:]
    mask = t.tril(t.ones(sq,sk)).to(device)
    causal_attn_scores = mask * attn_scores + (1 - mask) * -1e5
    layer0_pattern_from_q_and_k = t.softmax(causal_attn_scores, dim=-1)
  
    
    
    # YOUR CODE HERE - define `layer0_pattern_from_q_and_k` manually, by manually performing the steps of the attention calculation (dot product, masking, scaling, softmax)
    t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)
    print("Tests passed!")

NameError: name 'gpt2_cache' is not defined

In [None]:

if MAIN:
    print(type(gpt2_cache))
    attention_pattern = gpt2_cache["pattern", 0, "attn"]
    print(attention_pattern.shape)
    gpt2_str_tokens = gpt2_small.to_str_tokens(gpt2_text)

    print("Layer 0 Head Attention Patterns:")
    display(cv.attention.attention_patterns(
        tokens=gpt2_str_tokens, 
        attention=attention_pattern,
        attention_head_names=[f"L0H{i}" for i in range(12)],
    ))
    

In [24]:
if MAIN:
    cfg = HookedTransformerConfig(
        d_model=768,
        d_head=64,
        n_heads=12,
        n_layers=2,
        n_ctx=2048,
        d_vocab=50278,
        attention_dir="causal",
        attn_only=True, # defaults to False
        tokenizer_name="EleutherAI/gpt-neox-20b", 
        seed=398,
        use_attn_result=True,
        normalization_type=None, # defaults to "LN", i.e. layernorm with weights & biases
        positional_embedding_type="shortformer"
    )
    
if MAIN:
    weights_dir = (section_dir / "attn_only_2L_half.pth").resolve()

    if not weights_dir.exists():
        url = "https://drive.google.com/uc?id=1vcZLJnJoYKQs-2KOjkd6LvHZrkSdoxhu"
        output = str(weights_dir)
        gdown.download(url, output)
        
if MAIN:
    model = HookedTransformer(cfg)
    pretrained_weights = t.load(weights_dir, map_location=device)
    model.load_state_dict(pretrained_weights)

if MAIN:
    #text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."
    text = "We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence,[note 1] promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
    logits, cache = model.run_with_cache(text, remove_batch_dim=True)
    

    # YOUR CODE HERE - visualize attention

Using pad_token, but it is not set yet.


In [29]:
if MAIN:
    for layer in [0, 1]:
        print(type(cache))
        attention_pattern = cache["pattern", layer]
        print(attention_pattern.shape)
        model_str_tokens = model.to_str_tokens(text)

        print(f"Layer {layer} Head Attention Patterns:")
        display(cv.attention.attention_patterns(
            tokens=model_str_tokens, 
            attention=attention_pattern,
            attention_head_names=[f"L{layer}H{i}" for i in range(12)],))

<class 'transformer_lens.ActivationCache.ActivationCache'>
torch.Size([12, 71, 71])
Layer 0 Head Attention Patterns:


<class 'transformer_lens.ActivationCache.ActivationCache'>
torch.Size([12, 71, 71])
Layer 1 Head Attention Patterns:


In [26]:
def detector(cache, target) -> List[str]:
    n, sq, sk = cache["pattern", 0].shape
    locations = [f"{x}.{y}" for x in [0,1] for y in range(12)]
    mask = t.tril(t.ones(sq,sk)).to(device)
    target = target * mask
    
    dists = []
    for layer in [0, 1]:
        attention_patterns = cache["pattern", layer]
        for pat_idx,  pattern in enumerate(attention_patterns):
            pattern = pattern * mask
            dist = t.norm(target - pattern)
            dists.append(dist)
            
    val, idx = t.sort(t.tensor(dists))
    return [locations[i] for i in idx[:3]]    

def current_attn_detector(cache: ActivationCache) -> List[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be current-token heads
    '''
    n, sq, sk = cache["pattern", 0].shape
    target = t.eye(sq).to(device)
    return detector(cache, target)

def prev_attn_detector(cache: ActivationCache) -> List[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be prev-token heads
    '''
    n, sq, sk = cache["pattern", 0].shape
    target = t.zeros((sq,sk))
    idx = t.arange(sq-1)
    target[idx+1, idx] = 1
    target = target.to(device)
    return detector(cache, target)
    

def first_attn_detector(cache: ActivationCache) -> List[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be first-token heads
    '''
    n, sq, sk = cache["pattern", 0].shape
    target = t.zeros((sq,sk)).to(device)
    target[:,0]= 1
    return detector(cache, target)


if MAIN:
    print("Heads attending to current token  = ", ", ".join(current_attn_detector(cache)))
    print("Heads attending to previous token = ", ", ".join(prev_attn_detector(cache)))
    print("Heads attending to first token    = ", ", ".join(first_attn_detector(cache)))

Heads attending to current token  =  0.11, 0.9, 1.7
Heads attending to previous token =  0.7, 0.9, 0.4
Heads attending to first token    =  1.4, 0.3, 1.10


<function torch._VariableFunctionsClass.eye>