# Test `<BOS>`

In [62]:
from transformer_lens.cautils.notebook import *

from transformer_lens.rs.callum2.ioi_and_bos.ioi_functions import (
    project,
)
from transformer_lens.rs.callum2.utils import (
    project,
    get_effective_embedding,
)

In [None]:

# Load GPT-2 small with centred unembed / writing weights and folded LayerNorm.
model = HookedTransformer.from_pretrained(
    "gpt2-small",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device="cpu" # "cuda"
    # fold value bias?
)
model.set_use_split_qkv_input(False)
model.set_use_attn_result(True)

# Compute the effective embeddings exactly once. The original cell called
# get_effective_embedding twice with identical arguments and threw the first
# result away, doubling the cost of this cell for no benefit.
# NOTE(review): assumes get_effective_embedding does not mutate `model` - confirm.
W_EE_dict = get_effective_embedding(model, use_codys_without_attention_changes=False)

# Drop whatever the loading / embedding computation printed.
clear_output()

# Embedding variants, named after the keys of the returned dict.
W_EE = W_EE_dict['W_E (including MLPs)']
W_E = W_EE_dict['W_E (no MLPs)']
W_EE0 = W_EE_dict['W_E (only MLPs)']
# Difference between the "including MLPs" embedding and the raw embedding.
W_EE0A = W_EE - W_E

In [63]:
# Row of each embedding variant that belongs to the <BOS> token.
bos_token_id = model.tokenizer.bos_token_id
BOS_embed_W_E, BOS_embed_W_EE, BOS_embed_W_EE0, BOS_embed_W_EE0A = (
    embedding[bos_token_id] for embedding in (W_E, W_EE, W_EE0, W_EE0A)
)

assert BOS_embed_W_E.shape == (model.cfg.d_model,)

# Project every <BOS> embedding through the key matrix of head 10.7.
W_K_10_7 = model.W_K[10, 7]
BOS_embed_key_W_E, BOS_embed_key_W_EE, BOS_embed_key_W_EE0, BOS_embed_key_W_EE0A = (
    bos_embed @ W_K_10_7
    for bos_embed in (BOS_embed_W_E, BOS_embed_W_EE, BOS_embed_W_EE0, BOS_embed_W_EE0A)
)

# Query bias of the same head; the cells below compare each key against it.
query_bias = model.b_Q[10, 7]

assert BOS_embed_key_W_E.shape == query_bias.shape == (model.cfg.d_head,)

In [64]:
# Cosine similarity between head 10.7's query bias and the <BOS> key built from
# the 'W_E (including MLPs)' effective embedding.
t.cosine_similarity(BOS_embed_key_W_EE, query_bias, dim=0)

tensor(-0.1791)

In [65]:
# Same comparison, with the <BOS> key built from the 'W_E (no MLPs)' embedding.
t.cosine_similarity(BOS_embed_key_W_E, query_bias, dim=0)

tensor(0.3744)

In [66]:
# Same comparison, with the <BOS> key built from the 'W_E (only MLPs)' embedding.
t.cosine_similarity(BOS_embed_key_W_EE0, query_bias, dim=0)

tensor(-0.3941)

In [67]:
# Same comparison, with the <BOS> key built from W_EE0A (= W_EE - W_E).
t.cosine_similarity(BOS_embed_key_W_EE0A, query_bias, dim=0)

tensor(-0.1927)

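Some of these are pretty extreme! The cosine similarity with the query bias ranges from about -0.39 (MLP-only embedding) to +0.37 (raw `W_E`), so even the sign of the alignment depends on which effective embedding we use.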

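But wait, there's more. I know exactly what the value of the residual stream at the BOS token is, so instead of approximating it with an effective embedding I can run the model on `<BOS>` and read off the layer-10 key for head 10.7 directly.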

In [84]:
# Two single-token prompts (batch of 2, sequence length 1): <BOS> and "the".
bos_and_the_input = t.tensor(
    [model.tokenizer.bos_token_id, model.to_single_token("the")]
).unsqueeze(1)

# Only cache the layer-10 key activations; no logits / loss are requested.
layer_10_key_hook = utils.get_act_name("k", 10)
_, cache = model.run_with_cache(
    bos_and_the_input,
    return_type=None,
    names_filter=lambda name: name == layer_10_key_hook,
)

# Key of head 7 at sequence position 0, one row per prompt.
k = cache["k", 10][:, 0, 7]
assert k.shape == (2, model.cfg.d_head,)

In [85]:
# Row 0 of `k` is the <BOS>-only prompt: actual head-10.7 key vs. the query bias.
t.cosine_similarity(k[0], query_bias, dim=0)

tensor(0.4672)

In [86]:
# Row 1 of `k` is the "the"-only prompt, for comparison with the <BOS> value.
t.cosine_similarity(k[1], query_bias, dim=0)

tensor(0.4701)

In [95]:
# One single-token prompt per vocabulary entry: shape (d_vocab, 1).
all_tokens = t.arange(model.cfg.d_vocab)[:, None]

# Cache nothing but the layer-10 keys.
layer_10_key_hook = utils.get_act_name("k", 10)
_, cache = model.run_with_cache(
    all_tokens,
    return_type=None,
    names_filter=lambda name: name == layer_10_key_hook,
)

# Head-7 key at position 0 for every token in the vocabulary.
k = cache["k", 10][:, 0, 7]
assert k.shape == (model.cfg.d_vocab, model.cfg.d_head)

In [97]:
# Tile the query bias so there is one copy per vocabulary token, then compare
# each token's key against it row by row.
query_bias_per_token = einops.repeat(
    query_bias, "d_head -> N d_head", N=model.cfg.d_vocab
)
cos_sim = t.cosine_similarity(k, query_bias_per_token, dim=1)

In [101]:
# Plot the per-token cosine similarities (one value per vocabulary entry).
# `hist` comes from the notebook's star import - presumably a histogram helper.
hist(cos_sim, title="Cos sim of 0th token (key-side) with query bias for 10.7<br>over all key-side tokens in vocab", labels={"x": "Cos sim"})

In [129]:
N = 2500

# Zero-indexed sequence positions whose head-10.7 keys are compared with the query bias.
key_positions = (0, 9, 19, 39)

# Sample the random prompts from a dedicated, seeded generator so the histograms
# below are reproducible on Restart & Run All, without touching the global RNG
# state that other cells may rely on.
random_tokens_rng = t.Generator().manual_seed(0)
random_tokens = t.randint(0, model.cfg.d_vocab, (N, 40), generator=random_tokens_rng)

# Only the layer-10 keys are cached; no logits / loss are requested.
layer_10_key_hook = utils.get_act_name("k", 10)
_, cache = model.run_with_cache(
    random_tokens,
    return_type=None,
    names_filter=lambda name: name == layer_10_key_hook,
)

# Head-7 keys at each chosen position (k0..k3 names kept for any later cells).
k0, k1, k2, k3 = (cache["k", 10][:, pos, 7] for pos in key_positions)
assert k0.shape == (N, model.cfg.d_head)

# One copy of the query bias per sampled prompt.
query_bias_rep = einops.repeat(query_bias, "d_head -> N d_head", N=N)

cos_sim_0, cos_sim_1, cos_sim_2, cos_sim_3 = (
    t.cosine_similarity(k_at_pos, query_bias_rep, dim=1)
    for k_at_pos in (k0, k1, k2, k3)
)

# One histogram per position; titles match the original hand-written ones.
for pos, cos_sim_at_pos in zip(key_positions, (cos_sim_0, cos_sim_1, cos_sim_2, cos_sim_3)):
    hist(cos_sim_at_pos, title=f"Cos sim of {pos}th token (key-side) with query bias for 10.7<br>over {N} random key-side tokens", labels={"x": "Cos sim"})

In [125]:
from transformer_lens.cautils.notebook import *
from transformer_lens.rs.callum2.ioi_functions import (
    attn_scores_as_linear_func_of_queries,
)

batch = 150

ioi_dataset, ioi_cache = generate_data_and_caches(batch, model=model, only_ioi=True, prepend_bos=True)

# Output of head 9.9 at each prompt's "end" position, rescaled to unit norm.
result = ioi_cache["result", 9][range(batch), ioi_dataset.word_idx["end"], 9]
result_normalized = result / result.norm(dim=-1, keepdim=True)

# Unembedding rows for each prompt's IO and S token.
io_dir = model.W_U.T[ioi_dataset.io_tokenIDs]
s_dir = model.W_U.T[ioi_dataset.s_tokenIDs]

# Query weights / bias of head 10.7, with the bias tiled once per prompt.
W_Q = model.W_Q[10, 7]
b_Q = model.b_Q[10, 7]
b_Q_rep = einops.repeat(b_Q, "d_head -> batch d_head", batch=batch)

# Split the 9.9 output relative to the IO unembedding direction, map both parts
# through W_Q, then split the IO-perpendicular query again relative to b_Q.
# NOTE(review): assumes project(x, dir) returns (component along dir, remainder) - confirm.
result_in_io_dir, result_in_io_perpdir = project(result_normalized, io_dir)
result_in_io_dir_as_query = result_in_io_dir @ W_Q
result_in_io_perpdir_as_query = result_in_io_perpdir @ W_Q
result_in_io_perpdir_bQ_dir_as_query, result_in_io_perpdir_bQ_perpdir_as_query = project(result_in_io_perpdir_as_query, b_Q_rep)

# NOTE(review): these keys are read from layer 9, head 9, while the queries above
# use head 10.7's W_Q / b_Q - confirm this is intended rather than ["k", 10][..., 7].
k_IO = ioi_cache["k", 9][range(batch), ioi_dataset.word_idx["IO"], 9]
k_S1 = ioi_cache["k", 9][range(batch), ioi_dataset.word_idx["S1"], 9]

# Each score is computed once (the original cell repeated these three lines verbatim).
# NOTE(review): `q @ k_IO.T` is a (batch, batch) matrix, so .mean() averages over
# every query/key pairing across prompts, not only the matching prompt - confirm.
avg_scores_from_io_dir = (result_in_io_dir_as_query @ k_IO.T).mean()
avg_scores_from_io_perpdir_bQ_dir = (result_in_io_perpdir_bQ_dir_as_query @ k_IO.T).mean()
avg_scores_from_io_perpdir_bQ_perpdir = (result_in_io_perpdir_bQ_perpdir_as_query @ k_IO.T).mean()

# Unfinished follow-up (kept from the original cell): cosine similarity against k_IO - k_S1.
# io_cos_sim = t.cosine_similarity(result_in_io_dir_as_query, k_IO - k_S1, dim=-1).mean()
# io_perp_cos_sim = t.cosine_similarity(result_in_io_perpdir_as_query, k_IO - k_S1, dim=-1).mean()

# print(f"Cos sim with IO component: {io_cos_sim:.4f}")
# print(f"Cos sim with IO-perp component: {io_perp_cos_sim:.4f}\n")

# print(f"Cos sim with IO component (subtract S1 baseline): {io_cos_sim:.4f}")
# print(f"Cos sim with IO-perp component (subtract S1 baseline): {io_perp_cos_sim:.4f}\n")
# print(f"Cos sim with IO-perp and b_Q-perp component (subtract S1 baseline): {io_perp_cos_sim:.4f}\n")

# Displayed as the cell output.
avg_scores_from_io_dir, avg_scores_from_io_perpdir_bQ_dir, avg_scores_from_io_perpdir_bQ_perpdir

(tensor(-0.0006), tensor(-0.0046), tensor(-0.0036))

In [119]:
# Debug probe: shape of the b_Q-aligned part of the IO-perpendicular query.
result_in_io_perpdir_bQ_dir_as_query.shape

torch.Size([150, 64])

In [120]:
# Debug probe: shape of the per-prompt key vectors taken at the IO position.
k_IO.shape

torch.Size([150, 64])

In [116]:
# Debug probe: shape of the IO-perpendicular query. The original line used the
# stale name `result_in_io_perp_dir_as_query`, which no cell defines any more
# and which therefore raises NameError on a fresh kernel.
result_in_io_perpdir_as_query.shape

torch.Size([150, 64])