# Fuzzy Induction Test

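Fuzzy induction is an idea from "In-Context Learning and Induction Heads": instead of completing an exact repeat `[A] [B] ... [A] [B]`, the model completes `[A] [B] ... [A*] [B*]`, where `*` denotes some kind of linguistic similarity (e.g. `A*` is a synonym of `A`).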

This notebook basically redoes the induction-head experiments from ARENA 1.2 (Intro to Mech Interp), with the following modification:


Experiment:
1. Assemble a collection of synonym or near-synonym pairs -- ideally these are all words that are a single token, for the cleanest version
2. Create a random sequence of words, followed by a "repeated sequence" of their synonyms.
3. Run all of the induction head experiments and see what happens!

## Setup

I haven't optimized this; it is mostly copied wholesale from ARENA 1.2. It might be possible to remove some of the imports.

In [None]:
import os
import sys
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
import functools
from tqdm import tqdm
from IPython.display import display
from transformer_lens.hook_points import HookPoint
from transformer_lens import (
    utils,
    HookedTransformer,
    HookedTransformerConfig,
    FactoredMatrix,
    ActivationCache,
)
import circuitsvis as cv

from plotly_utils import imshow, hist, plot_comp_scores, plot_logit_attribution, plot_loss_difference

# Nothing in this notebook needs gradients, so turn autograd off globally to save compute
t.set_grad_enabled(False)

# Pick the device in order of preference: Apple MPS, then CUDA, then plain CPU
if t.backends.mps.is_available():
    device_name = 'mps'
elif t.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'
device = t.device(device_name)
print("using device: ", device)

# True only when this file is executed directly rather than imported
MAIN = __name__ == "__main__"

### Loading 2L Attn-Only Pretrained Transformer

In [None]:
from huggingface_hub import hf_hub_download

# Architecture of the pretrained 2-layer attention-only transformer. These values are
# expected to match the checkpoint loaded below (load_state_dict is called with its
# default strictness, so mismatched parameter names or shapes would raise).
cfg = HookedTransformerConfig(
    d_model = 768,
    d_head = 64,
    n_heads = 12,
    n_layers = 2,
    n_ctx = 2048,
    d_vocab = 50278,
    attention_dir = 'causal',
    attn_only = True,  # a real bool: the string 'True' only worked because non-empty strings are truthy
    tokenizer_name = 'EleutherAI/gpt-neox-20b',
    seed = 398,
    use_attn_result = True,
    normalization_type = None,  # default would be 'LN', which is layernorm
    # NOTE(review): 'shortformer' is believed to feed positional information into the
    # queries and keys only (not the values), which reportedly makes induction heads
    # more likely to form -- confirm against the TransformerLens documentation.
    positional_embedding_type = 'shortformer'
)

# Location of the pretrained weights on the Hugging Face Hub
REPO_ID = "callummcdougall/attn_only_2L_half"
FILENAME = "attn_only_2L_half.pth"

# Downloads the checkpoint and returns its local path
weights_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

# Build the model from the config, then overwrite its parameters with the pretrained ones.
# weights_only=True restricts unpickling to tensors/plain containers.
model = HookedTransformer(cfg)
pretrained_weights = t.load(weights_path, map_location=device, weights_only=True)
model.load_state_dict(pretrained_weights)

In [19]:
def get_log_probs(
    logits: Float[Tensor, "batch posn d_vocab"], tokens: Int[Tensor, "batch posn"]
) -> Float[Tensor, "batch posn-1"]:
    """
    Return the log-probability the model assigned to each token that actually came next.

    The prediction made at position i is scored against the token at position i + 1, so
    the final prediction (which has no following token) and the first token (which has
    no preceding prediction) are dropped. The result is one position shorter than the input.
    """
    # Tokens at positions 1..end, with a trailing axis so they can index the vocab dim
    targets = tokens[:, 1:, None]
    # Normalise over the vocabulary, then keep the predictions at positions 0..end-1
    predictions = logits.log_softmax(dim=-1)[:, :-1]
    # Select the log-prob of the true next token and drop the size-1 vocab axis
    return predictions.gather(dim=-1, index=targets)[..., 0]

## Step 1: Creating list of synonym pairs

Steps:
1. Make a list of all the tokens of the model that are English words
2. Feed this list to Claude 3.5 Sonnet and ask for synonym pairs
3. Process Claude's list of synonym pairs

#### Creating word list

In [None]:
# load word list
with open('./dictionary_large.txt', 'r') as f:
    word_set = set(f.read().splitlines())
print(f"loaded list of {len(word_set)} English words")

# Byte-level BPE vocabularies (GPT-2 / GPT-NeoX style) represent a leading space with the
# single character U+0120 ("G with dot above"). It is written as an escape so the marker
# cannot be corrupted when this file is re-encoded: the mis-decoded form is a
# two-character string, which can never equal the single character token[0], so no
# leading-space token would ever be detected.
space_marker = "\u0120"

# get every token string, then sort the English words by whether they carry a leading space
all_tokens = model.tokenizer.convert_ids_to_tokens(range(model.cfg.d_vocab))
word_tokens_with_leading_space = []
word_tokens_without_leading_space = []
for token in all_tokens:
    if not token:
        # some ids may come back empty/None (presumably ids past the tokenizer's
        # vocabulary -- verify); they cannot be words
        continue
    if token.startswith(space_marker):
        word = token[len(space_marker):]  # strip leading space marker
        if word in word_set:
            word_tokens_with_leading_space.append(word)
    elif token in word_set:
        word_tokens_without_leading_space.append(token)

print(f"Created lists of {len(word_tokens_with_leading_space)} words with leading space and {len(word_tokens_without_leading_space)} without.")

#### Turning Claude output into synonym pairs

In [None]:
# Synonym pairs generated by Claude 3.5 Sonnet, stored one pair per line
synonym_pair_strings = Path('./synonym_pairs.txt').read_text().splitlines()
print(synonym_pair_strings[:10])

In [None]:
# Set versions of the two word lists (for fast membership tests), plus the combined vocabulary
with_leading_space_set = set(word_tokens_with_leading_space)
without_leading_space_set = set(word_tokens_without_leading_space)
word_tokens_set = with_leading_space_set | without_leading_space_set

def token_version(word: str) -> str:
    '''Return `word` as a token string, restoring the leading space where one belongs.

    A word found among the leading-space word tokens gets a single space prepended;
    any other word is returned unchanged.
    '''
    # Look the word up in the precomputed set instead of the list it was built from:
    # same answer, but a constant-time check rather than a linear scan on every call.
    if word in with_leading_space_set:
        return ' ' + word
    return word

# Build the list of usable synonym pairs. Each entry of synonym_pair_strings is expected
# to look like "word1, word2"; a pair is kept only if both words are known word tokens.
synonym_pairs = []
for word_pair in synonym_pair_strings:
    if not word_pair.strip():
        continue  # tolerate blank lines in the input file
    # strip() removes the space after the comma (from Claude formatting) without eating
    # the first letter of word2 when that space is missing. A line that does not have
    # exactly two comma-separated fields still raises ValueError here.
    word1, word2 = (part.strip() for part in word_pair.split(','))
    if word1 in word_tokens_set and word2 in word_tokens_set:
        synonym_pairs.append((token_version(word1), token_version(word2)))
print(f"List of {len(synonym_pairs)} pairs, starting with: ", synonym_pairs[:10])

# check that all words are a single token long
for word1, word2 in synonym_pairs:
    for word in (word1, word2):
        pieces = model.tokenizer.tokenize(word)
        assert len(pieces) == 1, f"{word!r} tokenizes into {len(pieces)} tokens: {pieces}"

## Step 2: Run and cache model

In [None]:
# def generate_synonym_string(
#         model: HookedTransformer, seq_len: int, batch: int = 1
# ) -> Int[Tensor, "batch full_seq_len"]:
#     """
#     Generates a sequence of repeated random tokens.
#     Output is rep_tokens: [batch, 1 + 2*seq_len]
#     """
#     d_vocab = model.cfg.d_vocab
#     prefix = (t.ones(batch, 1) * model.tokenizer.bos_token_id).long()
#     rand_tokens = t.randint(0, d_vocab, size=(batch, seq_len))
#     rep_tokens = einops.repeat(rand_tokens, 'batch seq -> batch (2 seq)')
#     rep_tokens = t.cat([prefix, rep_tokens], dim=1)
#     return rep_tokens

# print(model.to_str_tokens(generate_repeated_tokens(model, 5, 2)[1]))

# def run_and_cache_model_repeated_tokens(
#         model: HookedTransformer, seq_len: int, batch: int = 1
# ) -> tuple[Tensor, Tensor, ActivationCache]:
#     """
#     Generates a sequence of repeated random tokens, and runs the model on it, returning logits, tokens and cache

#     Should use the `generate_repeated_tokens` function above

#     Outputs are:
#         rep_tokens: [batch, 1+2*seq_len]
#         rep_logits: [batch, 1+2*seq_len, d_vocab]
#         rep_cache: The cache of the model run on rep_tokens
#     """
#     tokens = generate_repeated_tokens(model, seq_len, batch).to(device)
#     logits, cache = model.run_with_cache(tokens, return_type='logits')
#     return tokens, logits, cache
    
# seq_len = 50
# batch = 1
# (rep_tokens, rep_logits, rep_cache) = run_and_cache_model_repeated_tokens(model, seq_len, batch)
# rep_cache.remove_batch_dim()
# rep_str = model.to_str_tokens(rep_tokens)
# model.reset_hooks()
# log_probs = get_log_probs(rep_logits, rep_tokens).squeeze()

# print(f"Performance on the first half: {log_probs[:seq_len].mean():.3f}")
# print(f"Performance on the second half: {log_probs[seq_len:].mean():.3f}")

# plot_loss_difference(log_probs, rep_str, seq_len)