In [6]:
from codebook_features import models
import numpy as np
import pickle
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM, AutoTokenizer, pipeline, set_seed
from matplotlib import pyplot as plt
from termcolor import colored

# Tokenizer of the Pythia-70m (deduped) checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
# Plain pretrained model, later used as the baseline for generation.
orig_model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
# Same base checkpoint passed through `models.wrap_codebook`, loading the
# "taufeeque/best-cb-model" pretrained weights.
model = models.wrap_codebook(
    model_or_path="EleutherAI/pythia-70m-deduped",
    pretrained_path="taufeeque/best-cb-model",
)

In [2]:
# Run directory holding the dumped artifacts
# (an earlier run used "2023-02-17_09-27-39/", with activations stored as cb_acts.npy).
base_dir = "2023-03-02_21-17-36/"

tokens = np.load(f"{base_dir}tokens.npy", allow_pickle=True)
metrics = np.load(f"{base_dir}metrics.npy", allow_pickle=True)
# NOTE(review): unpickling executes arbitrary code; only load dumps you produced yourself.
with open(f"{base_dir}cb_acts.pkl", "rb") as f:
    cb_acts_complete = pickle.load(f)
# Upper bound on code ids, used to size the per-code lists built later.
num_codes = 10_000

In [3]:
# tokens = np.load("2023-02-09_21-37-58/tokens.npy", allow_pickle=True)
# cb_acts_complete = np.load("2023-02-09_21-37-58/cb_acts.npy", allow_pickle=True).item()
# metrics = np.load("2023-02-09_21-37-58/metrics.npy", allow_pickle=True)
# num_codes = 1000

In [4]:
# Restrict tokens and every codebook's activations to the first `samples` rows.
samples = 10_000
tokens_trunc = tokens[:samples]
cb_acts = {cb_name: acts[:samples] for cb_name, acts in cb_acts_complete.items()}


In [5]:
print(metrics)
print(list(cb_acts.keys()))
print(cb_acts["layer0_attn_ccb0"].shape)

{'eval_loss': 3.4699559211730957, 'eval_accuracy': 0.3778763440860215, 'eval_runtime': 289.5455, 'eval_samples_per_second': 34.537, 'eval_steps_per_second': 1.081, 'eval_dead_code_fraction/layer0': 0.0131, 'eval_MSE/layer0': 10.428322392272948, 'eval_mean_norm/layer0': 22.481287002563477, 'eval_max_norm/layer0': 27.701583862304688, 'eval_dead_code_fraction/layer1': 0.0859, 'eval_MSE/layer1': 29.021728021240225, 'eval_mean_norm/layer1': 27.37179946899414, 'eval_max_norm/layer1': 38.93690872192383, 'eval_dead_code_fraction/layer2': 0.4581, 'eval_MSE/layer2': 213.85468557128917, 'eval_mean_norm/layer2': 36.73466873168945, 'eval_max_norm/layer2': 65.44490814208984, 'eval_dead_code_fraction/layer3': 0.3518, 'eval_MSE/layer3': 81.8039717895508, 'eval_mean_norm/layer3': 33.063480377197266, 'eval_max_norm/layer3': 59.71989822387695, 'eval_dead_code_fraction/layer4': 0.2956, 'eval_MSE/layer4': 11810.415718750002, 'eval_mean_norm/layer4': 48.984153747558594, 'eval_max_norm/layer4': 74.9896545410

In [None]:
# Count how often each code id occurs in one codebook's activations.
k, v = np.unique(cb_acts["layer0_attn_ccb1"], return_counts=True)

# Usage per code id (log-scaled counts).
fig, ax = plt.subplots()
ax.bar(k, v)
ax.set_yscale('log')
ax.set(title="Code usage | layer0_attn_ccb1", xlabel="code id", ylabel="count")
plt.show()

# Same counts, ordered from most to least frequent code.
v_sortind = np.argsort(-v)
k, v = k[v_sortind], v[v_sortind]
fig, ax = plt.subplots()
ax.bar(range(len(v)), v)
ax.set_yscale('log')
ax.set(title="Code usage (sorted) | layer0_attn_ccb1", xlabel="rank by frequency", ylabel="count")
plt.show()

In [32]:

def features_to_tokens(cb_key,n=10):
    """Group token contexts by the code id active at each position.

    Args:
        cb_key: key into the global ``cb_acts`` dict selecting one codebook's
            activations (indexed here as ``[example, position, slot]``).
        n: context size used for the slices taken from the global ``tokens``.

    Returns:
        A list with ``num_codes`` entries; entry ``c`` holds one
        ``(past, future)`` tuple for every ``(i, j, k)`` where
        ``codebook_ids[i, j, k] == c``. ``past`` is
        ``tokens[i, max(0, j-n):j+1]`` and ``future`` is ``tokens[i, j+1:j+n]``.
    """
    codebook_ids = cb_acts[cb_key]
    features_tokens = [[] for _ in range(num_codes)]
    for i in tqdm(range(codebook_ids.shape[0])):
        for j in range(codebook_ids.shape[1]):
            # The context depends only on (i, j), so build it once per position
            # instead of once per active code.
            # NOTE(review): `future` holds at most n-1 tokens while `past` holds
            # up to n tokens plus the current one — confirm this is intended.
            past_future_tuple = (tokens[i, max(0, j-n):j+1], tokens[i, j+1:j+n])
            for k in range(codebook_ids.shape[2]):
                features_tokens[codebook_ids[i,j,k]].append(past_future_tuple)

    return features_tokens

def features_to_tokens_fast(cb_key,n=10):
    """Index every (example, position) pair by the code ids active there.

    Args:
        cb_key: key into the global ``cb_acts`` dict.
        n: unused; kept so the signature matches ``features_to_tokens``.

    Returns:
        A list with ``num_codes`` entries; entry ``c`` lists the
        ``(example, position)`` tuples at which code ``c`` was active.
    """
    codebook_ids = cb_acts[cb_key]
    features_tokens = [[] for _ in range(num_codes)]
    for example in tqdm(range(codebook_ids.shape[0])):
        for position in range(codebook_ids.shape[1]):
            location = (example, position)
            for slot in range(codebook_ids.shape[2]):
                code = codebook_ids[example, position, slot]
                features_tokens[code].append(location)

    return features_tokens


In [94]:
tokens[0, 1020:1030]

array([5234, 5971,  407, 6890])

In [71]:
cb_key = "layer0_attn_ccb0"
codebook_ids = cb_acts[cb_key]
# idxs = codebook_ids[:, :, :, None] == np.arange(1000)[None, None, None, :]
codebook_ids.shape
codebook_ids.max()

9999

In [33]:
ft_tkns = features_to_tokens_fast("layer0_attn_ccb0")

100%|█████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:27<00:00, 25.80it/s]


In [39]:
ft_tkns2 = features_to_tokens_fast("layer2_attn_ccb0")

100%|█████████████████████████████████████████████████████████████████████████████| 10000/10000 [07:50<00:00, 21.25it/s]


In [40]:
ft_tkns5 = features_to_tokens_fast("layer5_attn_ccb0")

100%|█████████████████████████████████████████████████████████████████████████████| 10000/10000 [08:57<00:00, 18.59it/s]


In [36]:
def tkn_print(ll, n=3):
    """Print each (example, position) hit with the hit token shown in red.

    Args:
        ll: iterable of ``(i, j)`` pairs indexing the global ``tokens`` array.
        n: window size; up to ``n`` tokens before and ``n - 1`` tokens after
            the hit are decoded around it.

    Printing stops after the entry with index 101; a separator line follows.
    """
    for idx, (example, position) in enumerate(ll):
        print(f"{idx}: example - {example}, token - {position}")
        row = tokens[example]
        before = tokenizer.decode(row[max(0, position - n):position])
        current = colored(tokenizer.decode(row[position]), 'red')
        after = tokenizer.decode(row[position + 1:position + n])
        # One line of output: context, highlighted token, context.
        print(before, current, after, sep='')
        if idx > 100:
            break
    print("************************************************************")
    

In [37]:
def print_ft_tkns(ft_tkns, n=3):
    """Show example contexts for frequently used codes.

    Args:
        ft_tkns: sequence where entry ``c`` lists the hits of code ``c``.
        n: context window forwarded to ``tkn_print``.

    Only codes with more than 1000 hits are printed, and codes with an index
    above 1001 are never inspected.
    """
    for code_id, hits in enumerate(ft_tkns):
        if len(hits) > 1000:
            print(f"code - {code_id}, mapped to {len(hits)} tokens")
            tkn_print(hits, n)
        if code_id > 1000:
            break

In [None]:
print_ft_tkns(ft_tkns,n=10)

In [None]:
print_ft_tkns(ft_tkns2,n=10)

In [None]:
print_ft_tkns(ft_tkns5,n=10)

In [None]:
import matplotlib.pyplot as plt

def token_to_dist_of_fts(token, layer_key):
    """Plot which code ids are active wherever ``token`` occurs.

    Args:
        token: vocabulary token string, looked up with
            ``tokenizer.convert_tokens_to_ids``.
        layer_key: key into the global ``cb_acts`` dict.

    Prints the token id and the shape of the selected activations, then shows
    a histogram of the first (up to) three activation columns.
    """
    token_idx = tokenizer.convert_tokens_to_ids(token)
    print(token_idx)
    layer_cb_acts = cb_acts[layer_key]
    # `cb_acts` only holds the first `samples` rows while `tokens` is the full
    # array; trim `tokens` so the boolean mask lines up with the activations.
    token_present = tokens[:len(layer_cb_acts)] == token_idx
    fts_of_token = layer_cb_acts[token_present]
    print(fts_of_token.shape)
    top_fts = fts_of_token[:, :3]
    # One legend label per column actually plotted ("1", "2", "3" when three exist).
    labels = [str(rank + 1) for rank in range(top_fts.shape[1])]
    plt.hist(top_fts, bins=100, label=labels)
    plt.legend()
    plt.title(f'Features Queried | Layer = {layer_key}')
    plt.show()

# Same token, first attention codebook of each of the six layers.
for layer_idx in range(6):
    token_to_dist_of_fts('ĠAfrica', f'layer{layer_idx}_attn_ccb0')

In [2]:
model = models.wrap_codebook(model_or_path="EleutherAI/pythia-70m-deduped", pretrained_path="taufeeque/pythia-attn-cb")

In [137]:
# Prompts fed to each text-generation pipeline below.
prefixes = [
    "Hello I am",
    "What is the",
    "The largest",
    "One of the most",
    "Your time is limited",
    "Keep your friends close, but",
    "Once upon a time,",
]
# prefixes = [""]

In [139]:
# Sample three continuations per prompt from the unmodified baseline model.
orig_generator = pipeline('text-generation', model=orig_model, tokenizer=tokenizer)
for prefix in prefixes:
    for i in range(3):
        completions = orig_generator(prefix, max_length=50, do_sample=True, temperature=0.7)
        print(completions[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Hello I am a non-member of a team in the USA.

My question is: How do I make a roster in one team?

I am not a member of the Team in the USA. I am in the USA and


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Hello I am a newbie with a background in UI.
<xmpp> i'm using a simple text editor but i don't use the web page when it's a text editor
<xmpp> and i've already found


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Hello I am getting this strange behavior. I am using the below code:
    <script src="https://code.jquery.com/jquery-2.1.1.min.js"></script>

<script>
    


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is the first half hour of the second hour? What is the third hour of the fourth hour? A second half hour of the fifth hour is the sixth hour of the fifth hour.

With the last hour of the seventh, the first


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is the remainder when (4 - 2)*-5*2 is divided by 2?
4
Let u = -11 + 17. Let w = u - -10. Calculate the remainder when 10 is divided by w.
2



Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is the common denominator of 101/14 and (3/7)/((-8)/6)?
5
Let p = -7 + 12. Suppose 0*o = -p*o + 12. What is the least common multiple of o


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The largest company of the U.S. is a US firm based in Beijing, also known as Beijing’s Tiananmen Square, which was built in the 1950s.

China has also been the world’s leading supplier of Chinese


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The largest in the country, with around $7 billion at the time, was one of the largest in its history.

The biggest is the first of the two major cities on the island, which include Soho, the city of Gis


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The largest group of people in the world is the United States of America, and it's the largest group of American people in the world.

A few years ago, I had the opportunity to write a book about this story and I was struck


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


One of the most important questions to ask when deciding a particular product will be to determine whether a particular product is a particular product, or whether it is a particular product.

Once a product is a particular product, it should be determined how the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


One of the most popular
-

Troy, I have been thinking of a new product called "Baked Soy". It is called "Soy" because it is the most popular, and most popular, brand of, and one


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


One of the most exciting and the most innovative of the new generation of online retailers, the CME has been a tremendous success.

The CME has been a huge success, and I have been proud to be a part of its new and


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Your time is limited to just one hour before the event."

"Yeah, it's just one hour before the event, right?"

"I'll have it covered."

"Yeah, I guess so. You were right.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Your time is limited to $7.10. If the time is limited to $7.10, the person with the most recent payment will be in no position to take any kind of payment. The person with the most recent payment will be in


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Your time is limited.

The reason I am doing the post is because I am not sure why this is happening. I am just explaining my position, but I am confused.

I have a comment and something to do is this,


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Keep your friends close, but you are welcome to share yours with others.

We want to make sure that you are on the right track when you’re coming, and that your friends are staying safe as far as possible. You can also


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Keep your friends close, but they will always be there for you."

You do not have to lie. Remember to share with others.

When you are not alone, you will be with them.

I am an adult.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Keep your friends close, but that's what you make of him."

"How do you know that?"

"I never knew you'd be here," she said. "I never expected you to tell me what you do, and


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Once upon a time, my daughter used to be a very active participant in social distancing.

I had a friend who was in a group together who knew his wife, and he was on the run for a bit for it. She was


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Once upon a time, the same is true of the former, at the time of the loss of the former and the present, of the latter.

The problem is with the loss of the former. For example, the loss of the former
Once upon a time, I just found that the
washers were not so simple as to be able to handle the entire
work, and it was not the time for me to simply do the same. I
had a few different questions about


In [140]:
from transformers import pipeline, set_seed

# Sample three continuations per prompt from the codebook-wrapped model.
generator = pipeline('text-generation', model=model.model, tokenizer=tokenizer)
for prefix in prefixes:
    for i in range(3):
        completions = generator(prefix, max_length=50, do_sample=True, temperature=0.7)
        print(completions[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Hello I am a child to be a person of the family. " 
 = = = Reception = = = 
 " The episode of the episode " is a guest @-@ cast @-@ action thriller, which includes guest @-@


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Hello I amai's wife ". The first female of the American writer of the American columnist, and was the authorial, having been inspired by his earlier and @-@ fiction that his " character was " more popular in the United States


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Hello I amassi, who are involved in his father's family. He also made a post @-@ year in his law of the law of Parliament of the United States. 
 = = = Aftermath = = = 
 In the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is the only " an example of the <unk> " of the same one of the main theme ". 
 The " the " most remarkable of the original work in the poem of an ancient church and one of the most important examples of the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is the " God gods @-@ the " [ <unk> ] ]. " 
 = = = = Formation = = = = 
 = = = = = = = 
 = = = = = = = = = 
 " I


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is the three @-@-@ four @-@ game game game, and their best game games. 
 = = = Gameplay = = = 
 = = = Gameplay = = = 
 Gameplay is a first game for the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The largest of the United States. The organization of the university is a private student in the 1980s, and the school has also the most successful school in the area. The first school on both the National University of State University of State University was the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The largest of the United States of the World of the United States. The state center of the United States, U.S. and other regions, the area of U.S. lies the state of the United States, <unk>, and


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The largest modern world. The name of the Rata is probably more than a form of an unconnected world with the city's " s ". 
 = = = = Background = = = = 
 The first name in the " Kingdom of


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


One of the most of the worst @-@ ranking year @-@ ranking and for his final campaign in the same time. 
 At the beginning of the events of the season of the summer, the race is an " a " one @-


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


One of the most American. 
 = = = Reception = = = 
 The song was written in a live video in 1999, and has been produced in the UK, The Independent of the Year, as well as the third song for the second


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


One of the most notable figures of the works of science fiction, and the novel, the two of the novel, was the first novel, written by the <unk>'s use of the characters of his work in the series of the film that


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Your time is limited and replaced with a large number of seats to the individual @-@ half all of the same. 
 = = = World War I = = = 
 The First World War I was the first time and first day. Despite the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Your time is limited to the remaining. However of which the majority of the way to be in a point of the nature of the region are an area of the center of the area, while the population has a number of the most important areas of the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Your time is limited to a large number of the population. 
 = = = = = 
 The population is a very low population @-@ 5 in the north of the Great Britain and the South Australian border in the region. The population is located


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Keep your friends close, but he had had been the same and asked about to be a woman. 
 In fact, the actor was also a fan of the character and the director of the character. In the series of the series, a series of


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Keep your friends close, but, he, he and his father's relationship. 
 = = = = = = 
 = = = = = = = = = = 
 In his tenure, he was described as the first @-@ year


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Keep your friends close, but to which their families were to have the house. 
 = = = Other things that are used for the Lord of Lord of the Earl of England and his father of William H. HMS, they had been at the


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Once upon a time, the player is having a career ; " the game is a good and the guy. " 
 The game was released in the United States in the World War I, who gave his first role as the first @-@ game


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Once upon a time,, the fate of this was so many of the world's efforts. 
 = = = Geography of the world = = = = 
 The Great East, the first of the Northern United Nations and the World War (
Once upon a time,, these were not seen in " the episode ", but the episode was originally aired on The MHS and the Fox Show. The film premiered on the Fox Show at the Fox Show. The film was written by the film


In [2]:
best_cb_model = models.wrap_codebook(model_or_path="EleutherAI/pythia-70m-deduped", pretrained_path="taufeeque/pythia-100mc")

Some weights of the model checkpoint at taufeeque/pythia-100mc were not used when initializing GPTNeoXCodebookModel: ['model.gpt_neox.layers.2.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.1.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.5.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.0.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.3.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.4.attention.codebook_layer.codebook.0.counts']
- This IS expected if you are initializing GPTNeoXCodebookModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoXCodebookModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassifica

In [89]:
from transformers import pipeline, set_seed

# Sample three continuations per prompt from `best_cb_model`.
# NOTE(review): needs `prefixes` from an earlier cell; the recorded run raised
# NameError because that cell had not been executed in this kernel.
best_cb_generator = pipeline('text-generation', model=best_cb_model.model, tokenizer=tokenizer)
for prefix in prefixes:
    for i in range(3):
        completions = best_cb_generator(prefix, max_length=50, do_sample=True, temperature=0.7)
        print(completions[0]['generated_text'])

NameError: name 'prefixes' is not defined

In [56]:
help(tokenizer)

Help on GPTNeoXTokenizerFast in module transformers.models.gpt_neox.tokenization_gpt_neox_fast object:

class GPTNeoXTokenizerFast(transformers.tokenization_utils_fast.PreTrainedTokenizerFast)
 |  GPTNeoXTokenizerFast(vocab_file=None, merges_file=None, tokenizer_file=None, unk_token='<|endoftext|>', bos_token='<|endoftext|>', eos_token='<|endoftext|>', add_prefix_space=False, **kwargs)
 |  
 |  Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
 |  Byte-Pair-Encoding.
 |  
 |  This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
 |  be encoded differently whether it is at the beginning of the sentence (without space) or not:
 |  
 |  ```
 |  >>> from transformers import GPTNeoXTokenizerFast
 |  >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2")
 |  >>> tokenizer("Hello world")['input_ids']
 |  [15496, 995]
 |  >>> tokenizer(" Hello world")['input_ids']
 |  

In [13]:
from codebook_features import models
import numpy as np
import pickle
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from matplotlib import pyplot as plt
from termcolor import colored

# Tokenizer of the Pythia-70m (deduped) checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
# Plain pretrained model, used as the baseline.
orig_model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
# Same base checkpoint passed through `models.wrap_codebook`, loading the
# "taufeeque/pythia-100mc" pretrained weights.
model = models.wrap_codebook(
    model_or_path="EleutherAI/pythia-70m-deduped",
    pretrained_path="taufeeque/pythia-100mc",
)

# Most recent (inputs, output) pair captured per hooked module, keyed by the module.
vis = {}


def hook_fn(m, i, o):
    """Forward hook: store the latest inputs ``i`` and output ``o`` of module ``m`` in ``vis``."""
    vis[m] = i, o

# Most recent (inputs, output) pair captured per hooked attention module.
vis_attn = {}


def hook_fn_attn(m, i, o):
    """Forward hook: store the latest inputs ``i`` and output ``o`` of module ``m`` in ``vis_attn``."""
    vis_attn[m] = i, o

    
# Attach the capture hooks to the codebook layer and the attention module of
# every transformer layer. `named_children()` is the public way to iterate the
# (name, submodule) pairs instead of reading the private `_modules` dict.
# NOTE(review): re-running this cell registers the hooks again; the handles
# returned by `register_forward_hook` are not kept, so they cannot be removed.
for name, layer in model.model.gpt_neox.layers.named_children():
    print(name)
    layer.attention.codebook_layer.register_forward_hook(hook_fn)
    layer.attention.register_forward_hook(hook_fn_attn)


# Run one sample sentence through the wrapped model; the forward hooks
# registered above fill `vis` / `vis_attn` as a side effect of this call.
s = "This tokenizer has been trained to treat spaces like parts of the tokens random tokens added here"
inp = tokenizer(s, return_tensors="pt")
out = model(**inp)


Some weights of the model checkpoint at taufeeque/pythia-100mc were not used when initializing GPTNeoXCodebookModel: ['model.gpt_neox.layers.3.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.5.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.2.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.1.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.4.attention.codebook_layer.codebook.0.counts', 'model.gpt_neox.layers.0.attention.codebook_layer.codebook.0.counts']
- This IS expected if you are initializing GPTNeoXCodebookModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoXCodebookModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassifica

0
1
2
3
4
5


In [14]:
# For every codebook, print its reconstruction MSE and the average norm
# reported by its first sub-codebook.
# (The norms of `cb.codebook[0].input` / `.output` were previously printed here too.)
for layer_idx, layer_codebooks in model.all_codebooks.items():
    for cb in layer_codebooks:
        first_sub_codebook = cb.codebook[0]
        print(cb.reconstruction_mse)
        print('cb_norm', first_sub_codebook.avg_norm())

44.75636291503906
cb_norm 22.606342315673828
150.0145721435547
cb_norm 22.624954223632812
953.9581298828125
cb_norm 22.625259399414062
567.1958618164062
cb_norm 22.624855041503906
977.8589477539062
cb_norm 22.627485275268555
3250.580322265625
cb_norm 22.615549087524414


In [3]:
model.reset_codebook_metrics()

In [22]:
import importlib
importlib.reload(models)


<module 'codebook_features.models' from '/home/taufeeque/codebook-features/codebook_features/models.py'>

In [None]:
24.59918189048767
58.6542329788208
1281.2109985351562
1219.846923828125
2715.999969482422
8037.982604980469


In [3]:
model = best_cb_model

In [24]:
print(len(vis))
print(vis.keys())


6
dict_keys([CompositionalCodebookLayer(
  (codebook): ModuleList(
    (0): CodebookLayer(
      (codebook): Embedding(10000, 512)
    )
  )
), CompositionalCodebookLayer(
  (codebook): ModuleList(
    (0): CodebookLayer(
      (codebook): Embedding(10000, 512)
    )
  )
), CompositionalCodebookLayer(
  (codebook): ModuleList(
    (0): CodebookLayer(
      (codebook): Embedding(10000, 512)
    )
  )
), CompositionalCodebookLayer(
  (codebook): ModuleList(
    (0): CodebookLayer(
      (codebook): Embedding(10000, 512)
    )
  )
), CompositionalCodebookLayer(
  (codebook): ModuleList(
    (0): CodebookLayer(
      (codebook): Embedding(10000, 512)
    )
  )
), CompositionalCodebookLayer(
  (codebook): ModuleList(
    (0): CodebookLayer(
      (codebook): Embedding(10000, 512)
    )
  )
)])


In [72]:
# Per hooked codebook layer: mean L2 norm of its input and output, followed by
# the mean squared L2 distance between the two.
for module, (i, o) in vis.items():
    # Hook inputs arrive as a tuple; element 0 is used as the layer input.
    i = i[0]
    mean_in_norm = i.norm(p=2, dim=-1).mean().item()
    mean_out_norm = o.norm(p=2, dim=-1).mean().item()
    print(mean_in_norm, mean_out_norm)
    squared_error = (i - o).norm(p=2, dim=-1) ** 2
    print(squared_error.mean().item())

7.211728572845459 2.2341723442077637
44.75636291503906
13.70687484741211 2.742445230484009
150.0145721435547
32.99772262573242 3.2736246585845947
953.9581298828125
25.82971954345703 3.1037824153900146
567.1959228515625
33.30326843261719 3.3043413162231445
977.8590698242188
58.66970443725586 3.357837438583374
3250.580322265625


In [11]:
# Reset the codebook metrics and the captured attention activations, then redo
# the forward pass so the hooks record fresh values.
# model.disable_codebooks()
model.reset_codebook_metrics()
vis_attn = {}
out = model(**inp)

# Mean L2 norm of element 0 of each hooked attention module's output.
for module, (i, o) in vis_attn.items():
    i = i[0]
    o = o[0]
    mean_out_norm = o.norm(p=2, dim=-1).mean().item()
    print(mean_out_norm)

7.211728572845459
11.842400550842285
29.040924072265625
22.245229721069336
13.874805450439453
21.93659019470215


In [10]:
model.all_codebooks
# wrapper = list(vis_attn.keys())[0]
# print(wrapper.snap)
for layer in vis_attn:
    layer.snap = False

In [44]:
def get_norm(t):
    """Return the mean L2 norm of ``t`` taken along its last dimension, as a Python float."""
    norms_along_last_dim = t.norm(p=2, dim=-1)
    return norms_along_last_dim.mean().item()

In [48]:
# First sub-codebook of the first entry stored under key 5 of `model.all_codebooks`.
last_cb = model.all_codebooks[5][0].codebook[0]

# NOTE(review): `i` is not defined in this cell — it is whatever an earlier
# cell left behind (several loops above bind `i`). Confirm which tensor is
# intended and pass it explicitly before relying on this output.
last_o = last_cb(i)
print(get_norm(last_o))

3.357837438583374


In [53]:
import torch
# Manual codebook lookup: score every codebook vector by negative Euclidean
# distance to the input, keep the `k` best per position and gather them.
# NOTE(review): `i` comes from an earlier cell's leftover state — confirm it
# holds the intended input tensor.
codebook_weight = last_cb.codebook.weight
logits = -torch.cdist(i, codebook_weight, p=2)
# Use `.indices` rather than unpacking into `_`: assigning `_` at notebook top
# level shadows IPython's last-output variable.
codebook_ids = logits.topk(models.BaseSnapFunction.k, dim=-1).indices
outputs = torch.nn.functional.embedding(codebook_ids, codebook_weight)

In [56]:
# Mean norm of the individual selected codebook vectors ...
print(get_norm(outputs))
# ... versus the norm after averaging the `k` selected vectors per position.
top_k = models.BaseSnapFunction.k
outputs_avg = outputs.sum(dim=-2) / top_k
print(get_norm(outputs_avg))

22.306095123291016
3.357837438583374


In [57]:
outputs.shape

torch.Size([1, 18, 100, 512])

In [58]:
outputs_avg.shape

torch.Size([1, 18, 512])

In [71]:
# Most recent (inputs, output) pair captured per hooked module of the baseline model.
orig_vis = {}


def orig_hook_fn(m, i, o):
    """Forward hook: store the latest inputs ``i`` and output ``o`` of module ``m`` in ``orig_vis``."""
    orig_vis[m] = i, o

# Hook the attention module of every baseline layer. `named_children()` is the
# public way to iterate the (name, submodule) pairs instead of reading the
# private `_modules` dict.
# NOTE(review): re-running this cell registers the hook again on each layer.
for name, layer in orig_model.gpt_neox.layers.named_children():
    print(name)
    layer.attention.register_forward_hook(orig_hook_fn)

# Forward pass on the same sample input so the hooks fill `orig_vis`.
orig_out = orig_model(**inp)

0
1
2
3
4
5


In [70]:
# Mean L2 norm of element 0 of each hooked baseline attention module's output.
for module, (i, o) in orig_vis.items():
    i = i[0]
    o = o[0]
    mean_out_norm = o.norm(p=2, dim=-1).mean().item()
    print(mean_out_norm)

3.2817928791046143
3.281010866165161
4.416781425476074
3.543278455734253
2.703380584716797
14.310317993164062


In [65]:
orig_vis.keys()

dict_keys([])

In [63]:
orig_out

CausalLMOutputWithPast(loss=None, logits=tensor([[[104.0447,  84.4274, 139.5462,  ...,  84.4247,  84.4243,  84.4238],
         [104.6174,  84.4767, 139.2081,  ...,  84.4752,  84.4737,  84.4747],
         [108.2611,  88.2957, 144.4648,  ...,  88.2942,  88.2946,  88.2932],
         ...,
         [109.2437,  89.6794, 144.8795,  ...,  89.6781,  89.6777,  89.6771],
         [106.6640,  87.3825, 146.6189,  ...,  87.3811,  87.3813,  87.3805],
         [107.4439,  87.2109, 147.9247,  ...,  87.2099,  87.2096,  87.2092]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.5175e-01, -2.1268e-01,  5.9498e-01,  ..., -1.4740e+00,
           -2.6675e+00,  8.3408e-01],
          [-6.1415e-01, -1.0304e+00,  5.5834e-01,  ..., -2.5103e+00,
           -2.3254e+00,  2.7814e+00],
          [-3.0116e-01, -7.6867e-01,  2.0433e-01,  ..., -1.4666e+00,
           -1.4205e+00,  1.8040e+00],
          ...,
          [ 6.0656e-01,  5.9117e-01, -3.4602e-01,  ..., -3.5330e+00,
           -2.3229e

In [31]:
import re
# Matches a whole line of the form "= text =" (newline to newline) with no
# other "=" between the two signs. Written as a raw string so "\s" reaches the
# regex engine instead of being an invalid Python string escape.
pattern = r"\n\s*=[^=]*=\s*\n"
def color_pattern(s, p=pattern):
    """Return ``s`` with every match of regex ``p`` wrapped in bold blue via termcolor."""
    # r"\g<0>" re-inserts the whole match inside the colour escape codes.
    return re.sub(p, colored(r"\g<0>", "blue", attrs=['bold']), s)

# for idx in range(len(tokens)):
#     example = tokens[idx]
#     print(color_pattern(tokenizer.decode(example)))
#     if idx > 100:
#         break

# Report every example whose decoded text contains at least one match of `pattern`.
for idx, example in enumerate(tokens):
    s = tokenizer.decode(example)
    matches = re.findall(pattern, s)
    if matches:
        print(idx, matches)

4 ['\n = Tower Building of the Little Rock Arsenal = \n']
8 ['\n = Cicely Mary Barker = \n']
12 ["\n = Gambia women's national football team = \n"]
13 ['\n = Plain maskray = \n']
14 ['\n = 2011 – 12 Columbus Blue Jackets season = \n']
18 ['\n = Position ; GP = \n', '\n = Goals ; A = \n', '\n = Points ; PIM = \n', '\n = Games Played ; TOI = \n', '\n = Wins ; L = \n', '\n = Overtime Losses ; GA = \n', '\n = Saves ; Sv % = \n']
19 ['\n = Gregorian Tower = \n']
21 ["\n = There's Got to Be a Way = \n"]
22 ['\n = Nebraska Highway 88 = \n', '\n = USS Atlanta ( 1861 ) = \n']
26 ['\n = Jacqueline Fernandez = \n']
29 ['\n = John Cullen = \n']
32 ['\n = SMS Erzherzog Ferdinand Max = \n']
33 ['\n = Ancient Egyptian deities = \n']
44 ['\n = South of Heaven = \n']
47 ['\n = General aviation in the United Kingdom = \n']
55 ['\n = SMS Zrínyi = \n']
57 ['\n = Geopyxis carbonaria = \n']
59 ['\n = Gold dollar = \n']
64 ['\n = Johnson – Corey – Chaykovsky reaction = \n']
67 ['\n = Treaty of Ciudad Juárez 

In [23]:
generator = pipeline('text-generation', model=model.model, tokenizer=tokenizer)
prefix = "When Mary and John went to the store, John gave a drink to"
print(generator(prefix, max_length=50, do_sample=True, temperature=0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


When Mary and John went to the store, John gave a drink to a family member. 
 = = = The householder = = = 
 The house was built by the architect William Horton, in the early 17th century. When the house


In [19]:
orig_generator = pipeline('text-generation', model=orig_model, tokenizer=tokenizer)
print(orig_generator(prefix, max_length=50, do_sample=True, temperature=0.7)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


When Mary and John went to the store, John gave a drink to Mary’s mother of the children and said, “We’re going to get you something nice for me and I’m going to get you something good for me and I


In [24]:
! pip install git+https://github.com/neelnanda-io/TransformerLens



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting git+https://github.com/neelnanda-io/TransformerLens
  Cloning https://github.com/neelnanda-io/TransformerLens to /tmp/pip-req-build-z8brjx98
  Running command git clone -q https://github.com/neelnanda-io/TransformerLens /tmp/pip-req-build-z8brjx98
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting rich<13.0.0,>=12.6.0
  Downloading rich-12.6.0-py3-none-any.whl (237 kB)
[K     |████████████████████████████████| 237 kB 30.1 MB/s eta 0:00:01
[?25hCollecting jaxtyping<0.3.0,>=0.2.11
  Downloading jaxtyping-0.2.14-py3-none-any.whl (20 kB)
Collecting einops<0.7.0,>=0.6.0
  Download

In [31]:
import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

# NOTE(review): the recorded run of this cell failed with
# "ValueError: taufeeque/tiny-gpt2 not found" — the name is not in the list of
# model names reported by the error message. Confirm the intended model.
# This also rebinds the global `model` used by earlier cells.
model = HookedTransformer.from_pretrained(
    "taufeeque/tiny-gpt2",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: taufeeque/tiny-gpt2 not found. Valid official model names (excl aliases): ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'distilgpt2', 'facebook/opt-125m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-13b', 'facebook/opt-30b', 'facebook/opt-66b', 'EleutherAI/gpt-neo-125M', 'EleutherAI/gpt-neo-1.3B', 'EleutherAI/gpt-neo-2.7B', 'EleutherAI/gpt-j-6B', 'EleutherAI/gpt-neox-20b', 'stanford-crfm/alias-gpt2-small-x21', 'stanford-crfm/battlestar-gpt2-small-x49', 'stanford-crfm/caprica-gpt2-small-x81', 'stanford-crfm/darkmatter-gpt2-small-x343', 'stanford-crfm/expanse-gpt2-small-x777', 'stanford-crfm/arwen-gpt2-medium-x21', 'stanford-crfm/beren-gpt2-medium-x49', 'stanford-crfm/celebrimbor-gpt2-medium-x81', 'stanford-crfm/durin-gpt2-medium-x343', 'stanford-crfm/eowyn-gpt2-medium-x777', 'EleutherAI/pythia-70m', 'EleutherAI/pythia-160m', 'EleutherAI/pythia-410m', 'EleutherAI/pythia-1b', 'EleutherAI/pythia-1.4b', 'EleutherAI/pythia-2.8b', 'EleutherAI/pythia-6.9b', 'EleutherAI/pythia-12b', 'EleutherAI/pythia-70m-deduped', 'EleutherAI/pythia-160m-deduped', 'EleutherAI/pythia-410m-deduped', 'EleutherAI/pythia-1b-deduped', 'EleutherAI/pythia-1.4b-deduped', 'EleutherAI/pythia-2.8b-deduped', 'EleutherAI/pythia-6.9b-deduped', 'EleutherAI/pythia-12b-deduped', 'NeelNanda/SoLU_1L_v9_old', 'NeelNanda/SoLU_2L_v10_old', 'NeelNanda/SoLU_4L_v11_old', 'NeelNanda/SoLU_6L_v13_old', 'NeelNanda/SoLU_8L_v21_old', 'NeelNanda/SoLU_10L_v22_old', 'NeelNanda/SoLU_12L_v23_old', 'NeelNanda/SoLU_1L512W_C4_Code', 'NeelNanda/SoLU_2L512W_C4_Code', 'NeelNanda/SoLU_3L512W_C4_Code', 'NeelNanda/SoLU_4L512W_C4_Code', 'NeelNanda/SoLU_6L768W_C4_Code', 'NeelNanda/SoLU_8L1024W_C4_Code', 'NeelNanda/SoLU_10L1280W_C4_Code', 'NeelNanda/SoLU_12L1536W_C4_Code', 'NeelNanda/GELU_1L512W_C4_Code', 'NeelNanda/GELU_2L512W_C4_Code', 'NeelNanda/GELU_3L512W_C4_Code', 'NeelNanda/GELU_4L512W_C4_Code', 'NeelNanda/Attn_Only_1L512W_C4_Code', 'NeelNanda/Attn_Only_2L512W_C4_Code', 
'NeelNanda/Attn_Only_3L512W_C4_Code', 'NeelNanda/Attn_Only_4L512W_C4_Code', 'NeelNanda/Attn-Only-2L512W-Shortformer-6B-big-lr', 'NeelNanda/SoLU_1L512W_Wiki_Finetune', 'NeelNanda/SoLU_4L512W_Wiki_Finetune']