# A) load data

After this step, you should end up with an array with all the sentences in your dataset

In [1]:
from datasets import load_dataset

# Load the "rc.nocontext" configuration of TriviaQA through Hugging Face `datasets`.
# The returned object is indexed by split name in the next cell.
dataset = load_dataset("trivia_qa", 'rc.nocontext')

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset trivia_qa (/n/home10/cyeh/.cache/huggingface/datasets/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)
100%|██████████| 3/3 [00:00<00:00, 119.11it/s]


In [2]:
# Use the validation split as the pool that Q/A pairs are sampled from below.
test_data = dataset['validation']
print(len(test_data))

17944


## sample from data

In [3]:
# prepare for sampling: draw a fixed, reproducible subset of example indices
import random
num_examples = 250 # how many Q/A pairs to keep
random.seed(10) # set seed
# random.sample draws without replacement, so every index appears at most once
indices = random.sample(range(len(test_data)), num_examples)
indices[:10] # preview the first ten sampled indices

In [12]:
# pick 'num_examples' random Q/A pairs and turn each into one "question? answer" sentence
sentences = []     # lower-cased "question? answer" strings
sent_lengths = []  # whitespace-separated word count of each sentence
for i in indices:
    point = test_data[i]
    question = point['question'].lower()
    # endswith() is safe on an empty question, whereas question[-1] raises IndexError
    if not question.endswith('?'): # add question mark if needed
        question += '?'
    answer = point['answer']['value'].lower()
    sent = question + " " + answer # create sentence from q/a pair
    sentences.append(sent) # add to list
    sent_lengths.append(len(sent.split()))

sentences[:10]

['in 1968, who did radical feminist valerie solanas shoot and wound as he entered his new york studio? andy warhol',
 'what lake can be found on the border of vermont and new york? lake champlain',
 'which competition was won by nadiya hussain in 2015? the great british bake-off',
 'which `b` was the name of the mechanical shark used in the original `jaws` film? bruce',
 'who is the current (jan 2014) secretary of state for education? michael gove',
 'which cocktail consists of rum, curacao and lime juice? mai tai',
 "which 2001 film starring joseph fiennes and ed harris is about two opposing snipers facing each other during the battle of stalingrad? 'enemy at the gates'",
 "who had a 1992 hit with you're the one for me fatty? morrissey",
 'cassiterite is a principal ore of which metal? tin',
 'what was the name of the band, featuring members of thin lizzy and the sex pistols, which recorded a 1979 song called ‘a merry jingle’? the greedies (originally the greedy bastards)']

In [13]:
# summarise sentence lengths (in whitespace-separated words): longest, shortest, average
max_len, min_len = max(sent_lengths), min(sent_lengths)
mean_len = sum(sent_lengths)/len(sent_lengths)

# one value per line, in the order max / min / mean
print(max_len, min_len, mean_len, sep="\n")

52
6
15.256


In [2]:
# save sentences if desired (to load back, use: np.load("qa_sentences.npy"))
# NOTE: `np` imported here is also relied on by later cells, so this cell must run first.
import numpy as np
np.save("qa_sentences.npy", sentences)

# b) get necessary info

Here we use helper methods from Jesse Vig's bertviz repo: https://github.com/jessevig/bertviz
* the cell below may take a while to run... this is normal

In [4]:
# if needed, load back sentences
# NOTE: this rebinds `sentences` from the Python list built above to the NumPy array
# stored on disk; it needs `np` from the save cell and an existing qa_sentences.npy.
sentences = np.load("qa_sentences.npy")

In [5]:
# import methods from bertviz
from bertviz import neuron_view
from bertviz.transformers_neuron_view import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import string

# Model selection: exactly one of the two sections below should be active.
# To switch to BERT, uncomment the BERT lines and comment out the GPT lines.

# BERT
# model_type = 'bert'
# model_version = 'bert-base-uncased'
# model = BertModel.from_pretrained(model_version, output_attentions=True)
# tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)

# GPT
model_type = 'gpt2'
model_version = 'gpt2'
model = GPT2Model.from_pretrained(model_version, output_attentions=True)
# NOTE(review): do_lower_case is carried over from the BERT tokenizer call above;
# whether the GPT-2 tokenizer honours it is not visible here — confirm.
tokenizer = GPT2Tokenizer.from_pretrained(model_version, do_lower_case=True)

# may need to change depending on your model
# These sizes drive every per-layer / per-head loop further down.
num_heads = 12
num_layers = 12

# master dictionary for all values
# The first six entries hold one item per token, concatenated over all sentences.
attn_dict = {'left_text': [],
             'right_text': [],
             'positions': [],
             'normalized_positions': [],
             'sentences': [],
             'tokenized_sentences': []}

# Per-head accumulators: attn_dict[name][layer][head] is an independent, initially
# empty list. The nested comprehension gives every (layer, head) slot its own list,
# so no placeholder element is needed. (The old scheme cleared any list of length 1,
# which also wiped real data when the first sentence had exactly one token.)
for per_head_key in ('q_norms', 'k_norms', 'queries', 'keys', 'attn', 'dot_prod'):
    attn_dict[per_head_key] = [[[] for _ in range(num_heads)] for _ in range(num_layers)]

for s in sentences:
    # call method from bertviz to get attention info
    s_dict = neuron_view.get_attention(model, model_type, tokenizer, s, include_queries_and_keys=True)['all']

    # append to master dictionary
    tokens = s_dict['left_text']
    attn_dict['left_text'].extend(tokens)
    attn_dict['right_text'].extend(s_dict['right_text'])

    # save position of token and tokenized sentences too
    tokenized = ' '.join(tokens)
    # a one-token sentence would otherwise divide by zero; its only position maps to 0.0
    last_index = max(len(tokens) - 1, 1)
    for index in range(len(tokens)):
        attn_dict['positions'].append(index)
        attn_dict['normalized_positions'].append(index / last_index)
        attn_dict['sentences'].append(s)
        attn_dict['tokenized_sentences'].append(tokenized)

    for i in range(num_layers): # updating cumulative q/k vectors + attn + dp
        for j in range(num_heads):
            query = s_dict['queries'][i][j]
            key = s_dict['keys'][i][j]
            np_query = np.array(query)
            np_key = np.array(key)

            attn_dict['queries'][i][j].extend(query)
            attn_dict['keys'][i][j].extend(key)
            attn_dict['attn'][i][j].extend(s_dict['attn'][i][j])

            # query-key dot products for this sentence; extend() adds one row per query token
            attn_dict['dot_prod'][i][j].extend(np.dot(np_query, np_key.transpose()))

            # norms too: one L2 norm per query vector and per key vector
            attn_dict['q_norms'][i][j].extend([np.linalg.norm(vec) for vec in np_query])
            attn_dict['k_norms'][i][j].extend([np.linalg.norm(vec) for vec in np_key])
            

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# optional if you want to save this dictionary
# np.save appends ".npy" to the name, so the file on disk is "attn_dict.pkl.npy".
# To load back, use: np.load("attn_dict.pkl.npy", allow_pickle=True).item()
# (allow_pickle=True unpickles arbitrary objects — only load files you created yourself)
np.save("attn_dict.pkl", attn_dict)

In [8]:
# When True, token_cutoff below is set to a fixed count instead of the full token list.
# NOTE(review): a later cell in the optional cropping section sets crop_data = True
# unconditionally, so a plain "Run All" ends up cropping regardless of this value.
crop_data = False # keep false unless you want to crop data further (e.g., see below)

### optional (crop the data further)

In [20]:
# bert
# Inspect tokens around index 5000 to find a slice end that lands on a sentence boundary.
print(attn_dict["left_text"][5000:5019])

['?', 'turkey', '[SEP]', '[CLS]', 'which', 'famous', 'jewel', '##lers', 'makes', 'the', 'super', 'bowl', 'trophy', '?', 'tiffany', '&', 'co', '.', '[SEP]']


In [25]:
# bert
# The sentence that the tokens printed above should belong to.
sentences[226]

'which famous jewellers makes the super bowl trophy? tiffany & co.'

In [26]:
# gpt
# Same boundary check for GPT tokenization: the slice is meant to end exactly at the
# last token of the sentence displayed on the following line.
print(attn_dict["left_text"][5002:5020])
sentences[223]

['n', 'igel', ' haw', 'th', 'orne', ' was', ' o', 'scar', ' nominated', ' for', ' the', ' madness', ' of', ' which', ' king', '?', ' ge', 'orge']


'nigel hawthorne was oscar nominated for the madness of which king? george'

In [6]:
# Opt in to cropping. Skip this cell to keep every token (crop_data stays False).
crop_data = True

# c) generate tokens.json file (1 file w/ info shared across all attn heads)

Format of this file:

```
{
  "tokens": [
    {
      "value": "[cls]",
      "type": "query",
      "pos_int": 0,
      "length": 13,
      "position": 0.0,
      "sentence": "[CLS] synth ##pop band freeze ##pop have used it on stage . [SEP]"
    },
    ...,
    {
      "value": "synth",
      "type": "query",
      "pos_int": 1,
      "length": 13,
      "position": 0.083333,
      "sentence": "[CLS] synth ##pop band freeze ##pop have used it on stage . [SEP]"
    }
  ]
}
```

* **value:** the token
* **type:** query or key
* **pos_int:** position of token in sentence (zero index)
* **length:** length of sentence
* **position:** normalized position in sentence (i.e., pos_int / (length - 1))
* **sentence:** full sentence the current token came from

In [9]:
# make folders for the generated data if they don't already exist
import os

# outer data folder
outer_data = "data/"

# folder for this model (every GPT variant shares "gpt"; other models use model_type as-is)
model_name = "gpt" if "gpt" in model_type else model_type
data_folder = outer_data + model_name + "/"

# makedirs creates outer_data and the model folder in one call, and exist_ok=True makes
# re-running the cell a no-op, replacing the exists()-then-mkdir() pair for each level
os.makedirs(data_folder, exist_ok=True)

data_folder

'data/gpt/'

In [10]:
# get info out of attn_dict
# These four lists are parallel: entry i of each describes the same token.
tokens = attn_dict['left_text']  # token strings
positions = attn_dict['positions']  # zero-based index of the token within its sentence
norm_pos = attn_dict['normalized_positions']  # position scaled by the sentence's last index
tok_sentences = attn_dict['tokenized_sentences']  # space-joined tokens of the token's sentence

In [11]:
# Number of tokens exported by all later cells.
# NOTE(review): 5020 matches the end of the GPT slice inspected in the optional cropping
# section; the BERT slice there ends at 5019, so this constant presumably needs
# adjusting when switching models — confirm before cropping BERT data.
token_cutoff = 5020 if crop_data else len(tokens)
token_cutoff

5020

## actually formatting into json

In [13]:
import json

In [14]:
# outer json object: a single "tokens" list, first every query entry and then every key entry
shared_json = {"tokens": []}

# repeat for queries and keys
for tok_type in ["query", "key"]:
    for i in range(token_cutoff):
        # reformat sentence (gpt output has extra spaces for some reason):
        # splitting on whitespace and re-joining collapses the runs of spaces
        sent = tok_sentences[i]
        split_sent = sent.split()
        sent_len = len(split_sent)
        sent_format = ' '.join(split_sent)

        # one record per token; key order here is the order written to the file
        new_token = {
            "value": tokens[i].strip(),
            "type": tok_type,
            "pos_int": int(positions[i]),
            "length": sent_len,
            "position": round(float(norm_pos[i]), 3),
            "sentence": sent_format,
        }
        shared_json["tokens"].append(new_token)

# save to json file
json_str = json.dumps(shared_json)
with open(data_folder + "tokens.json", "w") as f:
    f.write(json_str)

# d) attention files (one for each attention head)

Format of these files:
```
{
    "layer": 0, 
     "head": 0, 
     "tokens": [
        {
            "attention": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        }, 
        ...,
        {
            "attention": [0.772724, 0.227276, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        }
     ]
}
```   
* store **attention weights** for each token

In [39]:
# make attention data subfolder
attn_folder = data_folder + "attention/"

# exist_ok=True makes this safe to re-run and avoids the exists()-then-mkdir() race
os.makedirs(attn_folder, exist_ok=True)

In [38]:
# get attention info
# attention[layer][head] is the list of per-token attention rows accumulated in step b)
attention = attn_dict['attn']

## only for gpt (value norm preprocessing -- for sentence attention visualizations)

In [31]:
# initialize dict: one independent empty list for every pair of indices in 0..11
value_norms = {(first, second): [] for first in range(12) for second in range(12)}

In [35]:
# get value norms: the L2 norm of every token's value vector, per layer and head

# GPT
from transformers import GPT2Model, GPT2Tokenizer
from numpy.linalg import norm

# NOTE: this rebinds `model` / `tokenizer` from the bertviz classes used in step b) to
# the Hugging Face `transformers` ones; re-run the step b) setup before repeating step b).
model_type = 'gpt2'
model_version = 'gpt2'
model = GPT2Model.from_pretrained(model_version, return_dict=True, output_attentions=True, output_hidden_states=True, use_cache=True)
tokenizer = GPT2Tokenizer.from_pretrained(model_version, do_lower_case=True)

# start from a clean slate so re-running this cell does not append duplicate norms
value_norms = {}

for sent in sentences:
    inputs = tokenizer(sent, return_tensors="pt")
    outputs = model(**inputs)

    # past_key_values is indexed [layer][0 = key, 1 = value][batch][head][token]
    key_values = outputs.past_key_values
    num_tokens = len(key_values[0][0][0][0])

    # The outer index is the layer and the inner one the head. The earlier version had
    # these two names swapped, which only worked because both counts are 12.
    for layer in range(num_layers):
        # convert this layer's value tensor (batch item 0) once, not once per token
        layer_values = key_values[layer][1][0].detach().numpy()
        for head in range(num_heads):
            head_norms = [norm(layer_values[head][i]) for i in range(num_tokens)]
            # keyed as (layer, head), matching how the attention-file cell reads it
            value_norms.setdefault((layer, head), []).extend(head_norms)

## now make attention files

In [42]:
# helper function to get attention for keys
def k_matrix(q_matrix, cutoff=None):
    """Transpose per-sentence attention so that each row belongs to a key token.

    `q_matrix` holds the attention rows of one layer/head (e.g. ``attention[0][0]``)
    for all sentences, concatenated: row ``i`` is the attention that query token ``i``
    pays to every token of its own sentence, so ``len(row)`` is the sentence length.
    For each sentence the square block of rows is transposed, giving for every key
    token the attention it receives from each query token of that sentence.

    Args:
        q_matrix: list of attention rows, grouped sentence by sentence.
        cutoff: stop starting new sentences once this many tokens are processed.
            Defaults to the notebook-level ``token_cutoff``. A sentence that starts
            before the cutoff is always processed in full. Values larger than
            ``len(q_matrix)`` are clamped.

    Returns:
        A list of rows in the same token order as `q_matrix`, transposed per sentence.

    Raises:
        ValueError: if a sentence's first row is empty (it would never advance).
    """
    limit = token_cutoff if cutoff is None else cutoff
    limit = min(limit, len(q_matrix))

    transposed = []
    start = 0  # index of the first token of the current sentence
    while start < limit:
        sent_length = len(q_matrix[start])
        if sent_length == 0:
            raise ValueError("empty attention row at index {}".format(start))
        for k_i in range(sent_length):
            # column k_i of this sentence's block becomes the row for key token k_i
            transposed.append([q_matrix[start + q_i][k_i] for q_i in range(sent_length)])
        start += sent_length

    return transposed

In [43]:
# create json file for each layer/head combo
# Each file lists one entry per query token followed by one entry per key token.

# Value norms are only computed for GPT models, so include them only for GPT.
# This replaces commenting the value-norm lines in and out by hand when switching to BERT.
include_value_norms = "gpt" in model_type

for layer in range(num_layers):
    for head in range(num_heads):
        new_json = {}
        new_json["layer"] = layer
        new_json["head"] = head
        new_json["tokens"] = []

        head_val_norms = value_norms[(layer, head)] if include_value_norms else None

        for tok_type in ["query", "key"]:
            head_attn = attention[layer][head]
            if tok_type == "key":
                # transpose attn vals if key
                head_attn = k_matrix(head_attn)

            for i in range(token_cutoff):
                # iterate through all tokens
                new_token = {}
                # format attention vals first
                new_token["attention"] = [round(float(a), 3) for a in head_attn[i]]
                if include_value_norms:
                    # the same per-token value norm is attached to the query and key entry
                    new_token["value_norm"] = round(float(head_val_norms[i]), 3)

                new_json["tokens"].append(new_token)

        json_str = json.dumps(new_json)
        with open(attn_folder + "layer{}_head{}.json".format(layer, head), "w") as f:
            f.write(json_str)

# e) TSNE/UMAP/PCA helper functions

Will be used in next step

In [35]:
# imports
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.decomposition import PCA
import time

In [36]:
# run PCA
def run_pca(data, dim):
    """Project `data` with PCA onto 3 components when `dim` is 3, otherwise onto 2.

    Returns whatever `PCA.fit_transform` returns for `data`.
    """
    n_components = 3 if dim == 3 else 2
    return PCA(n_components=n_components).fit_transform(data)

In [37]:
# run TSNE
def run_tsne(data, dim):
    """Embed `data` with t-SNE in 3 dimensions when `dim` is 3, otherwise in 2.

    The settings are fixed: cosine metric, perplexity 100, 300 iterations, all cores.
    Returns whatever `TSNE.fit_transform` returns for `data`.
    """
    n_components = 3 if dim == 3 else 2
    reducer = TSNE(n_components=n_components, verbose=0, perplexity=100, n_iter=300, metric="cosine", n_jobs=-1)
    return reducer.fit_transform(data)

In [38]:
# run UMAP
def run_umap(data, dim):
    """Embed `data` with UMAP in 3 dimensions when `dim` is 3, otherwise in 2.

    Uses the cosine metric with random initialisation and a fixed random_state of 0.
    Returns whatever `UMAP.fit_transform` returns for `data`.
    """
    # the unused `time_start = time.time()` was dropped; callers time the full pipeline
    n_components = 3 if dim == 3 else 2
    reducer = UMAP(n_components=n_components, init='random', random_state=0, metric='cosine')
    return reducer.fit_transform(data)

# f) byLayerHead files (one for each attention head)

Format of these files:
```
{
    "layer": 0, 
     "head": 0, 
     "tokens": [
         {
             "tsne_x": 6.876875, 
             "tsne_y": 7.453007, 
             "umap_x": -11.346616, 
             "umap_y": 12.795157, 
             "norm": 5.628239, 
             "tsne_x_3d": -8.263657, 
             "tsne_y_3d": 0.388844,
             "tsne_z_3d": -1.241852, 
             "umap_x_3d": 17.736557, 
             "umap_y_3d": -3.88329, 
             "umap_z_3d": 2.701952, 
             "pca_x": -2.444411, 
             "pca_y": -0.763636, 
             "pca_x_3d": -2.444411, 
             "pca_y_3d": -0.763568, 
             "pca_z_3d": 2.882317
         }, 
         ...,
         {
             ...
         }
     ]
}
```   
* includes **2D/3D** **TSNE/UMAP/PCA** coordinates for each token for current layer and head + **norm**

In [39]:
# get info from attn_dict
# Each of these is indexed [layer][head] and holds one entry per token.
# These are references into attn_dict, not copies.
queries = attn_dict["queries"]
keys = attn_dict["keys"]
q_norms = attn_dict["q_norms"]
k_norms = attn_dict["k_norms"]

In [40]:
# make new folder for byLayerHead files
coord_folder = data_folder + "byLayerHead/"

# exist_ok=True makes this safe to re-run and avoids the exists()-then-mkdir() race
os.makedirs(coord_folder, exist_ok=True)

## preprocessing

Here, I translate the keys so that they have the same centroid as the queries (for better visualization)

In [41]:
# get means: per-dimension centroid of the query vectors and of the key vectors for
# every layer/head, appended layer by layer (index = 12 * layer + head for a 12-head model)
mean_queries = []
mean_keys = []
for layer in range(num_layers):
    for head in range(num_heads):
        q = queries[layer][head][:token_cutoff]
        k = keys[layer][head][:token_cutoff]
        num_tokens = len(q)

        # running per-dimension sums over the 64 vector components
        q_sums = [0] * 64
        k_sums = [0] * 64
        for i in range(num_tokens):
            q_token = q[i]
            k_token = k[i]
            for j in range(64):
                q_sums[j] += q_token[j]
                k_sums[j] += k_token[j]

        mean_queries.append([total / num_tokens for total in q_sums])
        mean_keys.append([total / num_tokens for total in k_sums])

In [42]:
# fix key means: translate every key vector so the keys share the queries' centroid
#
# The key vectors are modified IN PLACE. The slice below copies only the outer list,
# so writes to k_token[j] land in attn_dict["keys"] (assuming each key vector is a
# mutable list — confirm if the storage format changes).
#
# To keep this safe to re-run, the shift uses the keys' CURRENT centroid, measured
# here, rather than the stored mean_keys. On a second run that centroid already
# matches the queries' and the shift is ~0, so the keys are not moved twice.
new_mean_keys = []
for layer in range(num_layers):
    for head in range(num_heads):
        k = keys[layer][head][:token_cutoff]
        q_k = mean_queries[num_heads * layer + head]  # target centroid (queries)
        num_tokens = len(k)

        # current centroid of the keys, summed in the same order as the "get means" cell
        m_k = [0] * 64
        for k_token in k:
            for j in range(64):
                m_k[j] += k_token[j]
        m_k = [total / num_tokens for total in m_k]

        # shift each key and accumulate the new centroid for the sanity check
        mean_k = [0] * 64
        for k_token in k:
            for j in range(64):
                k_token[j] = k_token[j] - m_k[j] + q_k[j]
                mean_k[j] += k_token[j]

        new_mean_keys.append([total / num_tokens for total in mean_k])

### optional: sanity check

In [43]:
# Centroids for layer 0 / head 0 as computed by the "get means" cell:
# queries first, then the keys' original centroid.
print(mean_queries[0])
print(mean_keys[0])

[0.6328304488842647, -0.3406692687978604, 0.19263284422352397, -0.29356196707122, -0.3865076022601104, 0.018325021613456043, -0.5140582495409832, -0.22270956292017494, -0.1028417951616633, -0.10767029772950448, -0.19459884084512544, -0.16332341774470688, -0.31990569537631663, 0.14333667343786632, 0.02661175984106364, -0.20699877709083525, 0.01230896841901618, 0.1584361630165393, -0.42631362929079397, -0.37456994655197, 0.34503850432194444, 0.04248234130167569, -0.009814457149352447, 0.16992524190688973, -0.042799583158596784, -0.5034576060632577, -0.4461871687051992, -0.24721287545224882, 0.21120566658697007, -0.2244664302707995, 0.09846800615699346, -0.014856036968503251, 0.31801266142044704, -0.27427087990187693, -0.6178917641551104, -0.08074602001008997, 0.3457698167524341, -0.19586435282784717, 0.330209734710861, -0.09761096816472857, 0.28747954536340686, -0.4640231773598617, -0.22340672463939276, -0.29028919925505065, 0.18114175643953698, -0.19915520402248504, 0.26762874493917144,

In [44]:
# After the shift, the keys' centroid should match mean_queries[0] up to float rounding.
print(new_mean_keys[0])

[0.6328304488842397, -0.34066926879785087, 0.1926328442235162, -0.29356196707124405, -0.3865076022601452, 0.018325021613455575, -0.5140582495410151, -0.22270956292017352, -0.10284179516167383, -0.1076702977295075, -0.1945988408451306, -0.1633234177446997, -0.31990569537629565, 0.1433366734378739, 0.026611759841065195, -0.20699877709084324, 0.012308968419017255, 0.1584361630165289, -0.42631362929077987, -0.3745699465520047, 0.3450385043219235, 0.04248234130167755, -0.009814457149351901, 0.16992524190689354, -0.0427995831585904, -0.5034576060632645, -0.44618716870521064, -0.24721287545224752, 0.21120566658696158, -0.22446643027079363, 0.09846800615698863, -0.01485603696850189, 0.31801266142046836, -0.27427087990189397, -0.617891764155083, -0.0807460200100868, 0.3457698167523939, -0.19586435282783596, 0.33020973471088444, -0.09761096816473555, 0.2874795453633768, -0.4640231773598779, -0.22340672463938815, -0.2902891992550295, 0.18114175643955463, -0.19915520402248896, 0.2676287449391595, 

## get the json files

the code block below will likely take some time to run too (I usually start multiple cluster sessions to finish faster)

In [46]:
# create json file for each layer/head combo
# Each file holds one record per query token followed by one record per key token, with
# 2D/3D t-SNE, UMAP and PCA coordinates plus the vector norm, all rounded to 3 decimals.
for layer in range(num_layers):
    for head in range(num_heads):
        time_start = time.time()

        # stack this head's query vectors on top of its key vectors
        head_queries = queries[layer][head][:token_cutoff]
        head_keys = keys[layer][head][:token_cutoff]
        head_data = np.array(head_queries + head_keys)

        head_norms = q_norms[layer][head][:token_cutoff] + k_norms[layer][head][:token_cutoff]
        total_tokens = token_cutoff * 2

        # run TSNE/UMAP/PCA
        tsne = run_tsne(head_data, 2)
        tsne_3d = run_tsne(head_data, 3)
        umap = run_umap(head_data, 2)
        umap_3d = run_umap(head_data, 3)
        pca = run_pca(head_data, 2)
        pca_3d = run_pca(head_data, 3)

        # one record per token; key order here is the order written to the file
        token_records = []
        for i in range(total_tokens):
            new_token = {
                "tsne_x": round(float(tsne[i][0]), 3),
                "tsne_y": round(float(tsne[i][1]), 3),
                "umap_x": round(float(umap[i][0]), 3),
                "umap_y": round(float(umap[i][1]), 3),
                "norm": round(float(head_norms[i]), 3),
                "tsne_x_3d": round(float(tsne_3d[i][0]), 3),
                "tsne_y_3d": round(float(tsne_3d[i][1]), 3),
                "tsne_z_3d": round(float(tsne_3d[i][2]), 3),
                "umap_x_3d": round(float(umap_3d[i][0]), 3),
                "umap_y_3d": round(float(umap_3d[i][1]), 3),
                "umap_z_3d": round(float(umap_3d[i][2]), 3),
                "pca_x": round(float(pca[i][0]), 3),
                "pca_y": round(float(pca[i][1]), 3),
                "pca_x_3d": round(float(pca_3d[i][0]), 3),
                "pca_y_3d": round(float(pca_3d[i][1]), 3),
                "pca_z_3d": round(float(pca_3d[i][2]), 3),
            }
            token_records.append(new_token)

        new_json = {"layer": layer, "head": head, "tokens": token_records}

        json_str = json.dumps(new_json)
        with open(coord_folder + "layer{}_head{}.json".format(layer, head), "w") as f:
            f.write(json_str)

        time_elapsed = time.time() - time_start
        print("Layer {} Head {} complete: {}".format(layer, head, time_elapsed))


Layer 0 Head 0 complete: 306.3622417449951
Layer 0 Head 1 complete: 148.55165147781372
Layer 0 Head 2 complete: 123.72091770172119
Layer 0 Head 3 complete: 122.75478434562683
Layer 0 Head 4 complete: 149.74448490142822
Layer 0 Head 5 complete: 112.99003767967224
Layer 0 Head 6 complete: 106.97444701194763
Layer 0 Head 7 complete: 149.20293641090393
Layer 0 Head 8 complete: 142.5443639755249
Layer 0 Head 9 complete: 109.06266856193542
Layer 0 Head 10 complete: 122.44500541687012
Layer 0 Head 11 complete: 123.2271318435669
Layer 1 Head 0 complete: 110.57423138618469
Layer 1 Head 1 complete: 99.39705801010132
Layer 1 Head 2 complete: 102.20657849311829
Layer 1 Head 3 complete: 106.1483371257782
Layer 1 Head 4 complete: 104.62596726417542
Layer 1 Head 5 complete: 111.96645045280457
Layer 1 Head 6 complete: 123.50146627426147
Layer 1 Head 7 complete: 163.91066360473633
Layer 1 Head 8 complete: 115.58942008018494
Layer 1 Head 9 complete: 107.74760293960571
Layer 1 Head 10 complete: 107.50282

# g) You're done!

download all the data files & place accordingly (inside web/**data**/ folder) to use in our attention visualization tool: https://github.com/catherinesyeh/attention-viz

# optional: shrink data

In [20]:
# Output folders of the two supported models; the shrink cells below rewrite files in both.
gpt_folder = "data/gpt/"
bert_folder = "data/bert/"

In [None]:
# token files: re-round the "position" field of every token to 3 decimals, in place
for folder in [gpt_folder, bert_folder]:
    token_file = folder + "tokens.json"
    # only one model's data may have been generated; skip the other instead of crashing
    if not os.path.exists(token_file):
        print("skipping {} (not found)".format(token_file))
        continue

    # `with` closes the read handle before the same path is reopened for writing
    with open(token_file, 'r') as src:
        data = json.load(src)

    # iterate without rebinding the notebook-level `tokens` list used by earlier cells
    for t in data['tokens']:
        t["position"] = round(float(t["position"]), 3)

    with open(token_file, "w") as o:
        json.dump(data, o)

In [25]:
# attention files: re-round every attention weight to 3 decimals, in place
for folder in [gpt_folder, bert_folder]:
    att_folder = folder + "attention/"
    # only one model's data may have been generated; skip the other instead of crashing
    if not os.path.isdir(att_folder):
        print("skipping {} (not found)".format(att_folder))
        continue

    for json_file in sorted(os.listdir(att_folder)):
        # endswith() avoids picking up names that merely contain ".json" (e.g. backups)
        if not json_file.endswith(".json"):
            continue
        f = att_folder + json_file

        # `with` closes the read handle before the same path is reopened for writing
        with open(f, 'r') as src:
            data = json.load(src)

        for t in data['tokens']:
            t["attention"] = [round(float(a), 3) for a in t["attention"]]

        with open(f, "w") as o:
            json.dump(data, o)
            

In [30]:
# byLayerHead files: re-round every numeric field of every token to 3 decimals, in place
for folder in [gpt_folder, bert_folder]:
    lh_folder = folder + "byLayerHead/"
    # only one model's data may have been generated; skip the other instead of crashing
    if not os.path.isdir(lh_folder):
        print("skipping {} (not found)".format(lh_folder))
        continue

    for json_file in sorted(os.listdir(lh_folder)):
        # endswith() avoids picking up names that merely contain ".json" (e.g. backups)
        if not json_file.endswith(".json"):
            continue
        f = lh_folder + json_file

        # `with` closes the read handle before the same path is reopened for writing
        with open(f, 'r') as src:
            data = json.load(src)

        for t in data['tokens']:
            # values are replaced under existing keys only, so the dict size never changes
            for field in t:
                t[field] = round(float(t[field]), 3)

        with open(f, "w") as o:
            json.dump(data, o)