In [5]:
from transformers import AutoTokenizer, AutoConfig
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

In [6]:
model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

text = "It was a bright day in April, and the clocks were striking thirteen."
model = BertModel.from_pretrained(model_checkpoint)
show(model, 'bert', tokenizer, text, display_mode= 'light', layer=0, head=8)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
input = tokenizer(text, return_tensors='pt', add_special_tokens=True)
input.input_ids

tensor([[  101,  2009,  2001,  1037,  4408,  2154,  1999,  2258,  1010,  1998,
          1996, 20940,  2020,  8478,  7093,  1012,   102]])

In [8]:
config = AutoConfig.from_pretrained(model_checkpoint)
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [12]:
import torch
import torch.nn as nn
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb

Embedding(30522, 768)

In [13]:
input_emb = token_emb(input.input_ids)
print(input_emb)
input_emb.size()

tensor([[[-0.9459,  0.6278,  0.0859,  ...,  0.3358,  0.6219, -0.0131],
         [-0.8191, -0.3052, -0.7903,  ..., -1.1841, -2.0757,  0.6198],
         [ 0.7261, -1.6428, -1.0418,  ...,  0.5760, -0.7617, -1.3585],
         ...,
         [ 0.6143, -0.4783,  0.1122,  ..., -1.3098,  0.6991,  0.9014],
         [ 0.1792, -0.1313,  0.1259,  ...,  0.1796,  0.2055,  0.7784],
         [-2.0752, -0.4088, -0.1705,  ..., -1.4095, -1.0688,  0.3631]]],
       grad_fn=<EmbeddingBackward0>)


torch.Size([1, 17, 768])

In [17]:


q = k = v = input_emb 
C = k.size(-1) # (B, T, C) - Get C dimension
scores = torch.bmm(q, k.transpose(-2, -1) * C ** -0.5)
scores.size()


torch.Size([1, 17, 768])

In [19]:
import torch.nn.functional as F

wei = F.softmax(scores, dim=1)
wei.sum(dim=-1)
wei

tensor([[[1.0000e+00, 2.6356e-12, 9.5922e-13, 1.0762e-12, 9.0711e-13,
          6.1447e-13, 5.1494e-14, 1.0251e-11, 6.6931e-13, 8.1382e-13,
          3.7568e-12, 1.4301e-13, 2.5912e-12, 2.4854e-12, 1.7123e-13,
          2.1507e-12, 3.3900e-13],
         [2.3727e-12, 1.0000e+00, 8.8219e-12, 5.1248e-13, 5.6423e-12,
          7.9093e-14, 5.0859e-13, 2.8068e-11, 6.1158e-13, 5.1874e-13,
          7.4174e-12, 1.3052e-12, 2.9711e-12, 1.1920e-12, 2.8326e-13,
          1.7420e-12, 8.9375e-13],
         [3.0753e-13, 3.1417e-12, 1.0000e+00, 1.1043e-12, 1.6654e-12,
          7.2105e-13, 4.1869e-13, 1.7795e-11, 2.6936e-13, 9.4876e-13,
          4.4178e-12, 2.1883e-13, 4.9394e-11, 9.1057e-12, 4.7360e-14,
          6.3016e-13, 9.0321e-13],
         [5.1940e-13, 2.7473e-13, 1.6624e-12, 1.0000e+00, 4.3392e-12,
          2.2688e-12, 2.9791e-13, 3.9915e-12, 3.7991e-13, 1.5966e-13,
          1.4246e-11, 1.0020e-12, 2.8304e-12, 2.0016e-11, 1.8846e-13,
          7.3744e-13, 4.2976e-13],
         [2.0603e-13