In [29]:
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
import numpy as np
import pandas as pd
import scienceplots
import matplotlib.pyplot as plt
import torch.nn.functional as F

# Apply the plotting style sheets for all later figures. The 'science', 'notebook',
# 'grid' and 'ieee' styles are presumably registered by the `scienceplots` import
# above -- TODO confirm.
plt.style.use(['science', 'notebook', 'grid', 'ieee'])

![transformers](transformers.png)

- Compared with the encoder, the decoder has two special attention sublayers
    - masked multi-head (**self**) attention
    - encoder-decoder (**cross**) attention
        - (k, v) come from the encoder output (also called *memory*, i.e. the output of the last encoder layer)
        - q: decoder input
    - The two attention sublayers do not share weights

#### Masked attention

In [2]:
# Checkpoint identifier shared by the model, tokenizer and config loads below.
model_name = 'bert-base-uncased'
# Load the model, its tokenizer and its configuration for this checkpoint.
# Only `tokenizer` and `config` are used in the cells that follow.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
# Vocabulary size and hidden (embedding) width taken from the loaded config.
config.vocab_size, config.hidden_size

(30522, 768)

In [4]:
# A freshly constructed embedding table (not the pretrained BERT weights),
# sized to the checkpoint's vocabulary and hidden width.
token_embedding = nn.Embedding(
    num_embeddings=config.vocab_size,
    embedding_dim=config.hidden_size,
)
sample_text = 'time flies like an arrow'
# No special tokens are added, so the encoding holds one id per word of the sample.
model_inputs = tokenizer(
    sample_text,
    add_special_tokens=False,
    return_tensors='pt',
)
model_inputs

{'input_ids': tensor([[ 2051, 10029,  2066,  2019,  8612]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [7]:
# Look up one embedding vector per token id: ids of shape (1, 5) -> embeddings of shape (1, 5, 768).
input_embeddings = token_embedding(model_inputs['input_ids'])
input_embeddings.shape

torch.Size([1, 5, 768])

### Lower triangular matrix

In [13]:
import math

# Self-attention: queries, keys and values are all the same embeddings.
q = k = v = input_embeddings
# torch.bmm is a batched matrix product:
#   (1, 5, 768) @ (1, 768, 5) -> (1, 5, 5)
# The raw dot products are then divided by sqrt(d_k), with d_k the key width.
scores = torch.bmm(q, k.permute(0, 2, 1)).div(math.sqrt(k.shape[-1]))
scores, scores.shape

(tensor([[[28.1406, -0.3434, -2.2413,  0.5477, -1.5799],
          [-0.3434, 28.3550, -1.6914, -0.3065,  0.2185],
          [-2.2413, -1.6914, 27.5898,  0.1182, -0.4554],
          [ 0.5477, -0.3065,  0.1182, 27.9868,  0.4677],
          [-1.5799,  0.2185, -0.4554,  0.4677, 28.8011]]],
        grad_fn=<DivBackward0>),
 torch.Size([1, 5, 5]))

In [17]:
# Number of token ids along the last axis of the encoded input.
seq_len = model_inputs['input_ids'].shape[-1]
# Lower-triangular matrix of ones (zeros above the diagonal) with a leading
# axis of size 1 so it lines up with the (1, seq_len, seq_len) scores.
mask = torch.ones(seq_len, seq_len).tril()[None]
mask.shape, mask

(torch.Size([1, 5, 5]),
 tensor([[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]))

In [18]:
# Out-of-place fill: returns a new tensor in which every position where
# mask == 0 (the entries above the diagonal) is replaced by -inf.
scores.masked_fill(mask == 0, -float('inf'))

tensor([[[28.1406,    -inf,    -inf,    -inf,    -inf],
         [-0.3434, 28.3550,    -inf,    -inf,    -inf],
         [-2.2413, -1.6914, 27.5898,    -inf,    -inf],
         [ 0.5477, -0.3065,  0.1182, 27.9868,    -inf],
         [-1.5799,  0.2185, -0.4554,  0.4677, 28.8011]]],
       grad_fn=<MaskedFillBackward0>)

In [20]:
# `masked_fill` (no trailing underscore) returned a new tensor, so `scores` itself is unchanged.
scores

tensor([[[28.1406, -0.3434, -2.2413,  0.5477, -1.5799],
         [-0.3434, 28.3550, -1.6914, -0.3065,  0.2185],
         [-2.2413, -1.6914, 27.5898,  0.1182, -0.4554],
         [ 0.5477, -0.3065,  0.1182, 27.9868,  0.4677],
         [-1.5799,  0.2185, -0.4554,  0.4677, 28.8011]]],
       grad_fn=<DivBackward0>)

In [23]:
# In-place variant (note the trailing underscore): overwrites `scores` itself.
scores.masked_fill_(mask == 0, -float('inf'))

tensor([[[28.1406,    -inf,    -inf,    -inf,    -inf],
         [-0.3434, 28.3550,    -inf,    -inf,    -inf],
         [-2.2413, -1.6914, 27.5898,    -inf,    -inf],
         [ 0.5477, -0.3065,  0.1182, 27.9868,    -inf],
         [-1.5799,  0.2185, -0.4554,  0.4677, 28.8011]]],
       grad_fn=<MaskedFillBackward0>)

In [24]:
# The in-place `masked_fill_` call above has modified `scores` itself.
scores

tensor([[[28.1406,    -inf,    -inf,    -inf,    -inf],
         [-0.3434, 28.3550,    -inf,    -inf,    -inf],
         [-2.2413, -1.6914, 27.5898,    -inf,    -inf],
         [ 0.5477, -0.3065,  0.1182, 27.9868,    -inf],
         [-1.5799,  0.2185, -0.4554,  0.4677, 28.8011]]],
       grad_fn=<MaskedFillBackward0>)

$$\exp(-\infty) = 0$$

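This is what makes the mask work in the later $\mathrm{softmax}(\cdot)$ step: every position filled with $-\infty$ receives an attention weight of exactly 0.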

In [25]:
# Numerical check that exp(-inf) evaluates to 0.
torch.exp(torch.tensor(-float('inf')))

tensor(0.)

#### Masked self attention

In [30]:
def scaled_dot_attn(q, k, v, mask=None, verbose=True):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Works on tensors with any number of leading batch dimensions, e.g.
    ``(batch, seq, dim)`` or multi-head ``(batch, heads, seq, dim)``.

    Args:
        q: Query tensor of shape ``(..., len_q, d_k)``.
        k: Key tensor of shape ``(..., len_k, d_k)``.
        v: Value tensor of shape ``(..., len_k, d_v)``.
        mask: Optional tensor broadcastable to ``(..., len_q, len_k)``.
            Positions where ``mask == 0`` are excluded from attention.
            A query row whose keys are *all* masked produces NaN weights,
            because softmax over a row of ``-inf`` is undefined.
        verbose: When true (the default, matching the original behaviour),
            print the attention weights and their shape.

    Returns:
        Tensor of shape ``(..., len_q, d_v)``: the attention-weighted sum of
        the values.
    """
    dim_k = k.size(-1)
    # transpose(-2, -1) + matmul instead of transpose(1, 2) + bmm so that
    # extra leading dimensions (e.g. attention heads) are supported.
    scores = torch.matmul(q, k.transpose(-2, -1) / math.sqrt(dim_k))
    if mask is not None:
        # -inf scores become exactly 0 after softmax, since exp(-inf) == 0.
        scores = scores.masked_fill(mask == 0, -float('inf'))
    attn_weights = F.softmax(scores, dim=-1)
    if verbose:
        print(attn_weights, attn_weights.shape)
    return torch.matmul(attn_weights, v)

In [31]:
# Masked self-attention over the embeddings using the lower-triangular mask;
# the function prints the attention weights before the result is displayed.
scaled_dot_attn(q, k, v, mask)

tensor([[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [3.4392e-13, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.1079e-13, 1.9200e-13, 1.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.2115e-12, 5.1565e-13, 7.8852e-13, 1.0000e+00, 0.0000e+00],
         [6.3932e-14, 3.8617e-13, 1.9682e-13, 4.9541e-13, 1.0000e+00]]],
       grad_fn=<SoftmaxBackward0>) torch.Size([1, 5, 5])


tensor([[[ 0.2933, -0.3704,  0.2531,  ...,  0.6531,  0.5391,  1.1173],
         [ 0.2266, -0.5755, -1.2444,  ...,  0.9320,  0.8994,  0.9623],
         [ 0.1204, -1.8380,  1.2742,  ...,  0.4473, -2.7296, -0.7691],
         [ 0.6016, -0.0558, -0.6753,  ..., -0.8899, -0.1966,  0.2929],
         [-1.4113,  0.5819,  1.2363,  ..., -0.3869, -1.4703, -2.7841]]],
       grad_fn=<BmmBackward0>)