# Transformer from Scratch



In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
import logging
# Dedicated logger for tensor-shape tracing; records carry only the message text.
logger = logging.getLogger("tensor_shapes")
handler = logging.StreamHandler()
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would otherwise stack another StreamHandler on it and
# print every record once per run. Detach handlers left over from earlier runs
# before attaching the fresh one.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(handler)
# 1 is the lowest TensorLoggingLevels value, so every shape record is emitted.
logger.setLevel(1)

In [29]:
import inspect
def getclass():
    """Return the class of the nearest ``self`` found on the call stack.

    Starting from the direct caller, each frame is inspected in turn and the
    first one that has a local variable named ``self`` determines the result.
    This replaces a hard-coded frame index, which only worked when the helper
    was reached through exactly one intermediate function and one wrapper.

    Returns:
        ``type``: ``self.__class__`` of the closest calling method.

    Raises:
        KeyError: if no calling frame has a local named ``self``.
    """
    frame = inspect.currentframe()
    try:
        # Skip this function's own frame, then walk outwards.
        frame = frame.f_back if frame is not None else None
        while frame is not None:
            caller_locals = frame.f_locals
            if "self" in caller_locals:
                return caller_locals["self"].__class__
            frame = frame.f_back
    finally:
        # Drop the frame reference so it does not keep a reference cycle alive.
        del frame
    raise KeyError("self")

# A helper function to check how tensor sizes change
def log_size(tsr: torch.Tensor, name: str):
    """Log the shape of ``tsr`` under the label ``name``.

    The record goes to the module-level ``logger`` at the ``level`` attribute
    of the class returned by ``getclass()``. The message is a list of three
    one-element sets (class name, label, shape), so it renders as
    ``[{'ClassName'}, {'label'}, {torch.Size([...])}]``.

    Args:
        tsr: tensor whose ``shape`` is reported.
        name: free-form label describing the tensor.
    """
    # getclass() inspects the call stack, so call it directly from this body
    # rather than from a nested helper or comprehension.
    owner = getclass()
    payload = [{owner.__name__}, {name}, {tsr.shape}]
    logger.log(owner.level, payload)

In [30]:
from enum import IntEnum
# Control how much debugging output we want
class TensorLoggingLevels(IntEnum):
    """Logging level attached to each kind of module in this notebook.

    ``log_size`` emits a module's shape records at that module class's
    ``level``; setting the ``tensor_shapes`` logger threshold to one of these
    members therefore hides the records of every member with a smaller value.
    """
    (
        attention,                  # 1
        attention_head,             # 2
        multihead_attention_block,  # 3
        enc_dec_block,              # 4
        enc_dec,                    # 5
    ) = range(1, 6)

In [31]:
class Dim(IntEnum):
    """Named axis indices for tensors laid out as (batch, seq, feature)."""
    batch, seq, feature = range(3)  # 0, 1, 2

# Components


### Scaled dot product attention

$$ \textrm{Attention}(Q, K, V) = \textrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V $$

In [32]:
import math 

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        dropout: probability handed to the ``nn.Dropout`` that is applied to
            the normalised attention weights.
    """
    level = TensorLoggingLevels.attention

    def __init__(self, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` and return the weighted sum of ``v``.

        Args:
            q: queries, 3-D tensor (batch, query positions, d_k).
            k: keys, 3-D tensor (batch, key positions, d_k).
            v: values, 3-D tensor (batch, key positions, value features).
            mask: optional boolean tensor broadcastable to the
                (batch, query positions, key positions) score tensor;
                positions set to ``True`` receive zero attention weight.

        Returns:
            Tensor of shape (batch, query positions, value features).

        Note:
            A query row whose keys are all masked has no valid distribution
            and yields NaN.
        """
        d_k = k.size(-1)
        assert q.size(-1) == d_k

        # Dot product between every query and every key, per batch element,
        # scaled by sqrt(d_k).
        scores = torch.bmm(q, k.transpose(Dim.seq, Dim.feature))
        scores = scores / math.sqrt(d_k)

        log_size(scores, "attention weight")  # (Batch, Seq, Seq)

        if mask is not None:
            # -inf before the softmax gives exactly zero weight afterwards.
            scores = scores.masked_fill(mask, float("-inf"))
        # torch.softmax subtracts the row maximum internally, so large scores
        # no longer overflow exp() and turn the normalisation into inf / inf.
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)
        log_size(output, "attention output size")  # (Batch, Seq, Feature)
        return output

In [33]:
# Attention module with the default dropout probability (0.1).
attn = ScaledDotProductAttention()

In [34]:
# Three independent uniform-random demo inputs, each shaped
# (batch=5, seq=10, feature=20); drawn in the order q, k, v.
q, k, v = (torch.rand(5, 10, 20) for _ in range(3))

In [35]:
# Smoke test without a mask; the cell displays the returned tensor, and the
# two log_size records (score shape, output shape) are printed above it.
attn(q, k, v)

[{'ScaledDotProductAttention'}, {'attention weight'}, {torch.Size([5, 10, 10])}]
[{'ScaledDotProductAttention'}, {'attention output size'}, {torch.Size([5, 10, 20])}]


tensor([[[0.3649, 0.3846, 0.4327, 0.4742, 0.4355, 0.5667, 0.3473, 0.6755,
          0.3682, 0.3680, 0.4695, 0.4058, 0.6266, 0.2510, 0.3948, 0.3747,
          0.5287, 0.3257, 0.4595, 0.5059],
         [0.5351, 0.4985, 0.4842, 0.4417, 0.4807, 0.6019, 0.4135, 0.7394,
          0.3959, 0.5468, 0.5392, 0.4992, 0.6482, 0.3249, 0.5005, 0.4512,
          0.6011, 0.3977, 0.4671, 0.5979],
         [0.5860, 0.4891, 0.5454, 0.5410, 0.5302, 0.7143, 0.4550, 0.8638,
          0.5148, 0.5719, 0.5614, 0.5711, 0.7120, 0.4048, 0.5165, 0.4857,
          0.7224, 0.4134, 0.5618, 0.6230],
         [0.3939, 0.3823, 0.3936, 0.4944, 0.4758, 0.5268, 0.2933, 0.6908,
          0.3956, 0.3721, 0.3852, 0.4262, 0.5831, 0.2916, 0.4326, 0.4049,
          0.5467, 0.3257, 0.4500, 0.5605],
         [0.5742, 0.4893, 0.5407, 0.5308, 0.5406, 0.7119, 0.4438, 0.8632,
          0.5032, 0.5491, 0.5844, 0.5421, 0.7282, 0.4040, 0.5288, 0.4995,
          0.7226, 0.4289, 0.5676, 0.6335],
         [0.5715, 0.5084, 0.5340, 0.5305, 0.5

### Multi Head Attention

In [37]:
class AttentionHead(nn.Module):
    """One attention head: linear projections of the inputs followed by
    scaled dot-product attention.

    Args:
        d_model: size of the last dimension of the incoming queries, keys and
            values.
        d_feature: size each of them is projected to before attention.
        dropout: dropout probability forwarded to ``ScaledDotProductAttention``.
    """
    level = TensorLoggingLevels.attention_head

    def __init__(self, d_model, d_feature, dropout=0.1):
        super().__init__()
        # We assume that the queries, keys and values all have the same feature size.
        self.attn = ScaledDotProductAttention(dropout)
        self.query_tfm = nn.Linear(d_model, d_feature)
        self.key_tfm = nn.Linear(d_model, d_feature)
        self.value_tfm = nn.Linear(d_model, d_feature)

    def forward(self, queries, keys, values, mask=None):
        """Project the inputs and run attention over them.

        Args:
            queries: tensor whose last dimension is ``d_model``.
            keys: tensor whose last dimension is ``d_model``.
            values: tensor whose last dimension is ``d_model``.
            mask: optional mask handed unchanged to the attention module.

        Returns:
            The output of the attention module for the projected inputs.
        """
        Q = self.query_tfm(queries)
        K = self.key_tfm(keys)
        V = self.value_tfm(values)
        # Only Q is logged; K and V go through projections of the same width.
        log_size(Q, "queries, keys, vals")

        # The mask must be forwarded, otherwise callers' masks are silently ignored.
        x = self.attn(Q, K, V, mask=mask)
        return x

In [38]:
# d_model=20 matches the last dimension of q, k and v; d_feature=20 keeps the
# projected size the same. The call displays the head's output tensor.
attn_head = AttentionHead(20, 20)
attn_head(q, k, v)

[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'ScaledDotProductAttention'}, {'attention weight'}, {torch.Size([5, 10, 10])}]
[{'ScaledDotProductAttention'}, {'attention output size'}, {torch.Size([5, 10, 20])}]


tensor([[[ 2.5443e-02, -3.1271e-03,  2.5496e-01,  2.7784e-01,  3.4062e-01,
          -3.7640e-01, -1.1357e-01,  3.7572e-01,  2.5182e-01,  1.0895e-01,
          -1.1351e-02,  5.9561e-02, -3.5149e-01,  1.4703e-01, -1.6218e-01,
          -1.1671e-01, -6.2780e-01, -4.7114e-01, -1.9149e-01, -2.4516e-01],
         [ 4.4107e-02,  9.5427e-03,  2.0750e-01,  2.3518e-01,  2.3767e-01,
          -2.7780e-01, -7.3458e-02,  2.9576e-01,  1.9902e-01,  8.6987e-02,
          -3.5164e-02,  5.2046e-02, -2.4403e-01,  9.7605e-02, -7.9549e-02,
          -6.6505e-02, -4.8039e-01, -3.8352e-01, -1.3561e-01, -1.8427e-01],
         [ 2.4462e-02,  3.5541e-02,  2.3646e-01,  2.6081e-01,  2.8128e-01,
          -3.2538e-01, -7.1484e-02,  3.3687e-01,  2.2953e-01,  9.6201e-02,
          -2.6813e-02,  6.9793e-02, -2.9232e-01,  1.2726e-01, -1.1854e-01,
          -9.2911e-02, -5.6253e-01, -4.5004e-01, -1.4179e-01, -2.1471e-01],
         [ 3.9473e-02, -4.8544e-03,  2.1772e-01,  2.8330e-01,  3.2407e-01,
          -3.3086e-01,

The multi-head attention block, as described in the paper "Attention Is All You Need", runs several attention heads in parallel, concatenates their outputs, and applies a single linear projection to the result.

In [39]:
# Raise the threshold to attention_head (2): records logged at
# TensorLoggingLevels.attention (1), i.e. those from ScaledDotProductAttention,
# are no longer emitted.
logger.setLevel(TensorLoggingLevels.attention_head)

In [None]:
class MultiHeadAttention(nn.Module):
    level = TensorLoggingLevels.multihead_attention_block
    def __init__(self, d_model, d_feature, n_heads, dropout=0.1):
        