# Building Transformer from Scratch



The code is based on the following repositories and blog posts:

- [attention-is-all-you-need-pytorch](https://github.com/jadore801120/attention-is-all-you-need-pytorch)
- [pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT)
- [The Annotated Transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html) 

Thanks so much to their authors!

In [64]:
import torch
import torch.nn as nn
import numpy as np

In [65]:
import logging
logger = logging.getLogger("tensor_shapes")
# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise attach one more handler each time and every
# message would be printed once per attached handler.
if not logger.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
            '%(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
# Level 1 is not above any TensorLoggingLevels value, so every shape message is emitted.
logger.setLevel(1)

In [66]:
import inspect
def getclass():
    """Return the class of the object whose method called our caller.

    The expected call chain, innermost first, is ``getclass`` <- a helper
    such as ``log_size`` <- a method that has a local named ``self``. The
    frames are walked outward starting at the helper's caller, and the class
    of the first ``self`` found is returned. Searching (rather than indexing
    a fixed stack depth) gives the same answer whether the method was reached
    through ``Module.__call__`` or invoked directly.

    Returns:
        The ``__class__`` of the nearest enclosing ``self``.

    Raises:
        KeyError: If no enclosing frame has a local named ``self``.
    """
    frame = inspect.currentframe()
    try:
        # Step over this function's own frame and the helper that called it.
        for _ in range(2):
            if frame is not None:
                frame = frame.f_back
        while frame is not None:
            if "self" in frame.f_locals:
                return frame.f_locals["self"].__class__
            frame = frame.f_back
        raise KeyError("self")
    finally:
        # Drop the frame reference so it cannot keep a reference cycle alive.
        del frame

# A helper function to check how tensor sizes change
def log_size(tsr: torch.Tensor, name: str):
    """Log the shape of ``tsr`` under the label ``name``.

    The record is sent to the module-level ``logger`` at the ``level``
    attribute of the class returned by ``getclass()``, so each component
    controls how verbose it is.

    Args:
        tsr: Tensor whose ``shape`` is reported.
        name: Human-readable label for the tensor.
    """
    owner = getclass()
    # NOTE(review): the message is a list of three one-element sets, which
    # looks like an f-string that lost its quotes; kept as-is because the
    # recorded notebook outputs use this exact format.
    message = [{owner.__name__}, {name}, {tsr.shape}]
    logger.log(owner.level, message)

In [67]:
from enum import IntEnum
# Control how much debugging output we want
class TensorLoggingLevels(IntEnum):
    """Logging levels used by the model components when reporting tensor shapes.

    Each module class stores one member as its ``level`` attribute and
    ``log_size`` logs at that level. Passing a member to ``logger.setLevel``
    therefore hides the messages of every component with a smaller value.
    """
    attention = 1  # ScaledDotProductAttention
    attention_head = 2  # AttentionHead
    multihead_attention_block = 3  # MultiHeadAttention
    enc_dec_block = 4  # EncoderBlock / DecoderBlock
    enc_dec = 5  # TransformerEncoder / TransformerDecoder

In [68]:
class Dim(IntEnum):
    """Named axis indices, passed as ``dim`` arguments to ``transpose`` and ``torch.cat``."""
    batch = 0  # axis 0
    seq = 1  # axis 1
    feature = 2  # axis 2

# Components


### Scaled dot product attention

$$ \textrm{Attention}(Q, K, V) = \textrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V $$

In [69]:
import math 

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        dropout: Dropout probability applied to the attention weights.
    """
    level = TensorLoggingLevels.attention

    def __init__(self, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` and return the weighted sum of ``v``.

        Args:
            q: 3-D query tensor (``torch.bmm`` requires three dimensions).
            k: 3-D key tensor whose last dimension matches that of ``q``.
            v: 3-D value tensor.
            mask: Optional mask passed to ``masked_fill``; positions where it
                is true receive zero attention weight. Presumably a boolean
                tensor broadcastable to the score matrix; confirm with callers.

        Returns:
            Tensor of shape (Batch, Seq, Feature of ``v``).
        """
        d_k = k.size(-1)
        assert q.size(-1) == d_k

        # Dot product between queries and keys for each batch and position in the sequence.
        scores = torch.bmm(q, k.transpose(Dim.seq, Dim.feature))
        scores = scores / math.sqrt(d_k)

        log_size(scores, "attention weight") # (Batch, Seq, Seq)

        if mask is not None:
            # -inf before the softmax is the same as zeroing exp(score) before
            # normalising: masked positions end up with weight 0.
            scores = scores.masked_fill(mask, float("-inf"))
        # torch.softmax subtracts the row maximum internally, so large scores
        # cannot overflow the way an explicit exp() followed by a sum can.
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v) # (Batch, Seq, Feature)
        log_size(output, "attention output size") # (Batch, Seq, Feature)
        return output

In [70]:
# Attention layer with the default dropout probability (0.1).
attn = ScaledDotProductAttention()

In [71]:
# Three independent uniform-random tensors of shape (5, 10, 20), drawn in
# the order q, k, v, used as toy queries, keys and values.
q, k, v = (torch.rand(5, 10, 20) for _ in range(3))

In [72]:
# Run attention on the toy tensors; log_size prints the intermediate shapes.
attn(q, k, v)

[{'ScaledDotProductAttention'}, {'attention weight'}, {torch.Size([5, 10, 10])}]
[{'ScaledDotProductAttention'}, {'attention weight'}, {torch.Size([5, 10, 10])}]
[{'ScaledDotProductAttention'}, {'attention output size'}, {torch.Size([5, 10, 20])}]
[{'ScaledDotProductAttention'}, {'attention output size'}, {torch.Size([5, 10, 20])}]


tensor([[[0.5747, 0.5588, 0.5604, 0.5413, 0.4001, 0.5414, 0.5919, 0.5281,
          0.4878, 0.4383, 0.4066, 0.5405, 0.4926, 0.5728, 0.4668, 0.4833,
          0.3724, 0.3557, 0.5482, 0.5897],
         [0.5776, 0.5657, 0.5311, 0.5910, 0.4681, 0.5854, 0.5741, 0.5346,
          0.6200, 0.4737, 0.3521, 0.5186, 0.4846, 0.5690, 0.4792, 0.5204,
          0.3871, 0.4188, 0.5351, 0.5517],
         [0.5668, 0.5620, 0.5910, 0.5604, 0.4933, 0.5882, 0.5743, 0.5497,
          0.5484, 0.5404, 0.4237, 0.5093, 0.5146, 0.6038, 0.4010, 0.5421,
          0.3051, 0.4411, 0.5341, 0.5842],
         [0.6067, 0.6504, 0.6093, 0.5938, 0.5001, 0.6017, 0.6295, 0.5948,
          0.6072, 0.5263, 0.4161, 0.5852, 0.5826, 0.6212, 0.4943, 0.5474,
          0.4130, 0.4358, 0.5729, 0.5924],
         [0.5021, 0.5467, 0.4901, 0.5166, 0.4331, 0.5713, 0.5534, 0.5780,
          0.5203, 0.5151, 0.4063, 0.5148, 0.5387, 0.5561, 0.4459, 0.5483,
          0.3647, 0.4241, 0.5175, 0.4893],
         [0.5491, 0.4776, 0.5300, 0.5573, 0.4

### Multi Head Attention

In [73]:
class AttentionHead(nn.Module):
    """A single attention head: three linear projections followed by attention.

    Queries, keys and values are each mapped from ``d_model`` to ``d_feature``
    features by their own ``nn.Linear`` layer and then passed to
    ``ScaledDotProductAttention``.

    Args:
        d_model: Feature size of the inputs.
        d_feature: Feature size produced by each projection.
        dropout: Dropout probability handed to the attention layer.
    """
    level = TensorLoggingLevels.attention_head

    def __init__(self, d_model, d_feature, dropout = 0.1):
        super().__init__()
        # We assume that the queries, keys and values all have the same feature size.
        self.attn = ScaledDotProductAttention(dropout)
        self.query_tfm = nn.Linear(d_model, d_feature)
        self.key_tfm = nn.Linear(d_model, d_feature)
        self.value_tfm = nn.Linear(d_model, d_feature)

    def forward(self, queries, keys, values, mask=None):
        """Project the inputs and apply attention.

        Args:
            queries: Input to the query projection.
            keys: Input to the key projection.
            values: Input to the value projection.
            mask: Optional attention mask, forwarded unchanged to the
                attention layer.

        Returns:
            The attention layer's output for the projected inputs.
        """
        Q = self.query_tfm(queries)
        K = self.key_tfm(keys)
        V = self.value_tfm(values)
        log_size(Q, "queries, keys, vals")

        # The mask has to be forwarded here; otherwise it never reaches the
        # attention computation and masking is silently disabled.
        x = self.attn(Q, K, V, mask=mask)
        return x

In [74]:
# One head projecting 20 input features to 20 features, applied to the
# q, k, v tensors created earlier.
attn_head = AttentionHead(20, 20)
attn_head(q, k, v)

[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'ScaledDotProductAttention'}, {'attention weight'}, {torch.Size([5, 10, 10])}]
[{'ScaledDotProductAttention'}, {'attention weight'}, {torch.Size([5, 10, 10])}]
[{'ScaledDotProductAttention'}, {'attention output size'}, {torch.Size([5, 10, 20])}]
[{'ScaledDotProductAttention'}, {'attention output size'}, {torch.Size([5, 10, 20])}]


tensor([[[ 0.3332,  0.1562, -0.2520, -0.5439,  0.7122,  0.5628,  0.1360,
          -0.4152,  0.4826, -0.1513,  0.1452, -0.4091, -0.6543, -0.0346,
           0.1930, -0.2207,  0.0200,  0.1493, -0.6084, -0.5273],
         [ 0.3476,  0.1558, -0.2437, -0.5314,  0.6916,  0.6086,  0.1444,
          -0.4102,  0.5066, -0.1382,  0.1524, -0.4225, -0.6639, -0.0427,
           0.1914, -0.1967, -0.0146,  0.1179, -0.6035, -0.5093],
         [ 0.2885,  0.1691, -0.2391, -0.5583,  0.7181,  0.5551,  0.1639,
          -0.4489,  0.4768, -0.1536,  0.1537, -0.4600, -0.6541, -0.0337,
           0.1846, -0.2111,  0.0392,  0.1679, -0.6096, -0.5034],
         [ 0.3480,  0.1311, -0.2577, -0.5335,  0.6823,  0.5617,  0.1261,
          -0.4567,  0.4900, -0.1234,  0.1167, -0.4095, -0.6620, -0.0324,
           0.1794, -0.2123,  0.0020,  0.1590, -0.5870, -0.4732],
         [ 0.2952,  0.1616, -0.1887, -0.4129,  0.4859,  0.4585,  0.0795,
          -0.3534,  0.4185, -0.1273,  0.0311, -0.3608, -0.5353, -0.0040,
          

The multi-head attention block applies multiple attention heads to the same inputs, as described in the paper "Attention Is All You Need", then concatenates their outputs and applies a single linear projection.

In [75]:
# Raise the threshold to 2 so the level-1 messages from
# ScaledDotProductAttention are no longer printed.
logger.setLevel(TensorLoggingLevels.attention_head)

In [76]:
class MultiHeadAttention(nn.Module):
    """Run several ``AttentionHead`` modules and merge their outputs.

    Every head receives the same inputs; the head outputs are concatenated
    along the feature axis and passed through one linear projection back to
    ``d_model`` features.

    Args:
        d_model: Feature size of the inputs and of the final output.
        d_feature: Feature size produced by each head.
        n_heads: Number of heads; ``d_feature * n_heads`` must equal ``d_model``.
        dropout: Dropout probability handed to every head.
    """
    level = TensorLoggingLevels.multihead_attention_block

    def __init__(self, d_model, d_feature, n_heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_feature = d_feature
        self.n_heads = n_heads

        assert d_model == d_feature * n_heads

        head_modules = [AttentionHead(d_model, d_feature, dropout)
                        for _ in range(n_heads)]
        self.attn_heads = nn.ModuleList(head_modules)
        self.projection = nn.Linear(d_feature * n_heads, d_model)

    def forward(self, queries, keys, values, mask=None):
        """Apply every head to the inputs, concatenate and project.

        Returns:
            Tensor of shape (Batch, Sequence, D_model).
        """
        log_size(queries, "Input queries")

        # One (Batch, Seq, Feature) tensor per head.
        head_outputs = []
        for head in self.attn_heads:
            head_outputs.append(head(queries, keys, values, mask=mask))
        log_size(head_outputs[0], "Output of single head")

        # Stitch the heads back together along the feature axis.
        merged = torch.cat(head_outputs, dim=Dim.feature) # (Batch, Sequence, D_Feature * n_heads)
        log_size(merged, "Concatenated output")

        projected = self.projection(merged) # (Batch, Sequence, D_model)
        log_size(projected, "projected output")
        return projected

In [77]:
# Eight heads of 20 features each, so d_model is 160; the toy inputs are
# tiled 8 times along the last axis to reach that width.
heads = MultiHeadAttention(20 * 8, 20, 8)
heads(q.repeat(1, 1, 8), 
      k.repeat(1, 1, 8), 
      v.repeat(1, 1, 8))

[{'MultiHeadAttention'}, {'Input queries'}, {torch.Size([5, 10, 160])}]
[{'MultiHeadAttention'}, {'Input queries'}, {torch.Size([5, 10, 160])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10, 20])}]
[{'AttentionHead'}, {'queries, keys, vals'}, {torch.Size([5, 10,

tensor([[[-0.3098,  0.0632, -0.0779,  ...,  0.1630,  0.1845,  0.0035],
         [-0.2915,  0.0624, -0.1241,  ...,  0.1605,  0.2087,  0.0033],
         [-0.3313,  0.0462, -0.1258,  ...,  0.1711,  0.2242,  0.0394],
         ...,
         [-0.3081,  0.0769, -0.1357,  ...,  0.1171,  0.2425,  0.0631],
         [-0.3345,  0.0206, -0.1098,  ...,  0.1247,  0.2087,  0.0542],
         [-0.3330,  0.0090, -0.1419,  ...,  0.1360,  0.2619,  0.0388]],

        [[-0.2791,  0.0551, -0.1730,  ...,  0.1422,  0.2262,  0.0197],
         [-0.2610,  0.0868, -0.1833,  ...,  0.1543,  0.1849,  0.0168],
         [-0.2864,  0.1029, -0.1754,  ...,  0.1311,  0.2076,  0.0212],
         ...,
         [-0.2929,  0.1188, -0.1651,  ...,  0.1700,  0.2072,  0.0022],
         [-0.2418,  0.0806, -0.1319,  ...,  0.1424,  0.1861, -0.0084],
         [-0.2603,  0.0978, -0.1801,  ...,  0.1722,  0.2179,  0.0137]],

        [[-0.2872,  0.1403, -0.1051,  ...,  0.1184,  0.1804,  0.0495],
         [-0.3351,  0.1149, -0.0621,  ...,  0

### The Encoder

The encoder is made up of the following components:
- multi-head attention block
- simple feedforward neural network

These components are connected using residual connections and layer normalization.

In [78]:
# Raise the threshold to 3 so only MultiHeadAttention and higher-level
# components print their tensor shapes.
logger.setLevel(TensorLoggingLevels.multihead_attention_block)

In [79]:
class LayerNorm(nn.Module):
    """Normalise the last axis, then apply a learned scale and shift.

    Computes ``gamma * (x - mean) / (std + eps) + beta``, where ``mean`` and
    ``std`` are taken over the last axis with ``Tensor.mean`` / ``Tensor.std``.

    Args:
        d_model: Size of the last axis (length of ``gamma`` and ``beta``).
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, d_model, eps =1e-8):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))   # scale, starts at 1
        self.beta = nn.Parameter(torch.zeros(d_model))   # shift, starts at 0
        self.eps = eps

    def forward(self, x):
        """Return ``x`` normalised over its last axis."""
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        centered = x - mean
        scaled = self.gamma * centered
        denominator = std + self.eps
        return scaled / denominator + self.beta
In [80]:
class EncoderBlock(nn.Module):
    """One encoder layer: multi-head self-attention then a feed-forward net.

    Each sublayer's output is layer-normalised, passed through dropout and
    added to the sublayer's input (residual connection).

    Args:
        d_model: Feature size of the input and output.
        d_feature: Feature size of each attention head.
        d_ff: Hidden size of the position-wise feed-forward network.
        n_heads: Number of attention heads.
        dropout: Dropout probability.
    """
    level = TensorLoggingLevels.enc_dec_block

    def __init__(self, d_model=512, d_feature=64, d_ff=2048, n_heads=8, dropout=0.1):
        super().__init__()
        self.attn_head = MultiHeadAttention(d_model, d_feature, n_heads, dropout)
        # NOTE(review): this attribute name ends in a lowercase "l", not the
        # digit "1"; it is kept because renaming would change state_dict keys.
        self.layer_norml = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.position_wise_feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.layer_norm2 = LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Apply self-attention and the feed-forward network to ``x``.

        Args:
            x: Input tensor; it is used as queries, keys and values.
            mask: Optional attention mask forwarded to the attention block.

        Returns:
            Tensor with the same shape as ``x``.
        """
        log_size(x, "Encoder block input")
        attn = self.attn_head(x, x, x, mask=mask)
        # Log the tensor the label names, not the untouched block input.
        log_size(attn, "Attention Output")
        # Applying normalization and residual connection.
        x = x + self.dropout(self.layer_norml(attn))
        # Applying position-wise feedforward network
        pos = self.position_wise_feed_forward(x)
        log_size(pos, "Feedforward output")
        # Applying normalization and residual connection
        x = x + self.dropout(self.layer_norm2(pos))
        log_size(x, "Encoder size output")
        return x

In [81]:
# Encoder block with the default sizes (d_model=512, d_feature=64, d_ff=2048, n_heads=8).
enc = EncoderBlock()

In [82]:
# Push a random (5, 10, 512) tensor through the block.
enc(torch.rand(5, 10, 512))

[{'EncoderBlock'}, {'Encoder block input'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Encoder block input'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Input queries'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Input queries'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Output of single head'}, {torch.Size([5, 10, 64])}]
[{'MultiHeadAttention'}, {'Output of single head'}, {torch.Size([5, 10, 64])}]
[{'MultiHeadAttention'}, {'Concatenated output'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Concatenated output'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'projected output'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'projected output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Attention Output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Attention Output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Feedforward output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Feedforward output

tensor([[[ 0.2019, -0.8521, -1.5490,  ...,  1.8720,  0.6059,  0.6887],
         [-0.6209, -1.5672, -2.2606,  ...,  2.2722,  1.7333, -0.0730],
         [-0.8003, -1.9977, -2.1881,  ...,  2.0279,  0.5202, -0.3955],
         ...,
         [-0.8683, -0.7518, -1.7853,  ...,  1.6978,  1.4379,  0.6216],
         [ 0.0102, -0.6689, -1.9829,  ...,  1.8591,  0.2776,  0.8346],
         [-0.6425, -1.0068, -1.6025,  ...,  1.7708,  1.6560,  0.0441]],

        [[-1.3198, -0.1476, -2.2539,  ...,  2.1894,  0.2406, -0.1671],
         [-0.9608, -0.5074, -0.6843,  ...,  2.7851,  1.3299, -1.2358],
         [-0.3217, -1.9347, -1.2695,  ...,  1.6101,  0.9010, -1.2702],
         ...,
         [-1.2105, -0.4053, -2.1174,  ...,  1.4413,  0.7410, -0.1657],
         [-1.3018, -0.4798, -0.8744,  ...,  2.6181,  0.7699, -0.8672],
         [-1.2169, -0.8006,  0.2652,  ...,  2.3291,  0.3244, -0.8648]],

        [[ 0.7516, -1.5048, -1.0644,  ...,  1.6249,  0.9817,  0.0616],
         [-0.6448, -1.8260,  0.6772,  ...,  1

The encoder consists of six consecutive encoder blocks:

In [83]:
class TransformerEncoder(nn.Module):
    """A stack of ``EncoderBlock`` modules applied one after another.

    Args:
        n_blocks: Number of encoder blocks in the stack.
        d_model: Feature size of the input and output.
        n_heads: Number of attention heads per block; each head gets
            ``d_model // n_heads`` features.
        d_ff: Hidden size of each block's feed-forward network.
        dropout: Dropout probability.
    """
    level = TensorLoggingLevels.enc_dec

    def __init__(self, n_blocks = 6, d_model = 512, n_heads = 8, d_ff = 2048, dropout = 0.1):
        super().__init__()
        # n_heads must be passed along with d_feature: the attention block
        # asserts d_model == d_feature * n_heads, so relying on the block's
        # default head count fails for any other n_heads.
        self.encoders = nn.ModuleList([
            EncoderBlock(d_model = d_model, d_feature = d_model // n_heads,
                         d_ff = d_ff, n_heads = n_heads, dropout = dropout)
            for _ in range(n_blocks)
        ])

    # forward must be defined at class level; nested inside __init__ it would
    # be a local function and the module would have no forward method.
    def forward(self, x: torch.FloatTensor, mask = None):
        """Pass ``x`` through every encoder block in order.

        Args:
            x: Input tensor for the first block.
            mask: Optional attention mask given to every block.

        Returns:
            Output of the last encoder block.
        """
        for encoder in self.encoders:
            x = encoder(x, mask=mask)
        return x

### The Decoder

The decoder has the same structure as the encoder, with one additional multi-head attention block that takes the target sentence as input.

In [84]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, encoder attention, feed-forward.

    Each sublayer's output is layer-normalised with that sublayer's own
    ``LayerNorm``, passed through dropout and added to the sublayer's input.

    Args:
        d_model: Feature size of the input and output.
        d_feature: Feature size of each attention head.
        d_ff: Hidden size of the position-wise feed-forward network.
        n_heads: Number of attention heads.
        dropout: Dropout probability.
    """
    level = TensorLoggingLevels.enc_dec_block

    def __init__(self, d_model=512, d_feature=64, d_ff=2048, n_heads=8, dropout=0.1):
        super().__init__()
        self.masked_attn_head = MultiHeadAttention(d_model, d_feature, n_heads, dropout)
        self.attn_head = MultiHeadAttention(d_model, d_feature, n_heads, dropout)
        self.position_wise_feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )

        self.layer_norm1 = LayerNorm(d_model)
        self.layer_norm2 = LayerNorm(d_model)
        self.layer_norm3 = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out,
                src_mask=None, tgt_mask=None):
        """Run the three decoder sublayers.

        Args:
            x: Decoder-side input; used as queries, keys and values of the
                self-attention and as queries of the encoder attention.
            enc_out: Encoder output; used as keys and values of the encoder
                attention.
            src_mask: Optional mask for the attention over ``enc_out``.
            tgt_mask: Optional mask for the self-attention over ``x``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention over the decoder inputs. Keys and values are decoder
        # positions, so this is the sublayer the target mask applies to.
        att = self.masked_attn_head(x, x, x, mask=tgt_mask)
        x = x + self.dropout(self.layer_norm1(att))
        # Attention from the decoder state to the encoder outputs. Keys and
        # values are encoder positions, so the source mask applies here.
        att = self.attn_head(queries=x, keys=enc_out, values=enc_out, mask=src_mask)
        x = x + self.dropout(self.layer_norm2(att))
        # Applying position-wise feedforward network, normalised by its own
        # layer norm rather than reusing the encoder-attention one.
        pos = self.position_wise_feed_forward(x)
        x = x + self.dropout(self.layer_norm3(pos))
        return x

In [85]:
# Decoder block with default sizes, fed a random (5, 10, 512) input together
# with the encoder block's output for another random (5, 10, 512) tensor.
dec = DecoderBlock()
dec(torch.rand(5, 10, 512), enc(torch.rand(5, 10, 512)))

[{'EncoderBlock'}, {'Encoder block input'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Encoder block input'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Input queries'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Input queries'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Output of single head'}, {torch.Size([5, 10, 64])}]
[{'MultiHeadAttention'}, {'Output of single head'}, {torch.Size([5, 10, 64])}]
[{'MultiHeadAttention'}, {'Concatenated output'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'Concatenated output'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'projected output'}, {torch.Size([5, 10, 512])}]
[{'MultiHeadAttention'}, {'projected output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Attention Output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Attention Output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Feedforward output'}, {torch.Size([5, 10, 512])}]
[{'EncoderBlock'}, {'Feedforward output

tensor([[[-2.0071, -1.2843, -4.1290,  ..., -1.8316, -2.1579,  0.5785],
         [-0.2845, -0.8257, -1.4583,  ..., -0.8472, -3.2208,  1.6757],
         [-0.6898, -1.1150, -2.3871,  ..., -0.0223, -3.1017,  1.3372],
         ...,
         [-0.5948,  1.2356, -2.6924,  ..., -1.5750, -0.8074,  1.7195],
         [-1.3559, -0.3809, -2.2441,  ..., -2.3942, -2.6711,  1.2597],
         [-1.9006, -0.5104, -3.2280,  ...,  0.4660, -2.3734,  1.1669]],

        [[-0.0665, -0.7638, -2.8435,  ..., -2.0267, -1.3121,  0.7735],
         [-1.1719, -0.7845, -0.2363,  ..., -2.7859, -1.4313,  1.4471],
         [-0.7660, -0.3501, -2.9673,  ..., -2.6321, -1.2241,  3.3386],
         ...,
         [-1.9065, -1.7367, -3.0015,  ..., -1.7995, -2.0151,  0.3647],
         [-0.6908, -1.4623, -3.2172,  ..., -2.3560, -1.2471,  1.8071],
         [-0.4559, -0.2864, -2.9192,  ..., -1.9094, -1.3392,  1.9209]],

        [[-1.6635, -1.1010, -2.0872,  ..., -1.6636, -0.5840,  1.8209],
         [-1.7915, -1.8656, -2.6164,  ...,  0

In [86]:
class TransformerDecoder(nn.Module):
    """A stack of ``DecoderBlock`` modules applied one after another.

    Args:
        n_blocks: Number of decoder blocks in the stack.
        d_model: Feature size of the input and output.
        d_feature: Accepted for interface compatibility but not used; each
            head's feature size is always ``d_model // n_heads``.
        d_ff: Hidden size of each block's feed-forward network.
        n_heads: Number of attention heads per block.
        dropout: Dropout probability.
    """
    level = TensorLoggingLevels.enc_dec

    def __init__(self, n_blocks=6, d_model=512, d_feature=64,d_ff=2048, n_heads=8, dropout=0.1):
        super().__init__()
        # NOTE(review): created but never used in forward; kept so the
        # module's parameters and state_dict keys stay unchanged.
        self.position_embedding = PositionalEmbedding(d_model)
        # n_heads must be passed along with d_feature: the attention block
        # asserts d_model == d_feature * n_heads, so relying on the block's
        # default head count fails for any other n_heads.
        self.decoders = nn.ModuleList([
            DecoderBlock(d_model=d_model, d_feature=d_model // n_heads, d_ff=d_ff,
                         n_heads=n_heads, dropout=dropout)
            for _ in range(n_blocks)
        ])

    def forward(self, x: torch.FloatTensor, enc_out: torch.FloatTensor, src_mask=None, tgt_mask=None):
        """Pass ``x`` through every decoder block in order.

        Args:
            x: Decoder-side input for the first block.
            enc_out: Encoder output given to every block.
            src_mask: Optional mask forwarded to every block.
            tgt_mask: Optional mask forwarded to every block.

        Returns:
            Output of the last decoder block.
        """
        for decoder in self.decoders:
            x = decoder(x, enc_out, src_mask=src_mask, tgt_mask=tgt_mask)
        return x

### Positional Embeddings

Attention blocks don't have any notion of word order in a sentence. The Transformer explicitly adds the positional information via the positional embeddings.

In [114]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal position table.

    Even feature columns hold ``sin(position * div_term)`` and odd columns
    hold ``cos(position * div_term)``, with
    ``div_term = exp(-log(10000) * i / d_model)`` for ``i = 0, 2, 4, ...``.
    The table is stored as a parameter with ``requires_grad=False``.

    Args:
        d_model: Number of feature columns; odd values are supported.
        max_len: Number of positions in the table.
    """
    level = 1

    def __init__(self, d_model, max_len=512):
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim div_term to the number of odd columns. For even d_model the
        # slice keeps every entry.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        self.weight = nn.Parameter(pe, requires_grad=False)

    def forward(self, x):
        """Return the table rows for the first ``x.size(1)`` positions.

        Only the size of ``x`` along axis 1 is read; its values are ignored.
        """
        return self.weight[:, :x.size(1), :] # (1, Seq, Feature)

In [115]:
class WordPositionEmbedding(nn.Module):
    """Sum of a learned token embedding and a positional embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Feature size of both embeddings.
    """
    level = 1

    def __init__(self, vocab_size, d_model=512):
        super().__init__()
        self.word_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = PositionalEmbedding(d_model)

    def forward(self, x: torch.LongTensor, mask=None) -> torch.FloatTensor:
        """Embed the token ids in ``x`` and add the position table.

        ``mask`` is accepted but not used.
        """
        token_vectors = self.word_embedding(x)
        position_vectors = self.position_embedding(x)
        return token_vectors + position_vectors

In [119]:
# Embedding for a 1000-token vocabulary (default d_model of 512) and an
# encoder stack with its default configuration.
emb = WordPositionEmbedding(1000)
encoder = TransformerEncoder()

In [None]:
# Embed a random (5, 30) batch of token ids below 1000 and run it through the encoder stack.
encoder(emb(torch.randint(1000, (5, 30))))