# Transformers 101

This notebook serves as an exploration of the transformer architecture (Vaswani et. al.) Here, we'll implement in native PyTorch the basic building blocks of the transformer and then put them all together so we have a model architecture to put into `../models`

In the process of putting this together (much like my other exploratory projects) I tried to limit viewing existing code online, and primarily used my notes (pdf attached for anyone interested) as a foundation for this work.

In [28]:
import torch
import math
import torch.nn as nn

We want something with output dims: (sequence_length, output_dim)

In [29]:
def positional_embedding(input_tensor: torch.Tensor, output_dim: int, n=10000): 
    """
    Here, we implement the naive approach from the original 
    paper with the sin and cosine functions. 
    """
    P = torch.zeros((input_tensor.shape[-1], output_dim))
    indices = torch.arange(input_tensor.size(-1))
    i_values = torch.arange(int(output_dim/2))
    denominators = torch.float_power(n, 2*i_values/output_dim)
    P[:, 0::2] = torch.sin(indices.unsqueeze(1) / denominators.unsqueeze(0)) # start at 0, step by 2 sin for even nums
    P[:, 1::2] = torch.cos(indices.unsqueeze(1) / denominators.unsqueeze(0)) # start at 1, step by 2 cos for odd nums
    return P


In [30]:
a = torch.rand((2, 5))
output_dims = 3
positional_embedding(a, output_dims)

tensor([[ 0.0000,  1.0000,  0.0000],
        [ 0.8415,  0.5403,  0.8415],
        [ 0.9093, -0.4161,  0.9093],
        [ 0.1411, -0.9900,  0.1411],
        [-0.7568, -0.6536, -0.7568]])

In [31]:
def attention(x): 
    """
    Simple dot product based attention
    """
    query_layer, key_layer, value_layer = nn.Linear(x.shape[-1], x.shape[-1]), nn.Linear(x.shape[-1], x.shape[-1]), nn.Linear(x.shape[-1], x.shape[-1])
    query, key, value = query_layer(x), key_layer(x), value_layer(x)
    attention_weights  = torch.nn.Softmax(-1)(torch.tensordot(query, key, dims=1))
    return torch.sum(value * attention_weights)

In [32]:
x = torch.rand(1, 12)
attention(x)

tensor(0.0080, grad_fn=<SumBackward0>)

In [33]:
def add_norm(residual: torch.Tensor, hidden: torch.Tensor): 
    if residual.shape != hidden.shape: 
        raise ValueError("Shapes mismatch")
    else: 
        output = residual + hidden # element wise addition
        layer_norm = nn.LayerNorm([residual.shape[-2], residual.shape[-1]])
        return layer_norm(output)

In [34]:
# usage example: 

tensor_a = torch.rand([1, 5, 6]) # batch size, sequence length, embedding dimensions
tensor_b = torch.rand([1, 5, 6])
print(tensor_a)
print(tensor_b)
print(f"Final: {add_norm(tensor_a, tensor_b)}")

tensor([[[0.1045, 0.4312, 0.0299, 0.8858, 0.9968, 0.1152],
         [0.8752, 0.2635, 0.2181, 0.6305, 0.3234, 0.0159],
         [0.5359, 0.7384, 0.2475, 0.3941, 0.5815, 0.7737],
         [0.0514, 0.8584, 0.9550, 0.1863, 0.0683, 0.8739],
         [0.0726, 0.5960, 0.9250, 0.5273, 0.2494, 0.8675]]])
tensor([[[0.9378, 0.8276, 0.3353, 0.4124, 0.3723, 0.7553],
         [0.1377, 0.5286, 0.3779, 0.4345, 0.5243, 0.9111],
         [0.5583, 0.2620, 0.9314, 0.6691, 0.2225, 0.4130],
         [0.9031, 0.4765, 0.7315, 0.8451, 0.5465, 0.3815],
         [0.9565, 0.4849, 0.0307, 0.1105, 0.7993, 0.4004]]])
Final: tensor([[[ 0.0758,  0.9001, -2.5009,  1.0501,  1.3197, -0.5781],
         [-0.0360, -0.8759, -1.6224,  0.1626, -0.6648, -0.3628],
         [ 0.2734, -0.0834,  0.5958,  0.1556, -0.8310,  0.6256],
         [-0.2583,  1.1896,  2.5278,  0.0344, -1.5507,  0.8871],
         [ 0.0259,  0.2226, -0.2535, -1.4633,  0.1003,  0.9343]]],
       grad_fn=<NativeLayerNormBackward0>)


In [42]:
def scaled_dot_product_attention(q, k, d_k):
    # in order to align the dimensions for the dot product, we transpose k along the last two dimensions like this
    return torch.nn.Softmax(-1)(torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k))

d_k and d_v are essentially hyperparameters that are fixed before training. This allows for the query and keys to have the same dimensionality, and for all 3 of them to have consistent dimensionality. In many transformer implementations, d_k and d_v are set to be the same for simplicity but this is not always the case.

In [43]:
def multihead_attention(k, q, v, d_k, d_v, d_model, num_heads):
    """
    Scaled Dot product based multi-head attention
    """
    
    query_layer, key_layer, value_layer = nn.Linear(d_model, num_heads* d_k), nn.Linear(d_model, num_heads* d_k), nn.Linear(d_model, num_heads*d_v)
    k_len, q_len, v_len, batch_size = k.size(1), q.size(1), v.size(1),  q.size(0)
    print(key_layer(k).shape)
    print(query_layer(q).shape)
    print(value_layer(v).shape)
    k, q, v = key_layer(k).view(batch_size, k_len,  num_heads, d_k), query_layer(q).view(batch_size, q_len,  num_heads, d_k), value_layer(v).view(batch_size, v_len,  num_heads, d_v)
    
    residual = q
    
    q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
    attention = scaled_dot_product_attention(q, k, d_k)
    output = torch.matmul(attention, v)

    output = output.transpose(1, 2).contiguous().view(batch_size, q_len, -1)
    concatenated_projection = nn.Linear(num_heads * d_v, d_model, bias=False)

    output = concatenated_projection(output)
    print(r)
    output += residual

    layer_norm = nn.LayerNorm([residual.shape[-2], residual.shape[-1]])
    output = layer_norm(output)

    return output, attention

In [44]:
d_model = 512

# from the paper: To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension d model 
k, q, v = torch.rand((1, 2, d_model)), torch.rand((1, 2, d_model)), torch.rand((1, 2, d_model))
d_k, d_v = 5, 5
num_heads = 4

multihead_attention(k, q, v, d_k, d_v, d_model, num_heads)

torch.Size([1, 2, 20])
torch.Size([1, 2, 20])
torch.Size([1, 2, 20])
hey
torch.Size([1, 4, 2, 5])
torch.Size([1, 4, 2, 5])
torch.Size([1, 4, 2, 5])
torch.Size([1, 4, 2, 2])


RuntimeError: The size of tensor a (512) must match the size of tensor b (5) at non-singleton dimension 3