## 6 - Attention is All You Need
[[Paper](https://arxiv.org/abs/1706.03762)] [[Tutorial](https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb)]

This is the basic and vanilla Transformer paper.

Differences between this notebook and the paper:
- Learned positional encoding compared with a static one.
- Standard Adam optimizer with static learning rate.
- No label smoothing.

These changes closely follow BERT's setup & a majority of other Transformer variants.


In [16]:
# Relative directory path; presumably where model files are saved (it is not
# used in this cell) - TODO confirm against the training code.
sandbox_path = "models/"

# Install torchtext 0.6.0 in place of the environment's default version
# (IPython shell command, only valid inside a notebook).
!pip install torchtext==0.6.0

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# CUDA) with the same value so that runs are repeatable.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Download the spaCy English and German models (IPython shell commands).
!python -m spacy download en
!python -m spacy download de

# Load both pipelines through their shortcut names; their tokenizers are used
# by tokenize_en / tokenize_de.
spacy_en = spacy.load('en')
spacy_de = spacy.load('de')
print(spacy_en)

def tokenize_de(text):
  """Split a German string into a list of token strings using spaCy's tokenizer."""
  # The token order is kept as-is (the source sentence is not reversed).
  doc = spacy_de.tokenizer(text)
  return [token.text for token in doc]

def tokenize_en(text):
  """Split an English string into a list of token strings using spaCy's tokenizer."""
  doc = spacy_en.tokenizer(text)
  return [token.text for token in doc]


# Sanity check of the English tokenizer: calling it on a string yields a
# spaCy Doc, which prints back as the original text.
foo = spacy_en.tokenizer("Hello. how do you do good, sir!")
print(type(foo))
print(foo)
# Bare expression: builds the list of each token's text type, but the result
# is neither stored nor printed.
[type(t.text) for t in foo]

# Source (German) and target (English) fields. Both lowercase the text, wrap
# every sentence in <sos> ... <eos>, and produce batch-first tensors.
SRC = Field(tokenize = tokenize_de, init_token='<sos>',
            eos_token='<eos>', lower=True, batch_first = True)
TRG = Field(tokenize = tokenize_en, init_token='<sos>',
            eos_token='<eos>', lower=True, batch_first = True)

# Multi30k German -> English splits; the '.de' files go through SRC and the
# '.en' files through TRG.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                    fields = (SRC, TRG))
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")
print(vars(train_data.examples[0]))

# Vocabularies are built from the training split only; min_freq=2 is passed so
# that tokens seen fewer than two times are left out.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
print(f"Unique token in SRC (de) vocab: {len(SRC.vocab)}")
# This line reports the English target vocabulary, so it is labelled TRG.
print(f"Unique token in TRG (en) vocab: {len(TRG.vocab)}")

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

# One BucketIterator per split, each created with batch_size=BATCH_SIZE and
# device=device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/en_core_web_sm
-->
/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/de_core_news_sm
-->
/usr/local/google/home/ammarh/anaconda3/lib/python3.7/site-packages/spacy/data/de
You can now load the model via spacy.load('de')
<spacy.lang.en.English object at 0x7f7771244590>
<class 'spacy.tokens.doc.Doc'>
Hello. how do you do good, sir!
Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im

### Encoder

Similar to the ConvSeq2Seq model, the Transformer's encoder does not attempt to compress the entire source sentence. Instead it produces a sequence of context vectors, one for each input source token. They are called context vectors instead of hidden states because a hidden state at time `t` has only seen tokens preceding it, while a context vector takes into account all source tokens.

The source mask, `src_mask`, is simply the same shape as the source sentence but has a value of 1 when the token in the source sentence is not a `<pad>` token and 0 when it is a `<pad>` token. This is used in the encoder layers to mask the multi-headed attention mechanisms, which are used to calculate and apply attention over the source sentence so the model does not pay attention to `<pad>` tokens.

In [17]:
class Encoder(nn.Module):
  """Transformer encoder: embeds the source tokens and runs them through a stack of EncoderLayers.

  Args:
    input_dim: number of rows of the token embedding (source vocabulary size).
    hid_dim: embedding size, which is also the size of every layer's output.
    n_layers: number of EncoderLayer blocks in the stack.
    n_heads: forwarded to each EncoderLayer.
    pf_dim: forwarded to each EncoderLayer.
    dropout_p: dropout probability for the summed embeddings; also forwarded
      to each EncoderLayer.
    device: device that holds the scale constant and the position indices.
    max_length: number of rows of the learned positional embedding.
  """

  def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout_p,
               device, max_length = 100):
    super().__init__()

    self.device = device

    # Learned embeddings for token ids and for absolute positions.
    self.tok_embedding = nn.Embedding(input_dim, hid_dim)
    self.pos_embedding = nn.Embedding(max_length, hid_dim)

    stack = [EncoderLayer(hid_dim, n_heads, pf_dim, dropout_p, device)
             for _ in range(n_layers)]
    self.layers = nn.ModuleList(stack)
    self.dropout = nn.Dropout(dropout_p)

    # sqrt(hid_dim): multiplier applied to the token embeddings in forward().
    self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

  def forward(self, src, src_mask):
    """Encode a batch of source token ids.

    Args:
      src: token ids, [batch_sz, src_len].
      src_mask: handed unchanged to every layer.

    Returns:
      Tensor [batch_sz, src_len, hid_dim] with one vector per source token.
    """
    n_sentences = src.shape[0]
    n_tokens = src.shape[1]

    # Position ids 0..src_len-1, one row per sentence: [batch_sz, src_len]
    positions = torch.arange(0, n_tokens).unsqueeze(0)
    positions = positions.repeat(n_sentences, 1).to(self.device)

    # Scaled token embeddings plus positional embeddings, then dropout.
    scaled_tokens = self.tok_embedding(src) * self.scale
    hidden = self.dropout(scaled_tokens + self.pos_embedding(positions))  # [batch_sz, src_len, hid_dim]

    for encoder_layer in self.layers:
      hidden = encoder_layer(hidden, src_mask)  # [batch_sz, src_len, hid_dim]

    return hidden


### Encoder Layer

This is where all of the "meat" is. 
- Pass the source sentence and its mask into the multi-headed attention layer
- Perform dropout
- Apply a residual connection
- Pass it through a Layer Normalization layer
- Pass it through a position-wise feedforward layer
- Apply dropout
- Residual connection
- Layer Normalization
This is the output of the layer that is then fed into the next layer as input. Parameters are not shared between layers.

The gist of layer normalization is that it normalizes the values of the features, i.e. across the hidden dimension, to a mean of 0 and a standard deviation of 1. This makes large neural networks easier to train.

In [18]:
class EncoderLayer(nn.Module):
  """One encoder block: self-attention and a position-wise feedforward sub-layer.

  Each sub-layer's output goes through dropout, is added to the sub-layer's
  input (residual connection) and is then layer-normalised.

  Args:
    hid_dim: size of the last dimension of the input and output.
    n_heads: number of attention heads.
    pf_dim: inner size of the position-wise feedforward sub-layer.
    dropout_p: dropout probability, used here and forwarded to both sub-layers.
    device: forwarded to the attention sub-layer.
  """

  def __init__(self, hid_dim, n_heads, pf_dim,
               dropout_p, device):
    super().__init__()

    self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.ff_layer_norm = nn.LayerNorm(hid_dim)
    self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads,
                                                  dropout_p, device)
    self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim,
                                                                 dropout_p)

    self.dropout = nn.Dropout(dropout_p)

  def forward(self, src, src_mask):
    """Run one encoder block.

    Args:
      src: [batch_sz, src_len, hid_dim].
      src_mask: passed as the mask of the self-attention call.

    Returns:
      Tensor [batch_sz, src_len, hid_dim].
    """
    # Self-attention: queries, keys and values are all the source itself.
    attended, _ = self.self_attention(src, src, src, src_mask)
    after_attention = self.self_attn_layer_norm(src + self.dropout(attended))  # [batch_sz, src_len, hid_dim]

    # Position-wise feedforward with the same dropout / residual / norm pattern.
    fed_forward = self.positionwise_feedforward(after_attention)
    return self.ff_layer_norm(after_attention + self.dropout(fed_forward))  # [batch_sz, src_len, hid_dim]



### Multi Head Attention Layer

This is one of the key, novel concepts introduced by the Transformer paper.

Attention can be thought of as queries, keys and values - where the query is used with the key to get an attention vector (usually output of a softmax operation where all values are between 0 & 1 and sum to 1), which is then used to get a weighted sum of values.

Transformer uses scaled dot-product attention:
$$ \text{Attention}(Q, K, V) = \text{Softmax} \big( \frac{QK^T}{\sqrt{d_k}} \big)V $$

This is similar to standard dot-product attention but is scaled by $\sqrt{d_k}$, which according to the paper stops the results of the dot products from growing large, which would cause the gradients to become too small.

Instead of doing a single attention application, the queries, keys & values have their hid_dim split into $h$ heads and the scaled dot-product attention is calculated over all heads in parallel. This means instead of paying attention to one concept per attention application, we pay attention to $h$.

Steps:
- Project Q, K & V through a linear layer each `fc_q`, `fc_k`, `fc_v`. These layers map hid_dim to hid_dim. 
- Split hid_dim into n_heads using `.view()`. Each head gets `head_dim = hid_dim // n_heads` sized query, key & value vectors.
- Calculate the energy (unnormalized attention) by multiplying Q & K and scaling it by the square root of head_dim.
- Mask the energy over the pad tokens, apply softmax and dropout.
- Apply the attention to the value vectors.
- Combine them back together (using `view()`) to create a `hid_dim` size vector.
- Project through `fc_o` linear layer that maps hid_dim to hid_dim.

One strange thing is that dropout is applied straight to the attention. This means that the attention vector will most likely not sum to 1, and we may want to pay full attention to a token but the attention over that token may get set to 0 by dropout. This is never explained but is used in almost all Transformer variants (including BERT).


In [19]:
class MultiHeadAttentionLayer(nn.Module):
  """Multi-head scaled dot-product attention.

  Queries, keys and values are each projected by a linear layer, split into
  n_heads heads of size head_dim, attended per head, merged back to hid_dim
  and projected once more.

  Args:
    hid_dim: size of the last dimension of the inputs and of the output;
      must be divisible by n_heads.
    n_heads: number of attention heads.
    dropout_p: dropout probability applied to the attention weights.
    device: device that holds the scale constant.
  """

  def __init__(self, hid_dim, n_heads, dropout_p, device):
    super().__init__()

    # Every head gets an equal slice of the hidden dimension.
    assert hid_dim % n_heads == 0

    self.hid_dim = hid_dim
    self.n_heads = n_heads
    self.head_dim = hid_dim // n_heads

    # Input projections for queries, keys and values, then the output projection.
    self.fc_q = nn.Linear(hid_dim, hid_dim)
    self.fc_k = nn.Linear(hid_dim, hid_dim)
    self.fc_v = nn.Linear(hid_dim, hid_dim)

    self.fc_o = nn.Linear(hid_dim, hid_dim)

    self.dropout = nn.Dropout(dropout_p)

    # sqrt(head_dim): divisor of the raw query-key dot products.
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

  def _split_heads(self, projected, batch_sz):
    """Reshape [batch_sz, len, hid_dim] into [batch_sz, n_heads, len, head_dim]."""
    per_head = projected.view(batch_sz, -1, self.n_heads, self.head_dim)
    return per_head.permute(0, 2, 1, 3)

  def forward(self, query, key, value, mask = None):
    """Attend from `query` over `key` / `value`.

    Args:
      query: [batch_sz, q_len, hid_dim].
      key: [batch_sz, k_len, hid_dim].
      value: [batch_sz, v_len, hid_dim]; v_len has to equal k_len for the
        weighted sum to work.
      mask: optional; wherever `mask == 0` the energy is replaced by -1e10
        before the softmax. It has to broadcast against the energy tensor
        [batch_sz, n_heads, q_len, k_len].

    Returns:
      Tuple (x, attention): x is [batch_sz, q_len, hid_dim]; attention is the
      softmax output [batch_sz, n_heads, q_len, k_len], taken before dropout.
    """
    batch_sz = query.shape[0]

    # Project, then split into heads: [batch_sz, n_heads, len, head_dim]
    Q = self._split_heads(self.fc_q(query), batch_sz)
    K = self._split_heads(self.fc_k(key), batch_sz)
    V = self._split_heads(self.fc_v(value), batch_sz)

    # Scaled dot products between every query and every key.
    scores = torch.matmul(Q, K.permute(0, 1, 3, 2))
    energy = scores / self.scale  # [batch_sz, n_heads, q_len, k_len]

    if mask is not None:
      # A very negative energy turns into ~0 weight after the softmax.
      energy = energy.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(energy, dim = -1)  # [batch_sz, n_heads, q_len, k_len]

    # Dropout acts on the attention weights themselves, so after dropout a row
    # no longer has to sum to 1.
    weighted = torch.matmul(self.dropout(attention), V)  # [batch_sz, n_heads, q_len, head_dim]

    # Merge the heads back into a single hid_dim-sized vector per position.
    merged = weighted.permute(0, 2, 1, 3).contiguous()  # [batch_sz, q_len, n_heads, head_dim]
    merged = merged.view(batch_sz, -1, self.hid_dim)  # [batch_sz, q_len, hid_dim]

    return self.fc_o(merged), attention

### Position-wise Feedforward Layer

This is a simple block. It simply takes the output of multi-head attention with size `hid_dim` (512) projects it to a much higher dimension of `pf_dim` (2048), applies ReLU and then projects it back down to `hid_dim` (512).

It is unclear/unexplained why this layer is needed. BERT uses the GELU activation instead of ReLU.

In [20]:
class PositionwiseFeedforwardLayer(nn.Module):
  """Two-layer feedforward network applied along the last (hidden) dimension.

  Args:
    hid_dim: size of the input and of the output.
    pf_dim: size of the intermediate representation.
    dropout_p: dropout probability applied after the ReLU.
  """

  def __init__(self, hid_dim, pf_dim, dropout_p):
    super().__init__()

    # hid_dim -> pf_dim -> hid_dim
    self.fc_1 = nn.Linear(hid_dim, pf_dim)
    self.fc_2 = nn.Linear(pf_dim, hid_dim)

    self.dropout = nn.Dropout(dropout_p)

  def forward(self, x):
    """Map x of shape [batch_sz, sent_len, hid_dim] to a tensor of the same shape."""
    expanded = torch.relu(self.fc_1(x))  # [batch_sz, sent_len, pf_dim]
    expanded = self.dropout(expanded)
    return self.fc_2(expanded)  # [batch_sz, sent_len, hid_dim]



## Decoder

The decoder is similar to the encoder; however, it now has an additional multi-head attention layer which uses the decoder representation as the query and the encoder representation as the key and value.

The decoder representation after the Nth layer is passed through a linear layer `fc_out`. In PyTorch the softmax operation is contained within our loss function, so we do not explicitly pass it through a softmax at the end.

In [21]:
class Decoder(nn.Module):
  """Transformer decoder: embeds the target tokens, runs them through a stack
  of DecoderLayers and projects the result onto the output vocabulary.

  Args:
    output_dim: number of rows of the token embedding and size of the final
      projection (target vocabulary size).
    hid_dim: embedding size, which is also the size of every layer's output.
    n_layers: number of DecoderLayer blocks in the stack.
    n_heads: forwarded to each DecoderLayer.
    pf_dim: forwarded to each DecoderLayer.
    dropout_p: dropout probability for the summed embeddings; also forwarded
      to each DecoderLayer.
    device: device that holds the scale constant and the position indices.
    max_length: number of rows of the learned positional embedding.
  """

  def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim,
               dropout_p, device, max_length = 100):
    super().__init__()

    self.device = device

    # These attribute names are the ones forward() reads.
    self.tok_embedding = nn.Embedding(output_dim, hid_dim)
    self.pos_embedding = nn.Embedding(max_length, hid_dim)

    self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim,
                                              dropout_p, device)
                                for _ in range(n_layers)])
    self.fc_out = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout_p)
    # sqrt(hid_dim): multiplier applied to the token embeddings in forward().
    self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Decode a batch of target token ids against the encoded source.

    Args:
      trg: target token ids, [batch_sz, trg_len].
      enc_src: encoder output, [batch_sz, src_len, hid_dim].
      trg_mask: handed unchanged to every layer.
      src_mask: handed unchanged to every layer.

    Returns:
      Tuple (output, attention): output is [batch_sz, trg_len, output_dim]
      (no softmax is applied here); attention is whatever the last layer
      returned, [batch_sz, n_heads, trg_len, src_len], or None when the
      decoder has no layers.
    """
    batch_sz = trg.shape[0]
    trg_len = trg.shape[1]

    # Position ids 0..trg_len-1, one row per sentence: [batch_sz, trg_len]
    pos = torch.arange(0, trg_len).unsqueeze(0).\
                    repeat(batch_sz, 1).to(self.device)

    # Scaled token embeddings plus positional embeddings, then dropout.
    trg = self.dropout((self.tok_embedding(trg) * self.scale) +
                        self.pos_embedding(pos)) # [batch_sz, trg_len, hid_dim]

    # Keeps the return statement valid when n_layers == 0.
    attention = None
    for layer in self.layers:
      trg, attention = layer(trg, enc_src, trg_mask, src_mask)
      # trg: [batch_sz, trg_len, hid_dim]
      # attention: [batch_sz, n_heads, trg_len, src_len]

    output = self.fc_out(trg) # [batch_sz, trg_len, output_dim]

    return output, attention

### Decoder Layer

In [22]:
class DecoderLayer(nn.Module):
  """One decoder block: self-attention over the target, attention over the
  encoded source, and a position-wise feedforward sub-layer.

  Each sub-layer's output goes through dropout, is added to the sub-layer's
  input (residual connection) and is then layer-normalised.

  Args:
    hid_dim: size of the last dimension of the input and output.
    n_heads: number of attention heads of both attention sub-layers.
    pf_dim: inner size of the position-wise feedforward sub-layer.
    dropout_p: dropout probability, used here and forwarded to the sub-layers.
    device: forwarded to both attention sub-layers.
  """

  def __init__(self, hid_dim, n_heads, pf_dim, dropout_p, device):
    super().__init__()

    self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.src_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.ff_layer_norm = nn.LayerNorm(hid_dim)

    self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads,
                                                  dropout_p, device)
    self.src_attention = MultiHeadAttentionLayer(hid_dim, n_heads,
                                                  dropout_p, device)
    self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim,
                                                                 dropout_p)
    self.dropout = nn.Dropout(dropout_p)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Run one decoder block.

    Args:
      trg: [batch_sz, trg_len, hid_dim].
      enc_src: encoder output, [batch_sz, src_len, hid_dim].
      trg_mask: mask for the self-attention over the target.
      src_mask: mask for the attention over the encoded source.

    Returns:
      Tuple (trg, attention): trg is [batch_sz, trg_len, hid_dim]; attention
      is the source-attention weights, [batch_sz, n_heads, trg_len, src_len].
    """
    # self attention
    _trg, _ = self.self_attention(trg, trg, trg, trg_mask) # [batch_sz, trg_len, hid_dim]

    # dropout, residual connection and layer norm
    trg = self.self_attn_layer_norm(trg + self.dropout(_trg))

    # Attention over the src sentence. The keys and values are the encoder
    # output, so the mask has to be the source mask, not the target mask.
    _trg, attention = self.src_attention(trg, enc_src, enc_src, src_mask)
    # _trg: [batch_sz, trg_len, hid_dim]
    # attention: [batch_sz, n_heads, trg_len, src_len]

    trg = self.src_attn_layer_norm(trg + self.dropout(_trg))

    # positionwise feedforward
    _trg = self.positionwise_feedforward(trg)

    trg = self.ff_layer_norm(trg + self.dropout(_trg))

    return trg, attention

