In [2]:
from IPython.display import Image
import numpy as np
import torch
import torch.nn as nn
import math
import copy
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
print("Torch version:", torch.__version__)

Torch version: 1.9.0+cu102


# **Attention Is All You Need (Transformer)**

## **Core Idea of the Paper**

### ***Problem***
*   In sequence-to-sequence problems such as neural machine translation, the first proposals were based on RNNs arranged in an encoder-decoder architecture.
*    The best performing models also connect the encoder and decoder through an attention mechanism.
*   These architectures have a great limitation when working with long sequences
*   In the encoder, the hidden state at every step is associated with a certain word in the input sentence, usually one of the most recent. Therefore, if the decoder only accesses the last hidden state of the encoder, it will lose relevant information about the first elements of the sequence.

### **Solution**

*   This paper proposes a new, simple network architecture, the Transformer,
based solely on attention mechanisms to draw global dependencies between input and output.
*   Instead of paying attention to the last state of the encoder as is usually done with RNNs, in each step of the decoder we look at all the states of the encoder, being able to access information about all the elements of the input sequence.
*   The paper motivates the use of self-attention (instead of recurrent or convolutional layers) with three criteria:
    *   The total computational complexity per layer.
    *   The amount of computation that can be parallelized, as measured by the minimum number of sequential operations required.
    *   The path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks.












## ***Model Architecture***

<h4 align="center">The Transformer - model architecture.</h4>
<center>
<img src="https://drive.google.com/uc?id=1Q9u7Elc7bbk69cd4ToG2F199ZuweP1s1" width="400" height="500">
</center>



###   ***Input Embedding and Softmax***

With this layer, we convert the input tokens and output tokens to vectors of dimension $d_{\text{model}}$ using a learned embedding. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In the embedding layers, we multiply those weights by $\sqrt{d_{\text{model}}}$.                                                 

In [3]:
class Embeddings(nn.Module):
  """Learned token embedding whose output is scaled by sqrt(d_model)."""

  def __init__(self, d_model, vocab):
    super().__init__()
    # Lookup table with `vocab` rows, each of width `d_model`.
    self.lut = nn.Embedding(num_embeddings=vocab, embedding_dim=d_model)
    self.d_model = d_model

  def forward(self, x):
    """Look up the embeddings for the indices in `x` and scale them."""
    embedded = self.lut(x)
    return embedded * math.sqrt(self.d_model)

###   ***Positional Encoding***

In order for the model to make use of the
order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence.
In this work, we use sine and cosine functions of different frequencies:

$$PE_{(pos,2i)} = \sin(pos / 10000^{2i/d_{\text{model}}})$$

$$PE_{(pos,2i+1)} = \cos(pos / 10000^{2i/d_{\text{model}}})$$

where $pos$ is the position and $i$ is the dimension.  That is, each dimension of the positional encoding corresponds to a sinusoid.  The wavelengths form a geometric progression from $2\pi$ to $10000 \cdot 2\pi$.  We chose this function because we hypothesized **it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$**.

In [3]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    Args:
        d_model: width of the last dimension of the input (even or odd).
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))
        angles = position * div_term            # (max_len, ceil(d_model / 2))

        PE = torch.zeros(max_len, d_model)
        PE[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to the number of odd-indexed columns.
        PE[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        PE = PE.unsqueeze(0)                    # (1, max_len, d_model)
        # Buffer: saved and moved with the module but not a trainable parameter.
        self.register_buffer('PE', PE)

    def forward(self, x):
        """Return dropout(x + PE[:, :x.size(1)]).

        Raises:
            ValueError: if the sequence length x.size(1) exceeds max_len.
        """
        seq_len = x.size(1)
        if seq_len > self.PE.size(1):
            raise ValueError(
                "sequence length %d exceeds max_len %d of the positional encoding"
                % (seq_len, self.PE.size(1)))
        x = x + self.PE[:, :seq_len]
        return self.dropout(x)

###   ***Scaled Dot-Product Attention***
<br>
<center>
<img src="https://drive.google.com/uc?id=1s6mn-NtXM0ux0KaCmacE6QBBmnHOglgW" width="400" height="300">
</center>

An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix $Q$.   The keys and values are also packed together into matrices $K$ and $V$.  We compute the matrix of outputs as:                      
                                                                 
$$                                                                         
   \mathrm{Attention}(Q, K, V) = \mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V               
$$   

In [4]:
class ScaledDotProductAttention(nn.Module):
  ''' Scaled Dot-Product Attention: softmax(q k^T / sqrt(d_k)) v.

  Args:
      d_k: key/query width used for the 1/sqrt(d_k) scaling.
      dropout: dropout probability applied to the attention weights.
  '''

  def __init__(self, d_k, dropout=0.1):
    super().__init__()
    self.d_k = d_k
    self.dropout = nn.Dropout(dropout)

  def forward(self, q, k, v, mask=None):
    ''' Compute attention over the last two dimensions of q, k and v.

    Any leading dimensions (e.g. batch, or batch and head) are handled by
    matmul batching. Positions where `mask == 0` are excluded.
    Returns (output, attention weights after dropout).
    '''
    # Transposing the last two axes (instead of hard-coded axes 2 and 3)
    # makes both 3-D and 4-D inputs work.
    attn = torch.matmul(q / math.sqrt(self.d_k), k.transpose(-2, -1))

    if mask is not None:
        # A large negative score becomes a ~0 weight after the softmax.
        attn = attn.masked_fill(mask == 0, -1e9)

    attn = self.dropout(F.softmax(attn, dim=-1))
    output = torch.matmul(attn, v)

    return output, attn

###   ***Multi-Head Attention***
<br>
<center>
<img src="https://drive.google.com/uc?id=1uKAOwRZ6bL48WoAVar9ku06-sgjRCsi5" width="400" height="300">
</center>

Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.                                            
$$    
\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head_1}, ..., \mathrm{head_h})W^O    \\                                           
    \text{where}~\mathrm{head_i} = \mathrm{Attention}(QW^Q_i, KW^K_i, VW^V_i)                                
$$                                                                                                                 

Where the projections are parameter matrices $W^Q_i \in \mathbb{R}^{d_{\text{model}} \times d_k}$, $W^K_i \in \mathbb{R}^{d_{\text{model}} \times d_k}$, $W^V_i \in \mathbb{R}^{d_{\text{model}} \times d_v}$ and $W^O \in \mathbb{R}^{hd_v \times d_{\text{model}}}$.                                          

In [5]:
class MultiHeadAttention(nn.Module):
  ''' Multi-Head Attention module.

  Projects queries, keys and values into `n_head` heads, runs scaled
  dot-product attention per head, concatenates the heads and applies an
  output projection followed by dropout. The residual connection and layer
  normalization are NOT applied here; the caller is expected to do that.

  Args:
      n_head: number of attention heads.
      d_model: width of the inputs and of the output.
      d_k: per-head width of queries and keys.
      d_v: per-head width of values.
      dropout: dropout probability applied to the projected output.
  '''

  def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super().__init__()

    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_q = nn.Linear(d_model, n_head * d_k)
    self.w_k = nn.Linear(d_model, n_head * d_k)
    self.w_v = nn.Linear(d_model, n_head * d_v)
    self.FC = nn.Linear(n_head * d_v, d_model)

    self.attention = ScaledDotProductAttention(d_k)

    self.dropout = nn.Dropout(dropout)
    # Not used in forward(); kept so the module's attributes and parameter
    # set stay the same as before.
    self.layerNorm = nn.LayerNorm(d_model, eps=1e-6)


  def forward(self, Q, K, V, mask=None):
    ''' Return (output, attention weights).

    Q is b x lq x d_model; K and V are b x lk x d_model. `mask`, if given,
    gets a head axis inserted at dim 1 so it broadcasts over the heads.
    The output is b x lq x d_model.
    '''
    # Capture the sizes BEFORE the tensors are reshaped/transposed; reading
    # Q.size(1) after the transpose would return n_head instead of lq.
    batch_size, len_q = Q.size(0), Q.size(1)
    len_k, len_v = K.size(1), V.size(1)

    # Pass through the pre-attention projections and separate the heads:
    # b x l x (n*d) -> b x l x n x d
    Q = self.w_q(Q).view(batch_size, len_q, self.n_head, self.d_k)
    K = self.w_k(K).view(batch_size, len_k, self.n_head, self.d_k)
    V = self.w_v(V).view(batch_size, len_v, self.n_head, self.d_v)

    # Transpose for the attention dot product: b x n x l x d
    Q, K, V = Q.transpose(1, 2), K.transpose(1, 2), V.transpose(1, 2)

    if mask is not None:
        mask = mask.unsqueeze(1)   # For head axis broadcasting.

    context, attn = self.attention(Q, K, V, mask=mask)

    # Move the head dimension back (b x lq x n x dv) and merge the last two
    # dimensions to concatenate all heads: b x lq x (n*dv)
    context = context.transpose(1, 2).contiguous().view(batch_size, len_q, -1)
    output = self.dropout(self.FC(context))

    return output, attn

###   ***Position-wise Feed-Forward Networks***

In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically.  This consists of two linear transformations with a ReLU activation in between.

$$\mathrm{FFN}(x)=\max(0, xW_1 + b_1) W_2 + b_2$$                                       

In [6]:
class PositionwiseFeedForward(nn.Module):
  ''' Position-wise feed-forward block: Linear -> ReLU -> Linear -> Dropout. '''

  def __init__(self, d_model, d_hid, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    # Expand to the hidden width, then project back to the model width.
    self.W_1 = nn.Linear(in_features=d_model, out_features=d_hid)
    self.W_2 = nn.Linear(in_features=d_hid, out_features=d_model)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, x):
    ''' Apply the two linear maps with a ReLU in between, then dropout. '''
    hidden = self.W_1(x).relu()
    projected = self.W_2(hidden)
    return self.dropout(projected)

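###   ***Build our Encoder***

Since the core encoder is a stack of N=6 identical layers, we define an EncoderLayer class for a single layer (self-attention followed by a feed-forward network) and then stack copies of it in the Encoder class.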

In [7]:
class EncoderLayer(nn.Module):
  ''' One encoder layer composed of two sub-layers.

  The sub-layers are multi-head self-attention and a position-wise
  feed-forward network. Each is wrapped as LayerNorm(x + Sublayer(x)), with
  its own LayerNorm instance.

  Args:
      d_model: width of the layer input and output.
      d_hid: hidden width of the feed-forward network.
      n_head, d_k, d_v: multi-head attention configuration.
      dropout: dropout probability passed to both sub-layers.
  '''

  def __init__(self, d_model, d_hid, n_head, d_k, d_v, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.MHA = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.layerNorm = nn.LayerNorm(d_model, eps=1e-6)       # after self-attention
    self.FFN = PositionwiseFeedForward(d_model, d_hid, dropout=dropout)
    self.layerNormFFN = nn.LayerNorm(d_model, eps=1e-6)    # after feed-forward

  def forward(self, encoder_input, mask=None):
    ''' Return (encoder_output, self-attention weights). '''
    # Sub-layer 1: self-attention; the residual is added BEFORE normalizing.
    attn_output, attn = self.MHA(encoder_input, encoder_input, encoder_input, mask=mask)
    hidden = self.layerNorm(encoder_input + attn_output)

    # Sub-layer 2: position-wise feed-forward, same residual + norm pattern.
    encoder_output = self.layerNormFFN(hidden + self.FFN(hidden))
    return encoder_output, attn

In [None]:
class Encoder(nn.Module):
  """Core encoder: a stack of N deep copies of `layer` plus a final LayerNorm.

  Args:
      layer: module called as layer(x, mask); it may return a tensor or a
          tuple whose first element is the output tensor.
      N: number of copies of `layer` to stack.
      d_model: normalized shape of the final LayerNorm. When None it is
          taken from the first nn.LayerNorm found inside `layer`.

  Raises:
      ValueError: if d_model is None and `layer` contains no nn.LayerNorm.
  """
  def __init__(self, layer, N, d_model=None):
    super(Encoder, self).__init__()
    self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(N)])
    if d_model is None:
        d_model = self._infer_norm_shape(layer)
    self.layerNorm = nn.LayerNorm(d_model, eps=1e-6)

  @staticmethod
  def _infer_norm_shape(layer):
    "Return the normalized_shape of the first nn.LayerNorm inside `layer`."
    for module in layer.modules():
        if isinstance(module, nn.LayerNorm):
            return module.normalized_shape
    raise ValueError(
        "cannot infer d_model: pass d_model explicitly or use a layer "
        "that contains an nn.LayerNorm")

  def forward(self, x, mask):
    "Pass the input (and mask) through each layer in turn."
    for layer in self.layers:
        out = layer(x, mask)
        # Layers may return (output, attention); only the output is chained.
        x = out[0] if isinstance(out, tuple) else out
    return self.layerNorm(x)

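###   ***Build our Decoder***

Since the core decoder is a stack of N=6 identical layers, we define a DecoderLayer class for a single layer (masked self-attention, encoder-decoder attention and a feed-forward network) and then stack copies of it in the Decoder class.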

In [None]:
class DecoderLayer(nn.Module):
  ''' One decoder layer composed of three sub-layers.

  The sub-layers are masked multi-head self-attention, multi-head attention
  over the encoder output, and a position-wise feed-forward network. Each is
  wrapped as LayerNorm(x + Sublayer(x)), with its own LayerNorm instance.

  Args:
      d_model: width of the layer input and output.
      d_hid: hidden width of the feed-forward network.
      n_head, d_k, d_v: multi-head attention configuration.
      dropout: dropout probability passed to all sub-layers.
  '''

  def __init__(self, d_model, d_hid, n_head, d_k, d_v, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.MMHA = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.MHA = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.layerNorm = nn.LayerNorm(d_model, eps=1e-6)        # after masked self-attention
    self.FFN = PositionwiseFeedForward(d_model, d_hid, dropout=dropout)
    self.layerNormCross = nn.LayerNorm(d_model, eps=1e-6)   # after encoder-decoder attention
    self.layerNormFFN = nn.LayerNorm(d_model, eps=1e-6)     # after feed-forward

  def forward(self, decoder_input, encoder_output, self_attn_mask=None, dec_enc_attn_mask=None):
    ''' Return (decoder_output, self-attention weights, encoder-decoder attention weights). '''
    # Sub-layer 1: masked self-attention over the decoder input.
    self_attn_output, self_attn = self.MMHA(
        decoder_input, decoder_input, decoder_input, mask=self_attn_mask)
    hidden = self.layerNorm(decoder_input + self_attn_output)

    # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
    cross_attn_output, cross_attn = self.MHA(
        hidden, encoder_output, encoder_output, mask=dec_enc_attn_mask)
    hidden = self.layerNormCross(hidden + cross_attn_output)

    # Sub-layer 3: position-wise feed-forward.
    decoder_output = self.layerNormFFN(hidden + self.FFN(hidden))

    return decoder_output, self_attn, cross_attn

In [None]:
class Decoder(nn.Module):
    """Generic N layer decoder with masking.

    Args:
        layer: module called as layer(x, memory, self_attn_mask, dec_enc_attn_mask);
            it may return a tensor or a tuple whose first element is the
            output tensor.
        N: number of deep copies of `layer` to stack.
    """
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(N)])

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run `x` through every layer, attending to `memory`; return the final tensor."""
        for layer in self.layers:
            # The layer takes the self-attention (target) mask first and the
            # encoder-decoder (source) mask second.
            out = layer(x, memory, tgt_mask, src_mask)
            # Layers may return (output, attn, ...); only the output is chained.
            x = out[0] if isinstance(out, tuple) else out
        return x

### **Transformer**

In [None]:
def get_pad_mask(seq, pad_idx):
  ''' Boolean mask that is True where `seq` differs from `pad_idx`, with an extra axis inserted before the last one. '''
  not_pad = seq.ne(pad_idx)
  return not_pad.unsqueeze(-2)


def get_subsequent_mask(seq):
  ''' For masking out the subsequent info.

  Returns a boolean tensor of shape (1, len_s, len_s), where len_s is
  seq.size(1), that is True on and below the diagonal. Position i may
  therefore attend to positions <= i only. The mask is created on
  seq.device.
  '''
  len_s = seq.size(1)
  subsequent_mask = torch.tril(
      torch.ones((1, len_s, len_s), device=seq.device)).bool()
  return subsequent_mask

In [4]:
class Transformer(nn.Module):
    ''' A sequence to sequence model with attention mechanism.

    Wires together the components defined in this notebook: token embeddings
    plus positional encodings, the encoder stack, the decoder stack and a
    final linear projection to target-vocabulary logits.

    Args:
        n_src_vocab, n_trg_vocab: source / target vocabulary sizes.
        src_pad_idx, trg_pad_idx: padding token ids used to build the masks.
        d_word_vec: embedding width; must equal d_model because the
            embeddings are fed directly to the encoder / decoder layers.
        d_model: model width.
        d_inner: hidden width of the position-wise feed-forward networks.
        n_layers: number of encoder layers and of decoder layers.
        n_head, d_k, d_v: multi-head attention configuration.
        dropout: dropout probability used throughout.
        n_position: maximum sequence length for the positional encodings.
        scale_prj: if True, multiply the output logits by d_model ** -0.5.

    Raises:
        ValueError: if d_word_vec != d_model.
    '''

    def __init__(
            self, n_src_vocab, n_trg_vocab, src_pad_idx, trg_pad_idx,
            d_word_vec=512, d_model=512, d_inner=2048,
            n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1, n_position=200,
            scale_prj=False):

        super().__init__()

        if d_word_vec != d_model:
            raise ValueError(
                "d_word_vec (%d) must equal d_model (%d)" % (d_word_vec, d_model))

        self.src_pad_idx, self.trg_pad_idx = src_pad_idx, trg_pad_idx
        self.d_model = d_model
        self.scale_prj = scale_prj

        # Token embedding (already scaled by sqrt(d_model)) + positional encoding.
        self.src_embed = nn.Sequential(
            Embeddings(d_model, n_src_vocab),
            PositionalEncoding(d_model, dropout, max_len=n_position))
        self.trg_embed = nn.Sequential(
            Embeddings(d_model, n_trg_vocab),
            PositionalEncoding(d_model, dropout, max_len=n_position))

        # Encoder / Decoder deep-copy the given layer n_layers times.
        self.encoder = Encoder(
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout),
            n_layers)

        self.decoder = Decoder(
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout),
            n_layers)

        self.trg_word_prj = nn.Linear(d_model, n_trg_vocab, bias=False)

    def forward(self, src_seq, trg_seq):
        ''' Return logits of shape (batch * trg_len, n_trg_vocab).

        src_seq and trg_seq are tensors of token ids, (batch, src_len) and
        (batch, trg_len) respectively.
        '''
        # Padding mask for the source; padding + causal mask for the target.
        src_mask = get_pad_mask(src_seq, self.src_pad_idx)
        trg_mask = get_pad_mask(trg_seq, self.trg_pad_idx) & get_subsequent_mask(trg_seq)

        # Encoder.forward returns a single tensor; it must not be unpacked
        # (unpacking a tensor iterates over the batch dimension).
        enc_output = self.encoder(self.src_embed(src_seq), src_mask)
        # Decoder.forward signature is (x, memory, src_mask, tgt_mask).
        dec_output = self.decoder(self.trg_embed(trg_seq), enc_output, src_mask, trg_mask)

        seq_logit = self.trg_word_prj(dec_output)
        if self.scale_prj:
            seq_logit = seq_logit * self.d_model ** -0.5

        return seq_logit.view(-1, seq_logit.size(2))

##   ***Train model***



# **> Todo**




## ***Evaluation***


## > **Todo**

