<a href="https://colab.research.google.com/github/bhadreshpsavani/NLP-Notes/blob/master/TransformersTutorialPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time
import sys

In [6]:
# Pin every random number generator the notebook touches to a single seed
# so that repeated runs start from the same state.
SEED = 1234

random.seed(SEED)               # Python's built-in RNG
np.random.seed(SEED)            # NumPy's global RNG
torch.manual_seed(SEED)         # PyTorch RNG
torch.cuda.manual_seed(SEED)    # PyTorch RNG for the current CUDA device
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN kernels

In [7]:
# Download the spaCy English and German models through the 'en' / 'de'
# shortcut names, invoking spaCy with the same Python interpreter that runs
# this kernel (sys.executable). The later spacy.load('en') / spacy.load('de')
# calls rely on these shortcut names.
!{sys.executable} -m spacy download en
!{sys.executable} -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 3.6MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=b2a73304954c4e0dd2450849c823aa2a099a26d8ed0bfb1d040256f223c2dce6
  Stored in directory: /tmp/pip-ephem-wheel-cache-1byapm6q/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [8]:
# Load the two spaCy pipelines by the shortcut names used in the download cell.
# Only their tokenizers are used further down (see tokenize_en / tokenize_de).
spacy_en = spacy.load('en')
spacy_de = spacy.load('de')

In [13]:
def tokenize_de(text):
  """Split a German string into a list of token strings using spaCy's tokenizer."""
  german_tokens = spacy_de.tokenizer(text)
  return [token.text for token in german_tokens]

def tokenize_en(text):
  """Split an English string into a list of token strings using spaCy's tokenizer."""
  english_tokens = spacy_en.tokenizer(text)
  return [token.text for token in english_tokens]

In [15]:
# Source (German) field: spaCy tokenization, lower-casing, <sos>/<eos> markers
# around every sentence, and batch dimension first.
SRC = Field(
    tokenize=tokenize_de,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    batch_first=True,
)

# Target (English) field: same settings, English tokenizer.
TRG = Field(
    tokenize=tokenize_en,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    batch_first=True,
)

In [16]:
# Load the Multi30k train / validation / test splits as (German, English) pairs:
# '.de' files are processed by SRC and '.en' files by TRG.
train_data, valid_data, test_data = Multi30k.splits(
    fields=(SRC, TRG),
    exts=('.de', '.en'),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 858kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 242kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 236kB/s]


In [17]:
# Build both vocabularies from the training split only, with a minimum token
# frequency of 2.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [18]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [19]:
BATCH_SIZE = 128

# One BucketIterator per split, all sharing the same batch size and placing
# their batches on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)

In [40]:
class Encoder(nn.Module):
  """Transformer encoder.

  Embeds source tokens (scaled by sqrt(hid_dim)), adds learned positional
  embeddings, applies dropout, and passes the result through a stack of
  ``EncoderLayer`` blocks.

  Args:
    input_dim: size of the source vocabulary (rows of the token embedding).
    hid_dim: embedding / hidden size used throughout the stack.
    n_layers: number of ``EncoderLayer`` blocks.
    n_heads: number of attention heads, forwarded to every layer.
    pf_dim: inner size of the feed-forward block, forwarded to every layer.
    dropout: dropout probability.
    device: device on which the scale factor and position indices are created.
    max_length: number of rows in the positional embedding table; a source
      sequence longer than this cannot be embedded.
  """

  def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
    super().__init__()
    self.device = device
    self.tok_embedding = nn.Embedding(input_dim, hid_dim)
    self.pos_embedding = nn.Embedding(max_length, hid_dim)
    self.layers = nn.ModuleList([EncoderLayer(hid_dim,
                                              n_heads,
                                              pf_dim,
                                              dropout,
                                              device)
                                 for _ in range(n_layers)])
    self.dropout = nn.Dropout(dropout)
    # Named `scale` because forward() reads `self.scale` (same name the Decoder uses).
    self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

  def forward(self, src, src_mask):
    """Encode a batch of source token ids.

    Args:
      src: token ids, [batch_size, src_len].
      src_mask: mask handed unchanged to every layer; Seq2Seq.make_src_mask
        produces it as [batch_size, 1, 1, src_len].

    Returns:
      Encoded source, [batch_size, src_len, hid_dim].
    """
    batch_size = src.shape[0]
    src_len = src.shape[1]

    # Position indices 0..src_len-1 repeated for every sentence in the batch,
    # created on the module's own device rather than a notebook-level global.
    pos = torch.arange(0, src_len, device=self.device).unsqueeze(0).repeat(batch_size, 1)
    # pos = [batch_size, src_len]

    src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
    # src = [batch_size, src_len, hid_dim]

    for layer in self.layers:
      src = layer(src, src_mask)
    # src = [batch_size, src_len, hid_dim]

    return src

In [41]:
class EncoderLayer(nn.Module):
  """One encoder block: self-attention and a position-wise feed-forward
  network, each followed by dropout, a residual connection and LayerNorm.

  Args:
    hid_dim: hidden size of the incoming and outgoing representations.
    n_heads: number of attention heads.
    pf_dim: inner size of the feed-forward network.
    dropout: dropout probability, used inside the sub-layers and on the
      residual branches.
    device: forwarded to ``MultiHeadAttentionLayer``.
  """

  def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
    super().__init__()

    self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.ff_layer_norm = nn.LayerNorm(hid_dim)
    self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    # Uses the feed-forward class that this notebook actually defines, under
    # the same attribute name that DecoderLayer uses.
    self.positionwise_feedforward = PositionWiseFeedforwardLayer(hid_dim,
                                                                 pf_dim,
                                                                 dropout)
    # Pass the configured rate; a bare nn.Dropout() would fall back to p=0.5.
    self.dropout = nn.Dropout(dropout)

  def forward(self, src, src_mask):
    """Apply the block.

    Args:
      src: [batch_size, src_len, hid_dim].
      src_mask: mask forwarded to self-attention; Seq2Seq.make_src_mask
        produces it as [batch_size, 1, 1, src_len].

    Returns:
      Tensor of shape [batch_size, src_len, hid_dim].
    """
    # Self-attention sub-layer (attention weights are not needed here).
    _src, _ = self.self_attention(src, src, src, src_mask)
    src = self.self_attn_layer_norm(src + self.dropout(_src))
    # src = [batch_size, src_len, hid_dim]

    # Feed-forward sub-layer.
    _src = self.positionwise_feedforward(src)
    src = self.ff_layer_norm(src + self.dropout(_src))
    # src = [batch_size, src_len, hid_dim]

    return src

In [42]:
class MultiHeadAttentionLayer(nn.Module):
  """Multi-head scaled dot-product attention.

  Args:
    hid_dim: model hidden size; must be divisible by ``n_heads``.
    n_heads: number of attention heads.
    dropout: dropout probability applied to the attention weights.
    device: device on which the scaling constant is created.
  """

  def __init__(self, hid_dim, n_heads, dropout, device):
    super().__init__()

    assert hid_dim % n_heads == 0

    self.hid_dim = hid_dim
    self.n_heads = n_heads
    self.head_dim = hid_dim // n_heads

    self.fc_q = nn.Linear(hid_dim, hid_dim)
    self.fc_k = nn.Linear(hid_dim, hid_dim)
    self.fc_v = nn.Linear(hid_dim, hid_dim)

    self.fc_o = nn.Linear(hid_dim, hid_dim)
    self.dropout = nn.Dropout(dropout)

    # Dot products are divided by sqrt(head_dim) before the softmax.
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

  def forward(self, query, key, value, mask=None):
    """Attend from ``query`` positions to ``key``/``value`` positions.

    Args:
      query: [batch_size, query_len, hid_dim]
      key: [batch_size, key_len, hid_dim]
      value: [batch_size, key_len, hid_dim]
      mask: optional tensor broadcastable to
        [batch_size, n_heads, query_len, key_len]; positions where it equals
        0 are excluded from attention.

    Returns:
      Tuple ``(x, attention)`` where ``x`` is [batch_size, query_len, hid_dim]
      and ``attention`` is [batch_size, n_heads, query_len, key_len].
    """
    batch_size = query.shape[0]

    Q = self.fc_q(query)
    K = self.fc_k(key)
    V = self.fc_v(value)
    # Q = [batch_size, query_len, hid_dim]
    # K = [batch_size, key_len, hid_dim]
    # V = [batch_size, key_len, hid_dim]

    # Split hid_dim into n_heads chunks of head_dim and move heads next to batch.
    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    # Q = [batch_size, n_heads, query_len, head_dim]
    # K = [batch_size, n_heads, key_len, head_dim]
    # V = [batch_size, n_heads, key_len, head_dim]

    energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
    # energy = [batch_size, n_heads, query_len, key_len]

    if mask is not None:
      # A large negative score becomes ~0 probability after the softmax.
      energy = energy.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(energy, dim=-1)
    # attention = [batch_size, n_heads, query_len, key_len]

    x = torch.matmul(self.dropout(attention), V)
    # x = [batch_size, n_heads, query_len, head_dim]

    # contiguous() is required before view() because permute() only changes strides.
    x = x.permute(0, 2, 1, 3).contiguous()
    # x = [batch_size, query_len, n_heads, head_dim]

    x = x.view(batch_size, -1, self.hid_dim)
    # x = [batch_size, query_len, hid_dim]

    x = self.fc_o(x)
    # x = [batch_size, query_len, hid_dim]

    return x, attention

In [43]:
# Scratch check of unsqueeze / repeat / permute: builds a [1, 8] row 0..7,
# repeats it 4 times along dim 0, then swaps the two dims, giving an [8, 4]
# tensor whose row i is filled with the value i (see the output below).
torch.arange(8).unsqueeze(0).repeat(4, 1).permute(1, 0)

tensor([[0, 0, 0, 0],
        [1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [7, 7, 7, 7]])

In [44]:
class PositionWiseFeedforwardLayer(nn.Module):
  """Two-layer feed-forward network applied along the last dimension:
  Linear(hid_dim -> pf_dim), ReLU, dropout, Linear(pf_dim -> hid_dim).

  Args:
    hid_dim: size of the input and output features.
    pf_dim: size of the intermediate features.
    dropout: dropout probability applied after the ReLU.
  """

  def __init__(self, hid_dim, pf_dim, dropout):
    super().__init__()

    # Expansion followed by projection back to the model size.
    self.fc1 = nn.Linear(hid_dim, pf_dim)
    self.fc2 = nn.Linear(pf_dim, hid_dim)

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Map x = [batch_size, seq_len, hid_dim] to a tensor of the same shape."""
    expanded = self.fc1(x)
    # expanded = [batch_size, seq_len, pf_dim]

    activated = self.dropout(torch.relu(expanded))
    # activated = [batch_size, seq_len, pf_dim]

    projected = self.fc2(activated)
    # projected = [batch_size, seq_len, hid_dim]

    return projected

In [45]:
# NOTE: a later cell defines `Decoder` again; when both cells are run, that
# later definition is the one in effect.
class Decoder(nn.Module):
  """Transformer decoder.

  Embeds target tokens (scaled by sqrt(hid_dim)), adds learned positional
  embeddings, runs a stack of ``DecoderLayer`` blocks over the result and the
  encoder output, and projects to vocabulary logits.

  Args:
    output_dim: size of the target vocabulary.
    hid_dim: embedding / hidden size.
    n_layers: number of ``DecoderLayer`` blocks.
    n_heads: number of attention heads, forwarded to every layer.
    pf_dim: inner size of the feed-forward block, forwarded to every layer.
    dropout: dropout probability.
    device: device on which the scale factor and position indices are created.
    max_length: number of rows in the positional embedding table; a target
      sequence longer than this cannot be embedded.
  """

  def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
    super().__init__()

    self.device = device

    self.tok_embedding = nn.Embedding(output_dim, hid_dim)
    self.pos_embedding = nn.Embedding(max_length, hid_dim)

    self.layers = nn.ModuleList([DecoderLayer(hid_dim,
                                              n_heads,
                                              pf_dim,
                                              dropout,
                                              device)
                                 for _ in range(n_layers)])

    self.fc_out = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

    self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Decode a batch of target token ids against the encoded source.

    Args:
      trg: target token ids, [batch_size, trg_len].
      enc_src: encoder output, [batch_size, src_len, hid_dim].
      trg_mask: mask for decoder self-attention; Seq2Seq.make_trg_mask
        produces it as [batch_size, 1, trg_len, trg_len].
      src_mask: mask for encoder attention; Seq2Seq.make_src_mask produces it
        as [batch_size, 1, 1, src_len].

    Returns:
      Tuple ``(outputs, attention)``: ``outputs`` is
      [batch_size, trg_len, output_dim]; ``attention`` is whatever the last
      layer returned, or ``None`` when the decoder has no layers.
    """
    batch_size = trg.shape[0]
    trg_len = trg.shape[1]

    pos = torch.arange(0, trg_len, device=self.device).unsqueeze(0).repeat(batch_size, 1)
    # pos = [batch_size, trg_len]

    # The positional table is indexed by position, not by token id.
    trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
    # trg = [batch_size, trg_len, hid_dim]

    attention = None
    for layer in self.layers:
      trg, attention = layer(trg, enc_src, trg_mask, src_mask)
    # trg = [batch_size, trg_len, hid_dim]

    outputs = self.fc_out(trg)
    # outputs = [batch_size, trg_len, output_dim]

    return outputs, attention

In [46]:
class Decoder(nn.Module):
  """Transformer decoder.

  Embeds target tokens (scaled by sqrt(hid_dim)), adds learned positional
  embeddings, runs a stack of ``DecoderLayer`` blocks over the result and the
  encoder output, and projects to vocabulary logits.

  Args:
    output_dim: size of the target vocabulary.
    hid_dim: embedding / hidden size.
    n_layers: number of ``DecoderLayer`` blocks.
    n_heads: number of attention heads, forwarded to every layer.
    pf_dim: inner size of the feed-forward block, forwarded to every layer.
    dropout: dropout probability.
    device: device on which the scale factor and position indices are created.
    max_length: number of rows in the positional embedding table; a target
      sequence longer than this cannot be embedded.
  """

  def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
    super().__init__()

    self.device = device

    self.tok_embedding = nn.Embedding(output_dim, hid_dim)
    self.pos_embedding = nn.Embedding(max_length, hid_dim)

    self.layers = nn.ModuleList([DecoderLayer(hid_dim,
                                              n_heads,
                                              pf_dim,
                                              dropout,
                                              device)
                                 for _ in range(n_layers)])

    self.fc_out = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

    self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Decode a batch of target token ids against the encoded source.

    Args:
      trg: target token ids, [batch_size, trg_len].
      enc_src: encoder output, [batch_size, src_len, hid_dim].
      trg_mask: mask for decoder self-attention; Seq2Seq.make_trg_mask
        produces it as [batch_size, 1, trg_len, trg_len].
      src_mask: mask for encoder attention; Seq2Seq.make_src_mask produces it
        as [batch_size, 1, 1, src_len].

    Returns:
      Tuple ``(outputs, attention)``: ``outputs`` is
      [batch_size, trg_len, output_dim]; ``attention`` is whatever the last
      layer returned, or ``None`` when the decoder has no layers.
    """
    batch_size = trg.shape[0]
    trg_len = trg.shape[1]

    pos = torch.arange(0, trg_len, device=self.device).unsqueeze(0).repeat(batch_size, 1)
    # pos = [batch_size, trg_len]

    # The positional table is indexed by position, not by token id.
    trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
    # trg = [batch_size, trg_len, hid_dim]

    attention = None
    for layer in self.layers:
      trg, attention = layer(trg, enc_src, trg_mask, src_mask)
    # trg = [batch_size, trg_len, hid_dim]

    outputs = self.fc_out(trg)
    # outputs = [batch_size, trg_len, output_dim]

    return outputs, attention

In [48]:
class DecoderLayer(nn.Module):
  """One decoder block: self-attention, attention over the encoder output,
  and a position-wise feed-forward network, each followed by dropout, a
  residual connection and LayerNorm.

  Args:
    hid_dim: hidden size of the incoming and outgoing representations.
    n_heads: number of attention heads.
    pf_dim: inner size of the feed-forward network.
    dropout: dropout probability, used inside the sub-layers and on the
      residual branches.
    device: forwarded to ``MultiHeadAttentionLayer``.
  """

  def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
    super().__init__()

    self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.ff_layer_norm = nn.LayerNorm(hid_dim)
    self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
    self.positionwise_feedforward = PositionWiseFeedforwardLayer(hid_dim,
                                                                 pf_dim,
                                                                 dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    """Apply the block.

    Args:
      trg: [batch_size, trg_len, hid_dim].
      enc_src: [batch_size, src_len, hid_dim].
      trg_mask: mask for self-attention; Seq2Seq.make_trg_mask produces it as
        [batch_size, 1, trg_len, trg_len].
      src_mask: mask for encoder attention; Seq2Seq.make_src_mask produces it
        as [batch_size, 1, 1, src_len].

    Returns:
      Tuple ``(trg, attention)``: the updated target representation
      [batch_size, trg_len, hid_dim] and the attention tensor returned by the
      encoder-attention sub-layer.
    """
    # Self-attention over the target sequence.
    _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
    trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
    # trg = [batch_size, trg_len, hid_dim]

    # Attention from target positions to the encoded source.
    _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
    # Residual sum, as in the other two sub-layers; LayerNorm takes one input.
    trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
    # trg = [batch_size, trg_len, hid_dim]

    _trg = self.positionwise_feedforward(trg)
    trg = self.ff_layer_norm(trg + self.dropout(_trg))
    # trg = [batch_size, trg_len, hid_dim]

    return trg, attention


In [None]:
class Seq2Seq(nn.Module):
  """Wraps an encoder and a decoder and builds the masks they need.

  Args:
    encoder: callable taking ``(src, src_mask)``.
    decoder: callable taking ``(trg, enc_src, trg_mask, src_mask)`` and
      returning ``(output, attention)``.
    src_pad_inx: id of the padding token in source batches.
    trg_pad_inx: id of the padding token in target batches.
    device: device on which the causal (lower-triangular) mask is created.
  """

  def __init__(self, encoder, decoder, src_pad_inx, trg_pad_inx, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.src_pad_inx = src_pad_inx
    self.trg_pad_inx = trg_pad_inx
    self.device = device

  def make_src_mask(self, src):
    """Return a boolean mask that is True wherever ``src`` is not padding.

    src is [batch_size, src_len]; the result is [batch_size, 1, 1, src_len].
    """
    not_padding = src != self.src_pad_inx
    return not_padding.unsqueeze(1).unsqueeze(2)

  def make_trg_mask(self, trg):
    """Return a boolean mask combining padding and causal masking.

    trg is [batch_size, trg_len]; the result is
    [batch_size, 1, trg_len, trg_len] and is True where the key position is
    not padding and does not lie after the query position.
    """
    not_padding = (trg != self.trg_pad_inx).unsqueeze(1).unsqueeze(2)
    # not_padding = [batch_size, 1, 1, trg_len]

    length = trg.shape[1]
    all_ones = torch.ones((length, length), device=self.device)
    causal = torch.tril(all_ones).bool()
    # causal = [trg_len, trg_len]

    # Broadcasting the two masks yields [batch_size, 1, trg_len, trg_len].
    return not_padding & causal

  def forward(self, src, trg):
    """Run encoder then decoder and return only the decoder output.

    src is [batch_size, src_len] and trg is [batch_size, trg_len]; the
    attention returned by the decoder is discarded.
    """
    source_mask = self.make_src_mask(src)
    target_mask = self.make_trg_mask(trg)

    encoded = self.encoder(src, source_mask)

    decoded, _attention = self.decoder(trg, encoded, target_mask, source_mask)
    return decoded
