In [33]:
import re
import math
import importlib
import copy
import numpy as np
import time

import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# Load the TensorBoard notebook extension.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [2]:
!pip3 install pkuseg==0.0.25
!pip3 install https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0.tar.gz
# then restart runtime

Collecting pkuseg==0.0.25
  Downloading pkuseg-0.0.25-cp37-cp37m-manylinux1_x86_64.whl (50.2 MB)
[K     |████████████████████████████████| 50.2 MB 204 kB/s 
Installing collected packages: pkuseg
Successfully installed pkuseg-0.0.25
Collecting https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0.tar.gz (49.6 MB)
[K     |████████████████████████████████| 49.6 MB 209 kB/s 
[?25hCollecting spacy<3.1.0,>=3.0.0
  Downloading spacy-3.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 4.8 MB/s 
[?25hCollecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkuseg-0.0.28-cp37-cp37m-manylinux2014_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 43.3 MB/s 
Collecting catalogue<2.1.0,>=2.0.4
  Downloading catalogue-2.0.6-py3-none-any.

In [2]:
nlp = spacy.load("zh_core_web_sm")
doc = nlp("你好，这里是中国。")
print(doc.text)

for token in doc:
    print(token.text, token.pos_, token.dep_)

你好，这里是中国。
你好 VERB ROOT
， PUNCT punct
这里 PRON nsubj
是 VERB cop
中国 PROPN conj
。 PUNCT punct


# Build Transformer

![transformer](https://miro.medium.com/max/644/1*46c7LPV22532Svcewui37g.png)

## 1. Tokenize

In [3]:
class Tokenize(object):
    def __init__(self, lang):
        self.nlp = importlib.import_module(lang).load()

    def tokenizer(self, sentence):
        sentence = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
        sentence = re.sub(r"[ ]+", " ", sentence)
        sentence = re.sub(r"\!+", "!", sentence)
        sentence = re.sub(r"\,+", ",", sentence)
        sentence = re.sub(r"\?+", "?", sentence)
        sentence = sentence.lower()
        return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]

In [4]:
tokenize = Tokenize('zh_core_web_sm')
tokenize.tokenizer('你好，这里是中国。')

['你好', '，', '这里', '是', '中国', '。']

## 2. Input Embedding

### Token Embedding

In [12]:
class Embedder(nn.Module):
    # defines the layers of the model 
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model

        # Linear layer expects vectors (e.g. one-hot-encoding of words), Embedding Layer expects tokens (e.g. words index)
        # The layer holds a Tensor of dimension: size of the vocabulary * size of each embedding vector
        # calling Embedding(torch.LongTensor([3,4])) will return the embedding vectors corresponding to the word 3 and 4 in your vocabulary
        self.embed = nn.Embedding(vocab_size, d_model) 

    # defines tensor operations that propagate input through the defined layers of the model
    def forward(self, x):
        return self.embed(x)

In [13]:
embedder = Embedder(10, 6)
x = embedder(torch.LongTensor([[1,2,4,5],[4,3,2,9]]))
x

tensor([[[ 0.0213, -0.1896, -0.3202, -0.2448, -1.1269,  1.6894],
         [-1.0196, -0.5152, -1.5480,  0.2170,  0.2013,  0.3572],
         [ 0.4963, -1.1942, -1.1032,  0.0088,  0.4541,  1.1270],
         [ 0.1167,  1.2405,  0.3348,  1.7332, -1.6667, -1.9068]],

        [[ 0.4963, -1.1942, -1.1032,  0.0088,  0.4541,  1.1270],
         [-1.7532,  0.7491, -0.3572, -1.2576, -0.6655,  1.8617],
         [-1.0196, -0.5152, -1.5480,  0.2170,  0.2013,  0.3572],
         [-2.2407, -0.2142,  0.9365,  1.0071, -0.5728,  1.0450]]],
       grad_fn=<EmbeddingBackward0>)

### Position Encoding

In [14]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=80):
        super().__init__()
        self.d_model = d_model

        # create constant positional encoding matrix
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))

        # (max_seq_len, d_model) -> (1, max_seq_len, d_model)
        pe = pe.unsqueeze(0)

        # Buffers won’t be returned in model.parameters(), so that the optimizer won’t have a change to update them.
        # saved as persistent state, such as running mean in BatchNorm
        self.register_buffer('pe', pe)

    def forward(self, x):
        # scale input embedding to make it relatively larger, so that adding pe won't distort the original information of input embedding
        x = x * math.sqrt(self.d_model)

        # tensor.size(dim) count # of elements along given axis
        # x : [batch?, seq_len, d_model]
        seq_len = x.size(1)

        # add positional encoding to input embedding
        x = x + Variable(self.pe[:, :seq_len], requires_grad = False)

        return x


In [15]:
pos_enc = PositionalEncoder(6)
x = pos_enc(x)
x

tensor([[[ 0.0523,  0.5355, -0.7843,  0.4004, -2.7604,  5.1382],
         [-1.6560, -0.2630, -3.7897,  1.5315,  0.4930,  1.8749],
         [ 2.1250, -1.9294, -2.6980,  1.0215,  1.1122,  3.7607],
         [ 0.4270,  4.0289,  0.8267,  5.2456, -4.0827, -3.6708]],

        [[ 1.2157, -1.9251, -2.7023,  1.0215,  1.1122,  3.7607],
         [-3.4530,  2.8339, -0.8727, -2.0805, -1.6302,  5.5602],
         [-1.5882, -0.2662, -3.7875,  1.5315,  0.4930,  1.8749],
         [-5.3474,  0.4657,  2.3004,  3.4670, -1.4031,  3.5596]]],
       grad_fn=<AddBackward0>)

## 3. Transformer Encoder

![img](https://raw.githubusercontent.com/leox1v/dl20/b3d5b5556d1b2bd360a4abeef4fd82f056ab0301/imgs/transformer-block.svg)

### (1) Multi-Head-Attention Layer

![attention](https://www.researchgate.net/publication/345482934/figure/fig2/AS:955463785013258@1604811726722/Internal-structure-of-the-Multi-Headed-Self-Attention-Mechanism-in-a-Transformer-block.png)

In [16]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    # scale qk to make qk follow std. dist. again
    scores = torch.matmul(q, k.transpose(-2, -1))/math.sqrt(d_k)
    print("in attention: q.size(): ", q.size(), "k.T.size(): ", k.transpose(-2, -1).size(), "score.size(): ", scores.size())

    # mask: 1) in encoder & decoder, attention for padding should be zero 2) prevent decoder from peeking next word  
    # mask==0 represent tokens which added for padding, replace 0 with -1e9 so that it become 0 after softmax
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask==0, -1e9)

    # score: (bs, heads, seq_len<# of query>, seq_len<# of score for respective query>), do softmax along the second dimension
    scores = F.softmax(scores, dim=-1)

    # apply dropout layer on scores
    if dropout is not None:
        scores = dropout(scores)

    # output: (bs, heads, seq_len<# of query>, d_model<# of dim of contextulized embedding>)
    output = torch.matmul(scores, v)
    return output

In [17]:
# class MultiHeadAttention_not_preferred(nn.Module):
#     def __init__(self, heads, d_model, dropout = 0.1):
#         super().__init__()

#         self.d_model = d_model
#         self.d_k = d_model // heads
#         self.h = heads

#         self.q_linear = nn.Linear(d_model, d_model)
#         self.v_linear = nn.Linear(d_model, d_model)
#         self.k_linear = nn.Linear(d_model, d_model)

#         self.dropout = nn.Dropout(dropout)
#         self.out = nn.Linear(d_model, d_model)

#     def forward(self, q, k, v, mask=None):
#         # batch_size
#         bs = q.size(0)

#         # perform linear operation, and split into N heads
#         # view: reshape the data, the size -1 is inferred from other dimensions
#         k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
#         q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
#         v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

#         # transpose to get dimensions bs * heads * seq_len * d_model
#         k = k.transpose(1,2)
#         q = q.transpose(1,2)
#         v = v.transpose(1,2)

#         # calculate attention
#         scores = attention(q, k, v, self.d_k, mask, self.dropout)

#         # concatenate heads
#         concat = scores.transpose(1,2).contiguous().view(bs, -1, self.d_model)

#         # dense concat using linear layer
#         output = self.out(concat)

#         return output

In [18]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model * heads)
        self.v_linear = nn.Linear(d_model, d_model * heads)
        self.k_linear = nn.Linear(d_model, d_model * heads)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(heads * d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        Args:
            q,k,v: The input embedding of shape [bs, seq_len, d_model].
            
        Returns:
            Self attention tensor of shape [bs, seq_len, d_model].
        """
        # batch_size
        bs = q.size(0)

        # perform linear operation, and split into N heads
        # view: reshape the data, the size -1 is inferred from other dimensions
        k = self.k_linear(k).view(bs, -1, self.h, self.d_model)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_model)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_model)
        print("before attention: k.size(): ", k.size())

        # transpose to get dimensions (bs, heads, seq_len, d_model)
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        # calculate attention to arrive at (bs, heads, seq_len, d_model)
        scores = attention(q, k, v, self.d_model, mask, self.dropout)
        print("after attention: scores.size(): ", scores.size())

        # swap heads, seq_len back
        # then fold the heads into the d_model dimension to arrive at (bs, seq_len, heads * d_model)
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.h * self.d_model)
        print("before dense, concat.size(): ", concat.size())

        # concatenate heads, dense concat using linear layer to arrive at shape (bs, seq_len, d_model)
        output = self.out(concat)
        print("after dense, concat.size(): ", output.size())

        return output

In [19]:
multi_head = MultiHeadAttention(3, 6)
x = multi_head(x, x, x)
x

before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat.size():  torch.Size([2, 4, 6])


tensor([[[-5.1469e-01,  2.8398e-01, -6.6061e-01,  7.8064e-01, -1.2111e+00,
           8.7032e-01],
         [ 8.4251e-03, -1.1492e-03, -4.1300e-01,  6.6197e-01, -6.5142e-01,
           6.0440e-02],
         [ 5.7709e-02,  8.4160e-02, -5.5680e-01,  7.2970e-01, -7.4402e-01,
          -9.4615e-03],
         [-5.3445e-01, -1.8519e+00, -2.5510e-01, -4.8169e-01, -1.3785e+00,
           1.9085e-01]],

        [[-5.8775e-01, -1.7358e+00,  3.7881e-02, -5.7792e-02, -8.5012e-01,
           7.1373e-01],
         [-6.7315e-01, -1.4729e+00,  1.9081e-01,  2.3022e-01, -4.0392e-01,
           6.0857e-01],
         [-3.2952e-01, -1.4959e+00,  1.7805e-01, -3.5701e-01, -2.3952e-01,
           9.5266e-02],
         [-5.4317e-01, -7.2857e-01,  8.5229e-02,  5.8847e-02,  5.7733e-02,
          -7.9071e-02]]], grad_fn=<AddBackward0>)

### (2) LayerNorm

$y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta$

In [20]:
class NormLayer(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()

        self.size = d_model

        # train two learnable parameters
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

        # a value added to the denominator for numerical stability
        self.eps = eps

    def forward(self, x):
        # calculate mean over last axis: d_model
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

In [21]:
norm = NormLayer(6)
x = norm(x)
x

tensor([[[-5.1923e-01,  4.2443e-01, -6.9165e-01,  1.0113e+00, -1.3420e+00,
           1.1172e+00],
         [ 1.4230e-01,  1.2108e-01, -7.9159e-01,  1.5906e+00, -1.3199e+00,
           2.5756e-01],
         [ 2.4953e-01,  2.9998e-01, -9.2252e-01,  1.5312e+00, -1.2796e+00,
           1.2142e-01],
         [ 2.4366e-01, -1.5008e+00,  6.1355e-01,  3.1351e-01, -8.7395e-01,
           1.2040e+00]],

        [[-2.0645e-01, -1.5650e+00,  5.3393e-01,  4.2071e-01, -5.1693e-01,
           1.3337e+00],
         [-5.5499e-01, -1.6124e+00,  5.8731e-01,  6.3941e-01, -1.9902e-01,
           1.1397e+00],
         [ 4.7598e-02, -1.8950e+00,  8.9299e-01,  1.8146e-03,  1.9750e-01,
           7.5512e-01],
         [-9.9378e-01, -1.5177e+00,  7.8200e-01,  7.0745e-01,  7.0430e-01,
           3.1771e-01]]], grad_fn=<AddBackward0>)

### (3) Feed Forward Layer

In [22]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()

        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = F.relu(self.linear_1(x))
        x = self.dropout(x)
        x = self.linear_2(x)

        return x

In [23]:
ff = FeedForward(6)
x = ff(x)
x

tensor([[[ 0.3133, -0.0840, -0.1850, -0.1372,  0.6160, -0.2736],
         [ 0.2534,  0.0568, -0.0225,  0.1077,  0.6298, -0.1486],
         [ 0.2518,  0.0036,  0.0836, -0.0429,  0.7081, -0.0291],
         [ 0.2077,  0.0863, -0.1182, -0.2939,  0.4759,  0.0521]],

        [[-0.0283,  0.0900, -0.1073, -0.2666,  0.3705,  0.0846],
         [-0.1590,  0.0110, -0.1929, -0.2772,  0.4851,  0.0890],
         [-0.1223,  0.1114, -0.1816, -0.4036,  0.3201,  0.1907],
         [-0.4788, -0.0825, -0.2846, -0.1588,  0.4718,  0.0833]]],
       grad_fn=<AddBackward0>)

### (4) Encoder

In [24]:
# one block
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = NormLayer(d_model)
        self.norm_2 = NormLayer(d_model)

        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        # first resdisual network
        x_prime = self.dropout_1(self.attn(x, x, x, mask))
        x = self.norm_1(x_prime + x)

        # second residual network
        x_prime = self.dropout_2(self.ff(x))
        x = self.norm_2(x_prime + x)

        return x

In [25]:
# We can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [26]:
class Encoder(nn.Module): 
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        # N: how many times to repeat the encoder block
        self.N = N

        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)

        # layers: a list of encoder layers
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = NormLayer(d_model)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)

        return self.norm(x)

In [27]:
# vocab_size = 10, d_model = 6, N = 2, heads = 3
ec = Encoder(10, 6, 2, 3, 0.1)
enc_outs = ec(torch.LongTensor([[1,2,4,5],[4,3,2,9]]), None)
enc_outs

before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat.size():  torch.Size([2, 4, 6])
before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat.size():  torch.Size([2, 4, 6])


tensor([[[ 0.1669, -0.8777, -0.2347, -0.8343,  1.8560, -0.0762],
         [ 0.4490,  0.9609, -1.1416, -0.6694,  1.2113, -0.8102],
         [ 1.4555, -0.9952,  0.1804, -1.0120,  0.8044, -0.4331],
         [ 0.4101,  0.0764,  0.2366, -1.2998,  1.4955, -0.9187]],

        [[ 0.1294, -1.4758, -0.0938, -0.6330,  1.3876,  0.6856],
         [ 0.7848,  0.2365, -1.7956, -0.2094,  1.0286, -0.0450],
         [ 0.1697,  0.7690, -1.3142, -0.4046,  1.4352, -0.6551],
         [ 0.6588, -1.6030,  0.0857,  0.2879,  1.2189, -0.6483]]],
       grad_fn=<AddBackward0>)

## 4. Transformer Decoder

![img](https://raw.githubusercontent.com/leox1v/dl20/b3d5b5556d1b2bd360a4abeef4fd82f056ab0301/imgs/classifier.svg)

In [28]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()

        self.norm_1 = NormLayer(d_model)
        self.norm_2 = NormLayer(d_model)
        self.norm_3 = NormLayer(d_model)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, enc_outputs, src_mask, trg_mask):
        # first resdisual network in decoder
        x_prime = self.dropout_1(self.attn_1(x, x, x, trg_mask))
        x = self.norm_1(x_prime + x)

        # second residual network
        # attention(q, k, v, mask)
        x_prime = self.dropout_2(self.attn_2(x, enc_outputs, enc_outputs, src_mask))
        x = self.norm_2(x_prime + x)

        # third residual network
        x_prime = self.dropout_3(self.ff(x))
        x = self.norm_3(x_prime + x)

        return x

In [29]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()

        self.N = N

        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)

        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = NormLayer(d_model)

    def forward(self, trg, enc_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, enc_outputs, src_mask, trg_mask)

        return self.norm(x)

In [30]:
# vocab_size = 10, d_model = 6, N = 2, heads = 3
dec = Decoder(10, 6, 2, 3, 0.1)
dec_outs = dec(torch.LongTensor([[4,3,2,9], [1,2,4,5]]), enc_outs, None, None)
dec_outs

before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat.size():  torch.Size([2, 4, 6])
before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat.size():  torch.Size([2, 4, 6])
before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat

tensor([[[ 0.7530,  0.5741,  0.0394,  1.0241, -0.8825, -1.5081],
         [-1.4424, -0.0928,  1.0975,  1.1407, -0.6325, -0.0706],
         [ 0.0489,  0.1028, -1.8282, -0.1047,  0.7363,  1.0449],
         [ 0.8701,  1.0084, -0.3340,  0.7248, -1.0488, -1.2204]],

        [[-0.2432,  1.6191,  0.3136,  0.3459, -1.1396, -0.8958],
         [ 0.8243, -0.5527, -1.4423,  1.3680,  0.0494, -0.2467],
         [ 0.9216,  0.8613, -0.0116,  0.6389, -0.9859, -1.4243],
         [ 1.2805,  0.2038, -0.2395,  0.8916, -1.3719, -0.7644]]],
       grad_fn=<AddBackward0>)

## 5. Complete Transformer

In [31]:
class Transformer(nn.Module):  
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)
        
    def forward(self, src, trg, src_mask, trg_mask):
        enc_outputs = self.encoder(src, src_mask)
        dec_output = self.decoder(trg, enc_outputs, src_mask, trg_mask)
        output = self.out(dec_output)
        return output

In [32]:
# vocab_size = 10, d_model = 6, N = 2, heads = 3, dropout = 0.1
trans = Transformer(10, 10, 6, 2, 3, 0.1)
trans(torch.LongTensor([[1,2,4,5],[4,3,2,9]]), torch.LongTensor([[4,3,2,9], [1,2,4,5]]), None, None)

before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat.size():  torch.Size([2, 4, 6])
before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat.size():  torch.Size([2, 4, 6])
before attention: k.size():  torch.Size([2, 4, 3, 6])
in attention: q.size():  torch.Size([2, 3, 4, 6]) k.T.size():  torch.Size([2, 3, 6, 4]) score.size():  torch.Size([2, 3, 4, 4])
after attention: scores.size():  torch.Size([2, 3, 4, 6])
before dense, concat.size():  torch.Size([2, 4, 18])
after dense, concat

tensor([[[ 0.4984, -0.5855,  0.3687,  0.0863,  0.5848,  0.0384,  0.0076,
           0.5324,  0.1880,  0.3326],
         [-0.0072, -0.6426,  0.3195,  0.4552,  0.2872,  0.1384, -0.3262,
           0.8380,  0.6184,  0.0619],
         [-0.1415, -0.0024, -0.1499,  0.0955,  0.9216,  0.3525, -0.5664,
           0.5567,  0.1013, -0.3077],
         [ 0.3067, -0.7626,  0.2457, -0.0944,  0.0869,  0.0742,  0.5311,
          -0.0109, -0.1188,  0.5765]],

        [[-0.1160, -0.2672, -0.1414, -0.1742,  0.8364,  0.2723,  0.0569,
           0.4402, -0.1929,  0.4056],
         [ 0.0756, -0.1109, -0.0171,  0.0518,  0.9061,  0.2467, -0.4086,
           0.5244,  0.0728, -0.1393],
         [ 0.3312, -0.5124,  0.2151,  0.0143,  0.6618,  0.1524,  0.0024,
           0.5341,  0.0918,  0.3563],
         [ 1.0206, -0.7058,  0.7884,  0.3457,  0.0966, -0.2502, -0.0772,
           0.1360,  0.4390, -0.1667]]], grad_fn=<AddBackward0>)

# Training Transformer

[en blog tutorial](https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec)

In [None]:
d_model = 512
heads = 8
N = 6
src_vocab = len(EN_TEXT.vocab)
trg_vocab = len(FR_TEXT.vocab)
model = Transformer(src_vocab, trg_vocab, d_model, N, heads)

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
        
# this code is very important! It initialises the parameters with a
# range of values that stops the signal fading or getting too big.
# See this blog for a mathematical explanation.
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [None]:
# batch = next(iter(train_iter))

# from torchtext import data

# class MyIterator(data.Iterator):
#     def create_batches(self):
#         if self.train:
#             def pool(d, random_shuffler):
#                 for p in data.batch(d, self.batch_size * 100):
#                     p_batch = data.batch(
#                         sorted(p, key=self.sort_key),
#                         self.batch_size, self.batch_size_fn)
#                     for b in random_shuffler(list(p_batch)):
#                         yield b
#             self.batches = pool(self.data(), self.random_shuffler)
            
#         else:
#             self.batches = []
#             for b in data.batch(self.data(), self.batch_size,
#                                           self.batch_size_fn):
#                 self.batches.append(sorted(b, key=self.sort_key))


def train_model(epochs, print_every=100):
    model.train()
    
    start = time.time()
    temp = start
    
    total_loss = 0
    
    for epoch in range(epochs):
       
        for i, batch in enumerate(train_iter):
            src = batch.English.transpose(0,1)
            input_pad = EN_TEXT.vocab.stoi['<pad>']
            # creates mask with 0s wherever there is padding in the input
            src_mask = (src != input_pad).unsqueeze(1)

            trg = batch.French.transpose(0,1)
            trg_input = trg[:, :-1] 
            # the words we are trying to predict
            targets = trg[:, 1:].contiguous().view(-1)
            # the French sentence we input has all words except
            # the last, as it is using each word to predict the next
            # create mask as before
            target_pad = FR_TEXT.vocab.stoi['<pad>']
            target_msk = (trg_input != target_pad).unsqueeze(1)
            size = trg_input.size(1) # get seq_len for matrix
            nopeak_mask = np.triu(np.ones(1, size, size), k=1).astype('uint8')
            nopeak_mask = Variable(torch.from_numpy(nopeak_mask) == 0)
            trg_mask = target_msk & nopeak_mask

                
            # src_mask, trg_mask = create_masks(src, trg_input)
            preds = model(src, trg_input, src_mask, trg_mask)
            results = trg[:, 1:].contiguous().view(-1)
            optim.zero_grad()
            
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), results, ignore_index=target_pad)
            loss.backward()
            optim.step()
            
            total_loss += loss.data[0]
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % \
                      ((time.time() - start) // 60, epoch + 1, i + 1, loss_avg, time.time() - temp, print_every))
                total_loss = 0
                temp = time.time()

# Testing Transformer


In [None]:
def translate(model, src, max_len = 80, custom_string=False):
    
    model.eval()
    if custom_sentence == True:
            src = tokenize_en(src)
            sentence=\
            Variable(torch.LongTensor([[EN_TEXT.vocab.stoi[tok] for tok
            in sentence]])).cuda()
        src_mask = (src != input_pad).unsqueeze(-2)
        e_outputs = model.encoder(src, src_mask)
        
        outputs = torch.zeros(max_len).type_as(src.data)
        outputs[0] = torch.LongTensor([FR_TEXT.vocab.stoi['<sos>']])
        
    for i in range(1, max_len):    
                
            trg_mask = np.triu(np.ones((1, i, i),
            k=1).astype('uint8')
            trg_mask= Variable(torch.from_numpy(trg_mask) == 0).cuda()
            
            out = model.out(model.decoder(outputs[:i].unsqueeze(0),
            e_outputs, src_mask, trg_mask))
            out = F.softmax(out, dim=-1)
            val, ix = out[:, -1].data.topk(1)
            
            outputs[i] = ix[0][0]
            if ix[0][0] == FR_TEXT.vocab.stoi['<eos>']:
                break
    return ' '.join([FR_TEXT.vocab.itos[ix] for ix in outputs[:i]])