# Working off the Annotated Transformer:
Reference notebook:
http://nlp.seas.harvard.edu/annotated-transformer/#training

Illustrated notebook:
http://jalammar.github.io/illustrated-transformer/

OG article:
https://arxiv.org/pdf/1706.03762.pdf

Masking:
https://medium.com/analytics-vidhya/masking-in-transformers-self-attention-mechanism-bad3c9ec235c

Tensor2Tensor - attention illustration:
https://colab.research.google.com/github/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/hello_t2t.ipynb#scrollTo=OJKU36QAfqOC

I had a lot of trouble with virtual environments in JupyterLab on Windows. It appears that `venv` and `conda` commands don't interact with the JupyterLab instance launched from Anaconda - at least not in a way I'm familiar with. The solution is to create a virtual environment, pip install the packages from a terminal, and then load JupyterLab and launch it from Anaconda.

I'm also looking at launching JupyterLab without Anaconda, as I think this would fix the issue.

In [90]:
# Check which Python version this kernel is running:
# print the interpreter's full version string.
import sys
print(sys.version)

3.9.15 (main, Nov 24 2022, 14:39:17) [MSC v.1916 64 bit (AMD64)]


In [91]:
#see current packages
#!pip install -r requirments.txt --user  
#need to reload jupyter lab for pip install to take effect
!pip list

Package              Version
-------------------- ------------
altair               4.1.0
anyio                3.5.0
argon2-cffi          21.3.0
argon2-cffi-bindings 21.2.0
asttokens            2.0.5
attrs                22.1.0
Babel                2.11.0
backcall             0.2.0
beautifulsoup4       4.11.1
black                22.12.0
bleach               4.1.0
blis                 0.7.9
brotlipy             0.7.0
catalogue            2.0.8
certifi              2022.12.7
cffi                 1.15.1
charset-normalizer   2.1.1
click                8.1.3
colorama             0.4.6
comm                 0.1.2
cryptography         38.0.1
cymem                2.0.7
debugpy              1.5.1
decorator            5.1.1
defusedxml           0.7.1
docker-pycreds       0.4.0
entrypoints          0.4
executing            0.8.3
fastjsonschema       2.16.2
filelock             3.9.0
flake8               6.0.0
flit_core            3.6.0
gitdb                4.0.10
GitPython            3.1.30
GPUti

In [92]:
# Print the path of the running interpreter, then the site-packages folder
# derived from the directory that holds the standard-library `inspect` module.
import inspect
import os
import sys

print(sys.executable)
_stdlib_dir = os.path.dirname(inspect.getfile(inspect))
print(_stdlib_dir + "/site-packages")

C:\Users\AT030915\anaconda3\envs\Annotated_Transformer\python.exe
C:\Users\AT030915\anaconda3\envs\Annotated_Transformer\lib/site-packages


In [93]:
#import modules
import os 
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt 
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP 

# set RUN_EXAMPLES to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [94]:
# Define convenience helper functions - 
def is_interactive_notebook():
    """Return True when this code is executing as the ``__main__`` module."""
    running_as_main = __name__ == "__main__"
    return running_as_main


def show_example(fn, args=()):
    """Call ``fn(*args)`` and return its result when examples are enabled.

    The call only happens when this code runs as the ``__main__`` module and
    the module-level ``RUN_EXAMPLES`` flag is truthy; otherwise ``fn`` is not
    called and ``None`` is returned.

    Args:
        fn: Callable that produces the example output.
        args: Positional arguments unpacked into ``fn``. The default is an
            immutable empty tuple so no mutable object is shared across calls.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the example is skipped.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None


def execute_example(fn, args=()):
    """Call ``fn(*args)`` for its side effects when examples are enabled.

    The call only happens when this code runs as the ``__main__`` module and
    the module-level ``RUN_EXAMPLES`` flag is truthy. The result of ``fn`` is
    discarded; this function always returns ``None``.

    Args:
        fn: Callable to execute.
        args: Positional arguments unpacked into ``fn``. The default is an
            immutable empty tuple so no mutable object is shared across calls.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    It exposes a ``param_groups`` list holding one group with ``lr`` set to 0.
    """

    def __init__(self):
        # The base Optimizer initialiser is intentionally not called here;
        # only the `param_groups` attribute is populated.
        self.param_groups = [{"lr": 0}]

    def step(self):
        """No-op replacement for an optimisation step."""
        return None

    def zero_grad(self, set_to_none=False):
        """No-op replacement for gradient clearing; the flag is ignored."""
        return None


class DummyScheduler:
    """Scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """No-op replacement for a scheduler step."""
        return None
    

# Encoding/embedding 

 
Actually, it looks like embedding/encoding might be interchangeable in some contexts.

In [95]:
# Using super() because we want access to the nn.Module class properties.

class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper.

    Holds an encoder, a decoder, the source/target embedding callables and a
    generator. ``src`` is fed to the encoder side and ``tgt`` to the decoder
    side. The generator is stored but not applied by ``forward``.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Encode the masked source, then decode the masked target against it."
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        "Embed ``src`` and run it through the encoder with ``src_mask``."
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed ``tgt`` and run the decoder over it and the encoder ``memory``."
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [96]:
class Generator(nn.Module):
    "Linear projection to vocabulary size followed by a log-softmax."

    def __init__(self, d_model, vocab):
        super().__init__()
        # nn.Linear applies y = x A^T + b, mapping d_model features to vocab scores.
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        "Project ``x`` and return log-probabilities over the last dimension."
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)
    


In [97]:
#function for copying sublayers
def clones(module, N):
    "Return an nn.ModuleList holding N independent deep copies of ``module``."
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

In [98]:
#the encoder has a stack of N = 6 identical layers


class Encoder(nn.Module):
    "Stack of N copies of an encoder layer, followed by a final LayerNorm."

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # The norm is sized from the layer's `size` attribute.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Feed ``x`` (with ``mask``) through every layer in order, then normalise."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)
    
#help(nn.ModuleList)    

In [99]:
# Now use a LayerNorm to create the connection around each of the two sub-layers.
# Normalising the activations of the neurons reduces computation time.

# The output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself.
class LayerNorm(nn.Module):
    "Layer normalisation over the last dimension with a learnable gain and bias."

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # Gain starts at ones and bias at zeros.
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        "Centre ``x`` on its last-dim mean and divide by (std + eps)."
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself to avoid dividing by zero.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2
    
    

In [100]:
# now create the residual connection 
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sub-layer, combined with layer normalisation.
    For code simplicity the norm is applied to the sub-layer's input (first)
    rather than to its output (last).
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        # nn.Dropout randomly zeroes elements of its input with the given probability.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Return ``x`` plus the dropped-out output of ``sublayer`` on normed ``x``."
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update
    
#help(nn.Dropout)
    

In [101]:
#now build the encoder class
class EncoderLayer(nn.Module):
    "One encoder layer: self-attention followed by a feed-forward sub-layer."

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # One residual wrapper per sub-layer (attention, feed-forward).
        self.sublayer = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        "Apply masked self-attention, then feed-forward (Figure 1, left)."

        def self_attention(inp):
            # Query, key and value are all the same tensor for self-attention.
            return self.self_attn(inp, inp, inp, mask)

        attended = self.sublayer[0](x, self_attention)
        return self.sublayer[1](attended, self.feed_forward)
    
        

# Decoder
contains stack of N=6 identical layers

In [102]:
class Decoder(nn.Module):
    "Stack of N copies of a decoder layer (with masking), then a final LayerNorm."

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # The norm is sized from the layer's `size` attribute.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Run ``x`` through every layer with the encoder memory and both masks."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [103]:
class DecoderLayer(nn.Module):
    "One decoder layer: self-attention, source attention, then feed-forward."

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # One residual wrapper per sub-layer (self-attn, src-attn, feed-forward).
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Apply the three sub-layers in order (Figure 1, right)."

        def masked_self_attention(inp):
            # The decoder attends to its own input, restricted by tgt_mask.
            return self.self_attn(inp, inp, inp, tgt_mask)

        def encoder_attention(inp):
            # Queries come from the decoder; keys and values are the encoder
            # memory (see the paper's diagram: the encoder output feeds the
            # decoder's second sub-layer).
            return self.src_attn(inp, memory, memory, src_mask)

        x = self.sublayer[0](x, masked_self_attention)
        x = self.sublayer[1](x, encoder_attention)
        return self.sublayer[2](x, self.feed_forward)
    

        

In [104]:
#mask to stop the attention prediction depending on outputs at positions greater than i
#idea being you have a self attention sub layer (the first layer) in the decoder but you need to make sure that
#predictions can only depend on the previous i words
#essentially if you are trying to translate the 4th word then you can't use what's already in that position to predict what should be translated there.


def subsequent_mask(size):
    "Return a (1, size, size) boolean mask that is False at subsequent positions."
    # Ones strictly above the diagonal mark the positions to hide.
    future = torch.ones((1, size, size)).triu(diagonal=1).type(torch.uint8)
    # Invert: True wherever a position may be looked at.
    return future == 0

In [105]:
# show the attention mask
#i.e. the positions each tgt (target) word is allowed to look at

def example_mask():
    """Build an Altair heat-map of the 20x20 subsequent-position mask.

    One single-row DataFrame is created per (Masking, Window) cell and the
    frames are concatenated into the chart's data.

    Returns:
        An interactive ``alt.Chart`` with "Window" on the x axis, "Masking" on
        the y axis and the mask value as the colour.
    """
    # Build the mask once; it is identical for every cell, so there is no
    # need to recompute it for each of the 400 (x, y) pairs.
    mask = subsequent_mask(20)[0]

    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    "Subsequent Mask": mask[x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(20)
            for x in range(20)
        ]
    )

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X("Window:O"),
            alt.Y("Masking:O"),
            alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
        )
        .interactive()
    )

# Below, the attention mask shows the positions (columns) each tgt word (row) is allowed to look at. Words are blocked from attending to future words during training.
show_example(example_mask)

# Attention

1.  "Encoder-decoder" attention layers - the query comes from the previous decoder layer, and the memory keys and values come from the output of the encoder. This means every position in the decoder can attend over all positions in the input sequence.
2.  Self-attention in the encoder - key, value and query all come from the previous layer.
3.  Self-attention in the decoder - same as above, except that leftward information flow in the decoder must be prevented; this is done using the mask.

In [106]:
# create attention function

def attention(query, key, value, mask =None, dropout=None):
    "Compute scaled dot-product attention; return (output, attention weights)."
    # size(-1) is the length of the last dimension.
    d_k = query.size(-1)
    scale = math.sqrt(d_k)

    scores = torch.matmul(query, key.transpose(-2, -1)) / scale

    # Positions where the mask is 0 get -1e9 (numerically minus infinity).
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    # Softmax over the last dimension of the score matrix.
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    output = torch.matmul(weights, value)
    return output, weights

In [107]:
# Multi-headed attention allows the model to view information from different representation subspaces at different positions - the heads' outputs are then concatenated and passed through a final linear layer.

class MultiHeadedAttention(nn.Module):
    "Multi-head attention built from four d_model x d_model linear layers."

    def __init__(self, h, d_model, dropout=0.1):
        "Take in the number of heads ``h`` and the model size ``d_model``."
        super().__init__()
        # The model dimension must split evenly across the heads.
        assert d_model % h == 0
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Holds the attention weights from the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        batch_size = query.size(0)
        # Insert a head axis so the same mask is applied to all h heads.
        head_mask = None if mask is None else mask.unsqueeze(1)
        q_proj, k_proj, v_proj, out_proj = self.linears

        def split_heads(projection, tensor):
            # d_model => h x d_k, with the head axis moved ahead of the sequence axis.
            projected = projection(tensor)
            return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Do all the linear projections in batch.
        query = split_heads(q_proj, query)
        key = split_heads(k_proj, key)
        value = split_heads(v_proj, value)

        # 2) Apply attention on all the projected vectors in batch.
        context, self.attn = attention(
            query, key, value, mask=head_mask, dropout=self.dropout
        )

        # 3) "Concat" the heads using a view and apply the final linear.
        merged = (
            context.transpose(1, 2)
            .contiguous()
            .view(batch_size, -1, self.h * self.d_k)
        )
        return out_proj(merged)

# Position-wise Feed-Foward Networks

A feed-forward network with two linear transformations and a ReLU activation in between - the non-linearity is there so the two linear layers can't be collapsed into a single linear transformation.

In [108]:
class PositionwiseFeedForward(nn.Module):
    "Feed-forward network: linear, ReLU, dropout, linear."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Expand from d_model to d_ff, then project back to d_model.
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        "Apply the two linear maps with ReLU and dropout in between."
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

# Embeddings and Softmax  



In [109]:
class Embeddings(nn.Module):
    "Embedding lookup table whose output is scaled by sqrt(d_model)."

    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        # Lookup table with `vocab` rows of `d_model` features each.
        self.lut = nn.Embedding(vocab, d_model)

    def forward(self, x):
        "Look up the embeddings for ``x`` and multiply them by sqrt(d_model)."
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

# Positional Encoding 
This lets the model retain the position of each word, as well as the attention that word pays to all of the other words in the sequence.

In [110]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then dropout.

    Even feature columns hold sines and odd columns hold cosines of
    ``position * div_term``. The table is stored as the non-trainable buffer
    ``pe`` of shape ``(1, max_len, d_model)``.

    Args:
        d_model: Number of features per position. Both even and odd values
            are supported.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)

        # One frequency per even feature index: exp(-i * ln(10000) / d_model).
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the frequencies to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        "Add the encodings for the first ``x.size(1)`` positions, then dropout."
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [111]:
def example_positional():
    """Chart the positional-encoding values of a few feature dimensions.

    Encodes an all-zero (1, 100, 20) input with dropout 0, so the output is
    the encoding table itself, and plots dimensions 0, 3, 6 and 9 against
    position as an interactive Altair line chart.
    """
    encoder = PositionalEncoding(20, 0)
    y = encoder.forward(torch.zeros(1, 100, 20))

    frames = []
    for dim in (0, 3, 6, 9):
        frame = pd.DataFrame(
            {
                "embedding": y[0, :, dim],
                "dimension": dim,
                "position": list(range(100)),
            }
        )
        frames.append(frame)
    data = pd.concat(frames)

    chart = alt.Chart(data).mark_line().properties(width=800)
    chart = chart.encode(x="position", y="embedding", color="dimension:N")
    return chart.interactive()


show_example(example_positional)

#  Define model hyperparameters

In [112]:
#d_ff - inner layer dimensionality 

# #nn.init.xavier_uniform_: 
# Fills the input `Tensor` with values according to the method
# described in `Understanding the difficulty of training deep feedforward
# neural networks` - Glorot, X. & Bengio, Y. (2010), using a uniform
# distribution. The resulting tensor will have values sampled from
# :math:`\mathcal{U}(-a, a)`

def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    """Helper: Construct a model from hyperparameters.

    Args:
        src_vocab: Source vocabulary size (rows of the source embedding).
        tgt_vocab: Target vocabulary size (target embedding and generator).
        N: Number of layers in both the encoder and the decoder stack.
        d_model: Model / embedding dimension.
        d_ff: Inner dimension of the position-wise feed-forward network.
        h: Number of attention heads.
        dropout: Dropout probability used by every sub-module, including the
            attention layers.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        have been Xavier-uniform initialised.
    """
    c = copy.deepcopy
    # Forward `dropout` so the attention layers honour the caller's setting
    # instead of always falling back to their own default.
    attn = MultiHeadedAttention(h, d_model, dropout=dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Each use gets its own deep copy so no weights are shared between them.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model


# Inference testing 
- generate predictions of the model based on an untrained input - hence the results will be random

In [128]:
def inference_test(output_words=9):
    """Greedily decode from a freshly built (untrained) model and print the result.

    A small model (vocabulary 11, N=2) is created, a fixed 10-token source is
    encoded, and ``output_words - 1`` tokens are appended one at a time to a
    sequence that starts with token 0, each being the highest-scoring token.
    """
    model = make_model(src_vocab=11, tgt_vocab=11, N=2)

    # Switch the model to evaluation mode.
    model.eval()

    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    src_mask = torch.ones(1, 1, 10)
    memory = model.encode(src, src_mask)

    # Running prediction, seeded with a single 0 token of the same type as src.
    ys = torch.zeros(1, 1).type_as(src)

    steps = output_words - 1
    for _ in range(steps):
        tgt_mask = subsequent_mask(ys.size(1)).type_as(src.data)
        out = model.decode(memory, src_mask, ys, tgt_mask)

        # Score only the last position and keep the best token's index.
        prob = model.generator(out[:, -1])
        _, best = torch.max(prob, dim=1)
        next_word = best.data[0]

        # fill_ writes the chosen token into a new (1, 1) tensor to append.
        new_column = torch.empty(1, 1).type_as(src.data).fill_(next_word)
        ys = torch.cat([ys, new_column], dim=1)

    print("Example Untrained Model Prediction:", ys)

inference_test(output_words = 2)




Example Untrained Model Prediction: tensor([[ 0, 10]])


In [129]:
def run_tests(loops=10, output_words=9):
    """Call ``inference_test(output_words)`` ``loops`` times in a row."""
    remaining = loops
    while remaining > 0:
        inference_test(output_words)
        remaining -= 1
        
(run_tests(5,5))        

Example Untrained Model Prediction: tensor([[ 0, 10,  7,  6,  0]])
Example Untrained Model Prediction: tensor([[0, 5, 4, 2, 4]])
Example Untrained Model Prediction: tensor([[0, 9, 1, 2, 5]])
Example Untrained Model Prediction: tensor([[0, 5, 6, 7, 2]])
Example Untrained Model Prediction: tensor([[0, 0, 0, 0, 0]])


# Model training