In [5]:
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
import math

In [6]:
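# nn.Module is the base class for every PyTorch model.
# It keeps track of the sub-modules assigned to it (e.g. conv layers
# or transformer layers) together with their parameters.
# Subclasses describe the forward pass by overriding forward();
# loss functions (e.g. nn.CrossEntropyLoss) are nn.Modules as well.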
class TransformerModule(nn.Module):
    """Transformer-encoder language model.

    Pipeline: token embedding -> positional encoding -> stack of
    ``nlayers`` ``TransformerEncoderLayer`` -> linear projection to
    per-token scores over the vocabulary.

    Arguments:
        ntoken: number of unique tokens (including special tokens such as
            padding or eos).
        d_model: embedding dimension, i.e. the dimension each vocabulary
            entry is mapped to. The embedding is learned during training.
        nhead: number of attention heads (see the notes in ``__init__``).
        d_hid: dimension of the feed-forward network that follows the
            attention step inside each encoder layer.
        nlayers: number of encoder layers.
        dropout: dropout rate passed to the positional encoding and to
            each encoder layer.
        droupout: deprecated, misspelled keyword alias of ``dropout`` kept
            so existing keyword callers keep working. When given, it
            overrides ``dropout``.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int,
                 d_hid: int, nlayers: int, dropout: float = 0.5,
                 *, droupout: float = None):
        # Notes on multi-head self-attention (nhead):
        # Every token gets three vectors - Query, Key and Value. They are
        # computed by passing the initial sequence embedding through three
        # separate linear layers (one each for Q, K and V), each of size
        # Emb x Emb.
        # Self-attention scores a token by taking the dot product of its
        # Query with every Key, giving a vector of size |T| (number of
        # tokens in the input). A softmax over these scores gives the
        # weight of each token's Value vector, and the weighted sum of the
        # Value vectors is the output for that token.
        # In multi-headed attention these matrices are split across the
        # heads and an attention score is computed per head.
        # More details: https://towardsdatascience.com/transformers-explained-visually-part-3-multi-head-attention-deep-dive-1c1ff1024853

        # nn.Module.__init__ sets up the parameter / sub-module registries
        # and has to run before any sub-module is assigned to self.
        super().__init__()

        # Honour the legacy misspelled keyword if a caller still uses it.
        if droupout is not None:
            dropout = droupout

        self.model_type = 'Transformer'
        # PositionalEncoding is defined in a later cell; it only has to
        # exist by the time the model is instantiated.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        # Maps each encoder output vector to a vector of size ntoken:
        # one unnormalised score (logit) per vocabulary entry for the
        # next token in the sequence.
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise the embedding and output projection in place."""
        initrange = 0.1
        # The nn.init helpers modify the parameter in place without
        # recording the operation for autograd. A plain in-place call such
        # as ``bias.zero_()`` on a parameter that requires grad raises a
        # RuntimeError.
        # Embedding weights: uniform in [-0.1, 0.1]
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        # Linear bias: all zeros
        nn.init.zeros_(self.linear.bias)
        # Linear weights: uniform in [-0.1, 0.1]
        nn.init.uniform_(self.linear.weight, -initrange, initrange)

    # This function defines the forward pass of the model,
    # that is, what steps it will take when data is input
    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """ Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]`` - Used to mask so future tokens do not have influence in prediction

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        # Scale the embeddings by sqrt(d_model), the heuristic used in the
        # original "Attention Is All You Need" paper.
        # NOTE(review): the usual rationale is that this keeps the
        # embedding magnitude comparable to the positional encoding that
        # is added next - confirm it suits the init range used here.
        src = self.embedding(src) * math.sqrt(self.d_model)
        # Add positional information (and apply dropout)
        src = self.pos_encoder(src)

        # Now pass through our nlayers encoder stack
        output = self.transformer_encoder(src, src_mask)
        # Project to vocabulary-sized logits
        output = self.linear(output)
        return output

In [7]:
# TransformerModule uses this class as a sub-module (composition), so changing
# the positional encoding here changes it everywhere the model uses it.
# This is what I will need to edit to get the same encoding that is used in
# "musicautobot"

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    For position ``pos`` and frequency index ``i``:
        pe[pos, 0, 2i]   = sin(pos / 10000^(2i / d_model))
        pe[pos, 0, 2i+1] = cos(pos / 10000^(2i / d_model))

    Arguments:
        d_model: embedding dimension (even or odd).
        dropout: dropout rate applied to the sum of input and encoding.
        max_len: number of positions pre-computed; inputs to ``forward``
            must not be longer than this.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        # Initialise the dropout layer
        self.dropout = nn.Dropout(p=dropout)

        # torch.arange(max_len) - 1D tensor containing 0 .. max_len - 1
        # unsqueeze(1) turns it into a 2D tensor of size [max_len, 1]:
        # [[0], [1], ..., [max_len - 1]]
        position = torch.arange(max_len).unsqueeze(1)

        # torch.arange(0, d_model, 2) - the even indices 2i = 0, 2, 4, ...
        # 10000 is the constant chosen in the original paper; it makes the
        # frequencies decay exponentially along the embedding dimension.
        # result = exp(-2i * ln(10000) / d_model) = 10000^(-2i / d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        # Broadcast to one angle per (position, frequency):
        # [max_len, 1] * [ceil(d_model / 2)] -> [max_len, ceil(d_model / 2)]
        angles = position * div_term

        # Initialise the positional encoding table
        pe = torch.zeros(max_len, 1, d_model)
        # Even embedding dimensions get the sine
        pe[:, 0, 0::2] = torch.sin(angles)
        # Odd embedding dimensions get the cosine. With an odd d_model there
        # is one fewer odd column than even column, so drop the last
        # frequency to keep the shapes aligned (no-op for even d_model).
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer: a tensor that is part of the module's state
        # but is not a parameter, so it is not updated during training
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        '''Arguments:
            x: Tensor, shape: [seq_len, batch_size, embedding_size]

        Returns:
            Tensor of the same shape as ``x``.
        '''
        # Take the first seq_len rows of the table; the size-1 middle axis
        # broadcasts over the batch dimension
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)