In [3]:
from tokenize import tokenize
from torch import nn
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


## Multi Head Attention

In [100]:
class OneHeadSelfAttentionQKV(nn.Module):
    """Single scaled dot-product attention head working in a reduced dimension.

    Inputs are first projected from ``k`` to ``low_dim`` features by one shared
    bias-free linear layer, then mapped to queries, keys and values by three
    further bias-free linear layers.

    Args:
        k: Number of features of the incoming tensors.
        low_dim: Number of features used inside the head (and of its output).
    """

    def __init__(self, k, low_dim):
        super().__init__()
        self.k = k
        self.low_dim = low_dim
        # 1. Shared projection reducing the input dimensionality (weights only, no bias).
        self.to_reduce_dim = nn.Linear(k, low_dim, bias=False)
        # 2. Projections producing queries, keys and values (weights only, no bias).
        self.to_queries = nn.Linear(low_dim, low_dim, bias=False)
        self.to_keys = nn.Linear(low_dim, low_dim, bias=False)
        self.to_values = nn.Linear(low_dim, low_dim, bias=False)

    def forward(self, Q, K, V, mask=None):
        """Attend from ``Q`` over ``K`` / ``V``.

        Args:
            Q: Tensor of shape (batch, query_len, k).
            K: Tensor of shape (batch, key_len, k).
            V: Tensor of shape (batch, key_len, k).
            mask: Optional tensor broadcastable to (batch, query_len, key_len).
                Positions where it is False (zero) are excluded from attention.

        Returns:
            Tensor of shape (batch, query_len, low_dim).
        """
        # 3-4. Reduce the dimensionality, then build query / key / value.
        query = self.to_queries(self.to_reduce_dim(Q))
        key = self.to_keys(self.to_reduce_dim(K))
        value = self.to_values(self.to_reduce_dim(V))

        # 5. Raw similarities q_i . k_j, scaled by sqrt(low_dim).
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(self.low_dim)

        # 5.a Hide masked positions. The fill value is the most negative finite
        # number of the score dtype, so it stays representable in reduced
        # precision and a fully masked row still yields finite weights.
        if mask is not None:
            scores = scores.masked_fill(mask.logical_not(), torch.finfo(scores.dtype).min)

        # 6. Softmax over the key dimension.
        weights = torch.softmax(scores, dim=2)

        # 7. Weighted sum of the values.
        return torch.bmm(weights, value)
    

class MultiHeadSelfAttentionQKV(nn.Module):
    """Multi-head attention built from independent single-head modules.

    Each head maps the ``k`` input features to ``k // heads`` features; the
    head outputs are concatenated back to ``k`` features and passed through a
    final linear layer.

    Args:
        k: Number of input (and output) features.
        heads: Number of attention heads; must divide ``k``.

    Raises:
        AssertionError: If ``k`` is not divisible by ``heads``.
    """

    # 8. Define a head number that divides the input dimension.
    def __init__(self, k, heads=4):
        super().__init__()
        # Every head works on k // heads features, so k must split evenly.
        assert k % heads == 0, "k must be divisible by heads"

        self.k = k
        self.heads = heads

        # 9. Instantiate one attention module per head.
        # nn.ModuleList (not a plain Python list) registers each head as a
        # submodule, so its weights show up in .parameters() / .state_dict()
        # and follow .to(device), .train() and .eval().
        self.list_heads = nn.ModuleList(
            [OneHeadSelfAttentionQKV(k, k // heads) for _ in range(heads)]
        )

        # Applied after the heads have been concatenated.
        self.unifyheads = nn.Linear(k, k)

    def forward(self, Q, K, V, mask=None):
        """Run every head on (Q, K, V, mask) and merge the results.

        Returns:
            Tensor with the batch and sequence sizes of the head outputs and
            ``k`` features.
        """
        # 10. Collect the output of every head.
        head_outputs = [one_head(Q, K, V, mask) for one_head in self.list_heads]

        # 11. Concatenate the heads along the feature dimension.
        concatenated = torch.cat(head_outputs, dim=2)

        # 12. Final linear transformation.
        return self.unifyheads(concatenated)


In [101]:
# Smoke test: a random batch (12 sequences, 100 positions, 256 features)
# attending to itself under a causal mask.
X = torch.rand(12, 100, 256)
# diagonal=0 keeps the main diagonal and everything below it, so position i
# only sees positions <= i. (diagonal=1 would also expose position i + 1.)
mask = torch.tril(torch.ones((100, 100)), diagonal=0).bool()
M = MultiHeadSelfAttentionQKV(256, 4)
M(X, X, X, mask=mask).size()

torch.Size([12, 100, 256])

## Embeddings

In [102]:
class Embedding(nn.Module):
    """Token-embedding lookup wrapping a single ``nn.Embedding`` table."""

    def __init__(self, vocab_size, dimension):
        super().__init__()
        # One learnable vector of size `dimension` per vocabulary entry.
        self.word_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=dimension,
        )

    def forward(self, x):
        """Return the embedding vectors selected by the indices in ``x``."""
        embedded = self.word_embedding(x)
        return embedded

In [103]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch of embeddings.

    The table follows the standard formulation

        PE[pos, 2j]     = sin(pos / 10000 ** (2j / dimension))
        PE[pos, 2j + 1] = cos(pos / 10000 ** (2j / dimension))

    so each sin/cos pair shares one frequency. It is computed once with tensor
    operations and stored as a non-trainable buffer.

    Args:
        dimension: Number of features of the embeddings.
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, dimension, max_seq_length=2000):
        super().__init__()

        # Computed in float64 for accuracy, then cast to the default dtype.
        positions = torch.arange(max_seq_length, dtype=torch.float64).unsqueeze(1)
        # 2j / dimension for every (sin, cos) pair j.
        pair_exponents = torch.arange(0, dimension, 2, dtype=torch.float64) / dimension
        inverse_frequencies = torch.exp(-math.log(10000.0) * pair_exponents)
        angles = positions * inverse_frequencies  # (max_seq_length, ceil(dimension / 2))

        positional_encoding = torch.zeros(max_seq_length, dimension, dtype=torch.float64)
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd `dimension` there is one cosine column fewer than sine columns.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : dimension // 2])

        self.register_buffer(
            'positional_encoding',
            positional_encoding.to(torch.get_default_dtype()),
        )

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``.

        ``x`` is indexed as (batch, sequence, dimension); its sequence length
        must not exceed ``max_seq_length``.
        """
        return x + self.positional_encoding[:x.size(1), :]

## Feed Forward

In [104]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    The hidden layer has ``factor * embed_dim`` units and the output has
    ``embed_dim`` units again.
    """

    def __init__(self, embed_dim, factor=2):
        super().__init__()
        hidden_dim = factor * embed_dim
        expand = nn.Linear(embed_dim, hidden_dim)
        activation = nn.ReLU()
        project = nn.Linear(hidden_dim, embed_dim)
        self.feed_forward = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        transformed = self.feed_forward(x)
        return transformed

<img src="../images/encoder.png" alt="drawing" width="200"/>

In [151]:
class TransformerEncodingBloc(nn.Module):
    """One encoder block: multi-head attention followed by a feed-forward network.

    Each sub-layer is followed by a residual connection, a LayerNorm and a
    dropout, in that order.

    NOTE(review): dropout is applied after each LayerNorm, so in training mode
    the block output itself contains zeroed entries. Confirm this ordering is
    intended (the reference Transformer drops the sub-layer output before the
    residual addition).

    Args:
        embedding_dim: Number of features of the inputs and outputs.
        num_heads: Number of attention heads.
        factor: Hidden-size multiplier of the feed-forward network.
        dropout: Dropout probability used after both normalisations.
    """

    def __init__(self, embedding_dim, num_heads, factor, dropout=0.2):
        super().__init__()
        # Multi-head attention with its normalisation and its dropout.
        self.attention = MultiHeadSelfAttentionQKV(embedding_dim, num_heads)
        self.normalization_mha = nn.LayerNorm(embedding_dim)
        self.dropout_mha = nn.Dropout(dropout)

        # Feed-forward network with its normalisation and its dropout.
        self.feed_forward = FeedForward(embedding_dim, factor)
        self.normalization_ff = nn.LayerNorm(embedding_dim)
        self.dropout_ff = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """Apply the block.

        Args:
            query, key, value: Attention inputs with ``embedding_dim`` features.
            mask: Optional attention mask forwarded to the attention module.

        Returns:
            Tensor with the same shape as ``query``.
        """
        # Multi-head attention. The attention output has one row per query
        # position, so the residual connection must add `query`.
        mha = self.attention(query, key, value, mask)
        mha_residuals = mha + query
        mha_residuals_norm = self.normalization_mha(mha_residuals)
        mha_residuals_norm_dropout = self.dropout_mha(mha_residuals_norm)

        # Feed forward.
        ff = self.feed_forward(mha_residuals_norm_dropout)
        ff_residuals = ff + mha_residuals_norm_dropout
        ff_residuals_norm = self.normalization_ff(ff_residuals)
        ff_residuals_norm_dropout = self.dropout_ff(ff_residuals_norm)

        return ff_residuals_norm_dropout


In [152]:
class Encoder(nn.Module):
    """Stack of encoder blocks on top of token and positional embeddings.

    Args:
        vocab_size: Number of rows of the token-embedding table.
        embedding_dim: Number of features carried through the stack.
        num_heads: Number of attention heads per block.
        num_layers: Number of stacked blocks.
        factor: Hidden-size multiplier handed to every block.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, factor):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.positional_embedding = PositionalEmbedding(embedding_dim)

        blocs = []
        for _ in range(num_layers):
            blocs.append(TransformerEncodingBloc(embedding_dim, num_heads, factor))
        self.transformer_layers = nn.ModuleList(blocs)

    def forward(self, X):
        """Embed ``X``, add positional information and run every block in order."""
        hidden = self.positional_embedding(self.embedding(X))
        # Self-attention: each block receives the same tensor as query, key and value.
        for bloc in self.transformer_layers:
            hidden = bloc(hidden, hidden, hidden)

        return hidden
        

In [153]:
# Smoke test: run the encoder on a random batch of token ids.
# X holds 10 sequences of 100 integer ids drawn from [0, 100).
X = torch.randint(low=0, high=100, size=(10, 100))

encoder = Encoder(vocab_size=5000, 
                  embedding_dim=256, 
                  num_heads=4,
                  # dropout=0.2 cannot be passed here: Encoder.__init__ has no dropout parameter.
                  num_layers=5,
                  factor=2
                  )

# Y is the output of the last encoder block for every position of X.
Y = encoder(X)

In [156]:
# Load the pretrained "bert-base-cased" tokenizer through transformers' AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Small batch of English sentences of different lengths.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
    "In a cold and gray Chicago morning, a poor little baby child is born in the ghetto",
    "I'll be there for you and the rain starts to pour",
    "I like big butts and I cannot lie",
]

# padding=True pads the batch to a common length; return_tensors="pt" returns PyTorch tensors.
encoded_input = tokenizer(batch_sentences, padding=True, return_tensors="pt")


# Shape of the token-id tensor.
encoded_input.input_ids.size()

torch.Size([6, 21])

In [155]:
# Rebuild the encoder with an embedding table sized to the tokenizer vocabulary.
encoder = Encoder(vocab_size=tokenizer.vocab_size, 
                  embedding_dim=256, 
                  num_heads=4,
                  num_layers=5,
                  factor=2
                  )

# Encode the tokenized sentences; the whole output tensor is displayed.
# The module is in its default training mode here, so dropout is active.
encoder(encoded_input.input_ids)

tensor([[[-6.3442e-02, -1.0470e-01, -1.9869e-01,  ...,  4.2391e-01,
          -2.6027e-01, -6.0122e-01],
         [ 5.6368e-04, -3.5754e-01,  1.3343e+00,  ..., -4.8053e-01,
           4.5416e-01, -6.9387e-02],
         [ 2.6932e-01,  2.6601e-01, -0.0000e+00,  ...,  1.2859e+00,
           0.0000e+00, -0.0000e+00],
         ...,
         [-1.6117e-01, -5.7511e-01, -0.0000e+00,  ...,  0.0000e+00,
           2.6810e-01, -3.5211e-01],
         [-2.9852e-02,  4.5394e-03, -1.4956e-01,  ...,  4.5028e-02,
          -1.5938e-01, -9.3726e-01],
         [ 0.0000e+00, -3.4367e-01, -1.9233e-01,  ...,  7.3635e-02,
          -4.2149e-01, -2.2788e-01]],

        [[ 1.9344e-02, -4.1531e-01, -1.2358e-02,  ...,  0.0000e+00,
           4.0751e+00, -1.0246e-01],
         [-0.0000e+00, -1.7516e-01, -0.0000e+00,  ...,  3.1647e-02,
          -4.1913e+00, -0.0000e+00],
         [ 4.6181e-02, -5.6651e+00, -2.8903e-01,  ...,  3.3248e-01,
          -2.3646e-01,  0.0000e+00],
         ...,
         [ 1.3179e+00, -5

<img src="../images/decoder.png" alt="drawing" width="200"/>

In [177]:
class TransformerDecodingBloc(nn.Module):
    """One decoder block: self-attention, cross-attention, then feed-forward.

    Each of the three sub-layers is followed by a residual connection, a
    LayerNorm and a dropout, in that order.

    Args:
        embedding_dim: Number of features of the inputs and outputs.
        num_heads: Number of attention heads of both attention modules.
        factor: Hidden-size multiplier of the feed-forward network.
        dropout: Dropout probability used after every normalisation.
    """

    def __init__(self, embedding_dim, num_heads, factor, dropout=0.2):
        super().__init__()
        # Multi-head self-attention with its normalisation and its dropout.
        self.self_attention = MultiHeadSelfAttentionQKV(embedding_dim, num_heads)
        self.normalization_mhsa = nn.LayerNorm(embedding_dim)
        self.dropout_mhsa = nn.Dropout(dropout)

        # Multi-head cross-attention with its normalisation and its dropout.
        self.cross_attention = MultiHeadSelfAttentionQKV(embedding_dim, num_heads)
        self.normalization_mhca = nn.LayerNorm(embedding_dim)
        self.dropout_mhca = nn.Dropout(dropout)

        # Feed-forward network with its normalisation and its dropout.
        self.feed_forward = FeedForward(embedding_dim, factor)
        self.normalization_ff = nn.LayerNorm(embedding_dim)
        self.dropout_ff = nn.Dropout(dropout)

    def forward(self, x, encoder, mask=None):
        """Apply the block.

        Args:
            x: Decoder-side input with ``embedding_dim`` features.
            encoder: Tensor used as key and value of the cross-attention
                (the encoder output in this notebook).
            mask: Optional mask applied to the self-attention only.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Multi-head self-attention.
        mhsa = self.self_attention(x, x, x, mask)
        mhsa_residuals = mhsa + x
        mhsa_residuals_norm = self.normalization_mhsa(mhsa_residuals)
        mhsa_residuals_norm_dropout = self.dropout_mhsa(mhsa_residuals_norm)

        # Multi-head cross-attention: the query comes from the self-attention
        # branch, the keys and values come from `encoder`. No mask is applied.
        query = mhsa_residuals_norm_dropout
        mhca = self.cross_attention(query, encoder, encoder)
        mhca_residuals = mhca + query
        mhca_residuals_norm = self.normalization_mhca(mhca_residuals)
        mhca_residuals_norm_dropout = self.dropout_mhca(mhca_residuals_norm)

        # Feed forward.
        ff = self.feed_forward(mhca_residuals_norm_dropout)
        ff_residuals = ff + mhca_residuals_norm_dropout
        ff_residuals_norm = self.normalization_ff(ff_residuals)
        ff_residuals_norm_dropout = self.dropout_ff(ff_residuals_norm)

        return ff_residuals_norm_dropout


In [178]:
class Decoder(nn.Module):
    """Stack of decoder blocks ending in a softmax over the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the output distribution.
        embedding_dim: Number of features carried through the stack.
        num_heads: Number of attention heads per block.
        num_layers: Number of stacked blocks.
        factor: Hidden-size multiplier handed to every block.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, factor=1):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.positional_embedding = PositionalEmbedding(embedding_dim)

        blocs = []
        for _ in range(num_layers):
            blocs.append(TransformerDecodingBloc(embedding_dim, num_heads, factor))
        self.transformer_layers = nn.ModuleList(blocs)

        # Projects the final features onto one score per vocabulary entry.
        self.linear_out = nn.Linear(embedding_dim, vocab_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, encoder_out, mask=None):
        """Return, for every position of ``x``, probabilities over the vocabulary.

        ``encoder_out`` and ``mask`` are handed unchanged to every block.
        """
        hidden = self.dropout(self.positional_embedding(self.embedding(x)))
        for bloc in self.transformer_layers:
            hidden = bloc(hidden, encoder_out, mask)

        scores = self.linear_out(hidden)

        # Normalise over the last (vocabulary) dimension.
        return nn.functional.softmax(scores, dim=2)
        

In [179]:
class Transformer(nn.Module):
    """Encoder-decoder model assembled from ``Encoder`` and ``Decoder``.

    Args:
        input_vocab_size: Vocabulary size handed to the encoder.
        output_vocab_size: Vocabulary size handed to the decoder.
        embedding_dim: Number of features used by both stacks.
        num_heads: Number of attention heads per block.
        num_layers: Number of blocks in each stack.
        factor: Hidden-size multiplier of the feed-forward networks.
    """

    def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, num_heads, num_layers, factor=2):
        super().__init__()

        self.encoder = Encoder(input_vocab_size, embedding_dim, num_heads, num_layers, factor)
        self.decoder = Decoder(output_vocab_size, embedding_dim, num_heads, num_layers, factor)

    def get_mask_output(self, output):
        """Build the causal mask for a 2-D ``output`` tensor (batch, length).

        Returns:
            Boolean (length, length) tensor on the device of ``output`` where
            entry [i, j] is True iff j <= i, i.e. position i may only attend
            to itself and to earlier positions.
        """
        _, output_len = output.size()
        ones = torch.ones((output_len, output_len), device=output.device)
        # diagonal=0 keeps the main diagonal and below; diagonal=1 would also
        # expose position i + 1 to position i.
        return torch.tril(ones, diagonal=0).bool()

    def forward(self, input, output):
        """Encode ``input`` and decode ``output`` under a causal mask.

        Returns whatever the decoder returns for (output, encoder output, mask).
        """
        encoder = self.encoder(input)
        mask = self.get_mask_output(output)
        return self.decoder(output, encoder, mask=mask)


In [180]:
# Source-side sentences (English).
batch_input = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
    "In a cold and gray Chicago morning, a poor little baby child is born in the ghetto",
    "I'll be there for you and the rain starts to pour",
    "I like big butts and I cannot lie",
]

# Target-side sentences (French), one per source sentence.
batch_output = [
    "Pourquoi pas",
    "Je ne pense pas",
    "Je ne sais pas",
    "J'ai passé la bague à Chikita, deux mois après je l'ai déjà quitté",
    "H et Kaamelot sont les séries françaises les plus marrantes",
    "j'entends des bruits sur mon téléphone sans fil",
]

# Both sides load the same "bert-base-cased" tokenizer.
tokenizer_input = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer_output = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize each side separately; padding=True pads every batch to a common length.
encoded_input = tokenizer_input(batch_input, padding=True, return_tensors="pt")
print(encoded_input.input_ids.size())

encoded_output = tokenizer_output(batch_output, padding=True, return_tensors="pt")
print(encoded_output.input_ids.size())


torch.Size([6, 21])
torch.Size([6, 36])


In [181]:
# Vocabulary size of the output tokenizer; used as output_vocab_size when the Transformer is built.
tokenizer_output.vocab_size

28996

In [182]:
# Build the full encoder-decoder model; both vocabularies come from the tokenizers loaded above.
transformer = Transformer(input_vocab_size=tokenizer_input.vocab_size,
                          output_vocab_size=tokenizer_output.vocab_size,
                          embedding_dim=256, 
                          num_heads=4, 
                          num_layers=5, 
                          factor=2
                          )

In [183]:
# Forward pass on the tokenized batch: source ids go to the encoder, target ids to the decoder.
# NOTE(review): the target ids are fed to the decoder unshifted, which is fine for a shape
# check; confirm they are shifted before this is used for training.
a = transformer(encoded_input.input_ids, encoded_output.input_ids)

In [189]:
# Shape of the target token ids, for comparison with the model output below.
encoded_output.input_ids.size()

torch.Size([6, 36])

In [190]:
# Shape of the model output: one softmax distribution over the output vocabulary per target position.
a.size()

torch.Size([6, 36, 28996])