Modelling

In [2]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd


################################# COMPONENTS ######################################


class MultiheadAttention(tf.keras.layers.Layer):

  def __init__(self,d_model,d_vector,heads):
    super(MultiheadAttention, self).__init__()

    self.d_model=d_model
    self.d_vector=d_vector
    self.heads=heads

    self.Wk = tf.keras.layers.Dense(d_vector) ### => (Batch_size x sequence_length x d_vector)

    ### Theory states for every input sequence of dimension (sequence_length x d_model) (Xi),
    ### we multiply by a weight Wk, Wq, Wv to obtain Key, Query and Value vectors of dimension (sequence_length x vector_dimension), where the weights are trainable parameters
    ### For batch learning the shape becomes (batch_size x sequence_length x d_model)

    ### In a dense layer, there is a kernel which is essentially a weight matrix (Wk), multiplied by the input, to create the feature vector.
    ### normally, we enter 2D shapes to a dense layer : (batch_size x input_vector_length) which multiplied by kernel (input_vector_length x feature_vector_length(no. of nodes -> 64 here))
    ### becomes (batch_size x feature vector length)

    ### But the input here is Xi of 3D dimension of (batch_size x sequence_length x d_model) => In case, we send data with rank more than 2 to a dense layer
    ### it flattens the data -> ((batch_size * sequence_length) x d_model) -> performs the multiplication to create -> ((batch_size * sequence_length) x vector_dimension)
    ### And then reconverts it into 3D -> (batch_size x sequence_length x vector_dimension) => So, we get the vectors.

    self.Wq = tf.keras.layers.Dense(d_vector)
    self.Wv = tf.keras.layers.Dense(d_vector)

    self.Wo = tf.keras.layers.Dense(d_model)

    ### The final_layer



  def dot_product_attention(self, Query, Key, Value, Mask: None):
    # Mask only for decoder

    KQ_dot_product = tf.matmul(Key, Query, transpose_b = True) # K.Q

    # K:(Sequence_length x vector_dimension)
    # Q:(Sequence_length x vector_dimension) ## For each input for each head for each input in batch

    ### K and Q: (Batch_size x head x Sequence_Length x Vector_dimension)
    ### Q after transpose: (Batch_size x head x Vector_dimension x Sequence Length)

    ### Note: For any N-Dimensional matrix multiplication the last 2 dimensions are switched and operations are there. So,

    ### For 4D matrices : last index of the first matrix should match the second last index of the second matrix.

    ### K.Q => (Batch_size x head x Sequence_Length x Sequence Length) => For an element of a batch for each head -> attention of all the other words of the sequence for each word of the sequence

    Normalized_dot_product = KQ_dot_product/ np.sqrt(vector_dimensions) ## Normalized: K.Q / sqrt(dk)
    ### Normalized_dot_product => (Batch_size x head x Sequence_Length x Sequence_Length)

    if Mask is not None:
      Normalized_dot_product = Normalized_dot_product + (1. - Mask) *(-1e9) ## For decoder is mask is there, we make it 0.
      ### Mask dimension -> (Batch_size x head x Sequence_Length x Sequence_Length)

    Attention_weights = tf.nn.softmax(Normalized_dot_product, axis = -1)
    ### Attention weights -> (Batch_size x head x Sequence_Length x Sequence_Length) -> the last axis contains the importance of all the other words on the word
    ### Softmax sums everything upto 1


    Attention_layer_output = tf.matmul(Attention_weights, Value)
    ### V: (Batch_size x Sequence_Length x Vector_dimension)

    ### Multiplication: (Batch_size x head x Sequence_Length x Sequence_Length) x (Batch_size x Sequence_Length x Vector_dimension)
    ### => The last axis of first matrix and second-last axis of the second matrix (As explained above) => Resultant (Batch_size x head x Sequence_Length x Vector_dimension)

    ### For 2D =>  (Sequence_Length x Sequence_Length) x (Sequence_Length x Vector_dimension) => (Sequence_Length x Vector_dimension)
    ### Result => The final Value vector for each word in the sequence

    ### Attention_layer_output => (Batch_size x head x Sequence_Length x Vector_dimension)
    ### Above result for the full batch for each head

    return Attention_layer_output

  def project_to_heads(self, Query, Key, Value):

    ## Query, value, key Shape: (Batch size x sequence length x vector_dimension)

    ## Projecting the above tensors for each head -> We need the shape -> (Batch_size x Head x sequence length x vector_dimension)

    ## We can only add an extra dimension on the last axis, as, dimension adding actually projects the key, query and value vectors in the added axis
    ## in such a way that all the components adds up to give the initial query, key and value vector. so, reshaping to add heads to all the vectors

    Query = tf.reshape(Query, shape=(Query.shape[0], Query.shape[1], No_of_heads, -1))
    Key = tf.reshape(Key, shape=(Key.shape[0], Key.shape[1], No_of_heads, -1))
    Value = tf.reshape(Value, shape=(Value.shape[0], Value.shape[1], No_of_heads, -1))

    ## Obtained shape => (Batch_size x sequence_length x head x vector_dimension)

    ### As each of them are different projections on different planes of the head axis, all the vectors are different.

    ### Transposing to required shape

    Query = tf.transpose(Query, perm=[0,2,1,3])
    Key = tf.transpose(Key, perm=[0,2,1,3])
    Value = tf.transpose(Value, perm=[0,2,1,3])

    ## Obtained shape => (Batch_size x head x sequence_length x vector_dimension)

    return Query, Key, Value


  def call(self,Query_input,Key_input,Value_input,Mask):

      ### We need Xi to create Ki,Qi and Vi so, why we are taking 3 different values here?
      ### For encoder, it is true, each of the 3 input is Xi only, but for decoder, there are 3 cases,
      ### where,
      ### 1. Query and key inputs come from encoder feedback and value comes from the pervious decoder layer and so on. So, the value inputs can be different.


      ## Finding all the vectors Query, key and value vectors: (Batch_size x sequence_length x vector_dimension)
      Q = self.Wq(Query_input)
      K = self.Wk(Key_input)
      V = self.Wv(Value_input)

      ### Projecting to create values for all the heads.
      Q_projected,K_projected,V_projected = self.project_to_heads(Q,K,V);

      ### Applying attention

      Attention_output = self.dot_product_attention(Q_projected, K_projected, V_projected, Mask)
      ### (Batch_size x head x sequence_length x vector_dimension)

      ### Now, we have weighted value vectors from all the heads, so, need to merge all of them first,
      ### So, we will do reverse of what we used to seperate them, for all heads -> we will add up all the projections.

      ### First transpose:
      Attention_out_transposed = tf.transpose(Attention_output, perm=[0,2,1,3])
      ### Dimension: (Batch_size x sequence_length x head x vector_dimension) -> We have all the projected vectors on the penultimate axis -> now we merge
      Attention_out_reshaped = tf.reshape(Attention_out_transposed, shape=(Attention_out_transposed.shape[0],Attention_out_transposed.shape[1],-1))
      ### Dimension: (Batch_size x sequence_length x vector_dimension)


      ### We have to produce output for the layer as: (Batch_size x sequence_length x d_model) as that's the output for every layer, and learn the attention from all heads,

      ## So, we use another learnable weight:

      Output = self.Wo(Attention_out_reshaped)

      ### Shape:  (Batch_size x sequence_length x d_model) as we have already explained and Wo is initialized with d_model.

      return Output



class FeedForwardLayer(tf.keras.layers.Layer):

  def __init__(self,d_model,d_ff):
    super(FeedForwardLayer, self).__init__()

    self.d_model = d_model
    self.d_ff = d_ff
    self.layer_1=tf.keras.layers.Dense(d_ff, activation= "relu")
    self.layer_2=tf.keras.layers.Dense(d_model)


  def call(self, input):

    out_1 = self.layer_1(input)
    out_2 = self.layer_2(out_1)

    return out_2

################################ ENCODER #####################################

class Encoder_module(tf.keras.layers.Layer):

  def __init__(self, d_model, d_ff, d_vector, heads):
    super(Encoder_module, self).__init__()

    self.attention_layer = MultiheadAttention(d_model,d_vector,heads)
    self.feed_forward_layer = FeedForwardLayer(d_model,d_ff)
    self.addition_layer = tf.keras.layers.Add()
    self.normalization_layer = tf.keras.layers.BatchNormalization()
    self.dropout = tf.keras.layers.Dropout(0.1)

    ## Dropout only applied during training.


  def call(self, input, padding_mask, training = False):

    ## For encoder all 3 heads of the attention recieves the same input.
    ### Padding mask is required as the sequence length should be same, padded 0's will be there, and can get considered in loss function

    out_attention = self.attention_layer(input,input,input, padding_mask)
    ## No mask in encoder

    add_attention = self.addition_layer([input, out_attention])
    norm_attention = self.normalization_layer(add_attention)

    attention_dropout = self.dropout(norm_attention, training = training)

    out_ff_layer = self.feed_forward_layer(attention_dropout)

    add_ff_layer = self.addition_layer([attention_dropout, out_ff_layer])
    norm_ff_layer = self.normalization_layer(add_ff_layer)

    ff_dropout = self.dropout(norm_ff_layer, training = training)

    return ff_dropout


class Encoder(tf.keras.layers.Layer):

  def __init__(self, d_model, d_ff, d_vector, heads, No_of_encoders):
    super(Encoder, self).__init__()

    self.encoders = [Encoder_module(d_model,d_ff,d_vector, heads) for i in range(No_of_encoders)]

  def call(self, input, mask = None, training = False):

    x = input
    for encoder in self.encoders:
      x = encoder(x,mask,training = training)

    return x

#################################################################################

################################# DECODER #######################################



class Decoder_module(tf.keras.layers.Layer):

  def __init__(self, d_model, d_ff, d_vector, heads):
    super(Decoder_module, self).__init__()

    self.attention_layer_input_from_previous_decoder_layer = MultiheadAttention(d_model,d_vector,heads)
    self.attention_layer_input_from_encoder_output = MultiheadAttention(d_model,d_vector,heads)

    ### In decoder module we have 2 attention layer: 1 where all query, key and value comes from the last decoder layer, and attention attends to the last
    ### decoder layer only.
    ### The other layer, where query and key comes from encoder output and value from decoder, and attention attends accordingly.

    self.feed_forward_layer = FeedForwardLayer(d_model,d_ff)
    self.addition_layer = tf.keras.layers.Add()
    self.normalization_layer = tf.keras.layers.BatchNormalization()
    self.dropout = tf.keras.layers.Dropout(0.1)


  def call(self, input, encoder_output, padding_mask, look_ahead_mask, training = False):

    ### In decoders masking will be used for each self attention,

    ### In decoders two types of masks are used:
    ### 1. The first attention layer accepts the input sentence itself while training as input to the decoder, there we need to add look-ahead mask, so
    ### the model does not attend to the succeeding word tokens

    ### 2. The second attention layer accepts input from the encoder output which has 0 padding, so padding mask is used there to remove the 0s from contributing
    ### to the loss function

    out_attention_1 = self.attention_layer_input_from_previous_decoder_layer(input,input,input,look_ahead_mask)

    add_attention = self.addition_layer([input, out_attention_1])
    norm_attention = self.normalization_layer(add_attention)

    attention_dropout = self.dropout(norm_attention, training = training)

    out_attention_2 = self.attention_layer_input_from_encoder_output(input,encoder_output,encoder_output,padding_mask)

    add_attention_2 = self.addition_layer([attention_dropout, out_attention_2])
    norm_attention_2 = self.normalization_layer(add_attention_2)

    attention_dropout_2 = self.dropout(norm_attention_2, training = training)

    out_ff_layer = self.feed_forward_layer(attention_dropout_2)

    add_ff_layer = self.addition_layer([attention_dropout_2, out_ff_layer])
    norm_ff_layer = self.normalization_layer(add_ff_layer)

    ff_dropout = self.dropout(norm_ff_layer, training = training)

    return ff_dropout


class Decoder(tf.keras.layers.Layer):

  def __init__(self, d_model, d_ff, d_vector, heads, No_of_decoders):
    super(Decoder, self).__init__()

    self.decoders = [Decoder_module(d_model,d_ff,d_vector, heads) for i in range(No_of_decoders)]

  def call(self, input, encoder_output, padding_mask, look_ahead_mask, training = False):

    x = input
    for decoder in self.decoders:
      x = decoder(x,encoder_output,padding_mask,look_ahead_mask, training)

    return x


###########################################################################################################

################################## POSITIONAL ENCODING ########################################

class Positional_Encoding():

  def __init__(self, d, sequence_length):

    self.d = d
    self.sequence_length = sequence_length
    self.n = 10000 ## User defined variable -> 10000 decided by the authors


  def get_positional_embeddings(self):

    Positional_embedding_matrix = np.zeros(shape=(self.sequence_length, self.d))

    ### d = d_model => postion_embedding gets added to the read embedding of the word, so size of the matrices should be same.
    ### So, a d_model length vector for all the elements in the sequence.

    for k in range(self.sequence_length):   #### k represents the postion in the sequence -> varies from [0,1,2.......... sequence_length-1]
      for i in range(int(self.d/2)):  ### Projects the embedding value of k to dimension d based on the equations.
        Positional_embedding_matrix[k,2*i] = np.sin(k/(np.power(self.n,2*i/self.d)))
        Positional_embedding_matrix[k,2*i+1] = np.cos(k/(np.power(self.n,2*i/self.d)))

    return Positional_embedding_matrix

#############################################################################################################


###################################### FULL TRANSFORMER ################################################

class Transformer(tf.keras.Model):

  def __init__(self, d_model, d_ff, d_vector, heads, No_of_encoders, No_of_decoders, sequence_length, vocab_size):
    super(Transformer, self).__init__()

    self.d_model = d_model
    self.d_ff = d_ff
    self.d_vector = d_vector
    self.heads = heads
    self.No_of_decoders = No_of_decoders
    self.No_of_encoders = No_of_encoders
    self.sequence_length = sequence_length
    self.encoder = Encoder(d_model,d_ff,d_vector, heads, No_of_encoders)
    self.decoder = Decoder(d_model,d_ff,d_vector, heads, No_of_decoders)
    self.position_encoding_vector = Positional_Encoding(d_model, sequence_length)
    self.embedding_layer = tf.keras.layers.Embedding(vocab_size, d_model)
    self.dropout = tf.keras.layers.Dropout(0.1)
    self.dense = tf.keras.layers.Dense(vocab_size, activation = "softmax")

    #### Transforming the d_model vector to vocab_size vector, so that we get the predicted word using softmax


    ### Embedding layer takes the input vector with size l and provides a random unique vector of a defined length, which serves as the unique representation
    ### of the given word.


  def call(self, input, output, look_ahead_mask = None, encoder_padding_mask = None, decoder_padding_mask = None, training = False):

    embedding_input = self.embedding_layer(input)

    position_embeddings_for_batch = np.array([self.position_encoding_vector.get_positional_embeddings() for _ in range(input.shape[0])])

    embedding_with_positional_encoding_input = embedding_input + tf.cast(position_embeddings_for_batch,dtype="float32")

    embedding_output = self.embedding_layer(output)

    embedding_with_positional_encoding_output = embedding_output + tf.cast(position_embeddings_for_batch,dtype="float32")

    encoder_output = self.encoder(embedding_with_positional_encoding_input, encoder_padding_mask, training)

    encoder_output_dropout = self.dropout(encoder_output, training = training)

    decoder_output = self.decoder(embedding_with_positional_encoding_output, encoder_output_dropout, decoder_padding_mask, look_ahead_mask, training)

    decoder_output_dropout = self.dropout(decoder_output, training = training)

    prediction = self.dense(decoder_output_dropout)

    return prediction



In [3]:
### Paramater assignments

d_model = 256;   # Each word embedding size (1 x 256)
No_of_encoders = 4;
No_of_decoders = 4;

d_ff = 512 # No of units in the internal Feed forward network layer

vector_dimensions = 64 # Q,V and K are of dimension (1 x 64) => Wq, Wv and Wk dimension (256 x 64)
No_of_heads = 4 # Multi head attention

sequence_length = 30

vocab_size = 100

batch_size = 64

transformer = Transformer(d_model,d_ff, vector_dimensions, No_of_heads, No_of_encoders, No_of_decoders, sequence_length, vocab_size)

In [4]:
random_input = np.random.rand(batch_size, sequence_length)
random_output = np.random.rand(batch_size, sequence_length)

pred = transformer(random_input, random_output)

transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  1319680   
                                                                 
 decoder (Decoder)           multiple                  1583616   
                                                                 
 embedding (Embedding)       multiple                  25600     
                                                                 
 dropout_8 (Dropout)         multiple                  0         
                                                                 
 dense_64 (Dense)            multiple                  25700     
                                                                 
Total params: 2,954,596
Trainable params: 2,950,500
Non-trainable params: 4,096
_________________________________________________________________


In [5]:
### Creating masks:

def create_look_ahead_mask(sequence_length):
    mask = tf.linalg.band_part(tf.ones((1, sequence_length, sequence_length)), -1, 0)
    return mask

"""
array([[[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]]],

"""

#### The mask creates a upper traingular matrix, which is substracted from 1 to reverse the 1's to 0's and 0's to -1's which are then added to make the dot product values
### miniumum.

def create_padding_mask(input):
    padding_mask = 1 - np.equal(input, 0)
    return tf.cast(padding_mask, dtype = "float32")

### If input is 0 the mask has value 0.