In [1]:
import tensorflow as tf 
import numpy as np 

In [2]:
input_embedding = [[
                "salut" , "comment","ca","va" , "?"
]]

output_embedding = [[
                   "<START>" ,  "hi","how","are","you" , "?"
]]


In [3]:
def get_vocab(sentences):
  vocab = {}
  for sentence in sentences:
    for word in sentence :
      if word not in vocab:
        vocab[word] = len(vocab)

  return vocab

In [4]:
input_vocab = get_vocab(input_embedding)
output_vocab =  get_vocab(output_embedding)

In [5]:
input_vocab["<START>"] =  len(input_vocab)
input_vocab["<END>"] =  len(input_vocab)
input_vocab["<PAD>"] =  len(input_vocab)


output_vocab["<START>"] =  len(input_vocab)
output_vocab["<END>"] =  len(input_vocab)
output_vocab["<PAD>"] =  len(input_vocab)




In [6]:
input_vocab

{'<END>': 6,
 '<PAD>': 7,
 '<START>': 5,
 '?': 4,
 'ca': 2,
 'comment': 1,
 'salut': 0,
 'va': 3}

In [7]:
output_vocab

{'<END>': 8,
 '<PAD>': 8,
 '<START>': 8,
 '?': 5,
 'are': 3,
 'hi': 1,
 'how': 2,
 'you': 4}

In [8]:
def sequence_to_str(sequences, vocab):
  for sequence in sequences:
    for s,word in enumerate(sequence):
      sequence[s] = vocab[word]
  return np.array(sequences)

In [9]:
input_sequences =  sequence_to_str(input_embedding , input_vocab)
output_sequences  = sequence_to_str(output_embedding , output_vocab)

In [10]:
output_sequences

array([[8, 1, 2, 3, 4, 5]])

In [11]:
class Embedd(tf.keras.layers.Layer):
  def __init__(self,input_dim,output_dim,):
    super(Embedd , self).__init__() 
    self.emb = tf.keras.layers.Embedding(input_dim , output_dim)
  def call(self ,x):
    x =  self.emb(x)
  
    return x 

In [13]:
class ScaleDotProduct(tf.keras.layers.Layer):
  def __init__(self , units):
    super(ScaleDotProduct , self).__init__()
    self.q_layers =  tf.keras.layers.Dense(units)
    self.k_layers =  tf.keras.layers.Dense(units)
    self.v_layers =  tf.keras.layers.Dense(units)
  
  def call(self ,x):
    Q =  self.q_layers(x)
    K =  self.k_layers(x)
    V = self.v_layers(x)
  
    QK =  tf.matmul(Q,K , transpose_b=True)
    QK = QK / tf.math.sqrt(256.)
    softmax_QK = tf.nn.softmax(QK, axis=-1)
    attention = tf.matmul(softmax_QK, V)
    
    return attention 

In [17]:
#Part for the Multiple Head of Attention 

In [20]:
#Multi head Attention


class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self,units ,dim=256 ,nb_head=8):
    super(MultiHeadAttention , self).__init__() 
    self.head_dim = dim//nb_head
    self.dim =  dim
    self.nb_head =  nb_head
    self.q_layers =  tf.keras.layers.Dense(units)
    self.k_layers =  tf.keras.layers.Dense(units)
    self.v_layers =  tf.keras.layers.Dense(units)
    self.out_proj = tf.keras.layers.Dense(units)
  
  def call(self ,x):
    Q =  self.q_layers(x)
    K =  self.k_layers(x)
    V = self.v_layers(x)


 
    batch_size = tf.shape(Q)[0]
    seq_len = tf.shape(Q)[1] 

    Q = tf.reshape(Q , [batch_size , seq_len , self.nb_head , self.head_dim])
    K = tf.reshape(K , [batch_size , seq_len , self.nb_head , self.head_dim])
    V = tf.reshape(V , [batch_size , seq_len , self.nb_head , self.head_dim])

  

    Q =  tf.transpose(Q , [0 , 2,1,3])
    K =  tf.transpose(K , [0 , 2,1,3])
    V =  tf.transpose(V , [0 , 2,1,3])
    
  
    Q = tf.reshape(Q , [batch_size*self.nb_head , seq_len , self.head_dim])
    K = tf.reshape(K , [batch_size*self.nb_head , seq_len , self.head_dim])
    V = tf.reshape(V , [batch_size*self.nb_head , seq_len , self.head_dim])
 

    QK =  tf.matmul(Q,K , transpose_b=True)
    QK = QK / tf.math.sqrt(256.)
    softmax_QK = tf.nn.softmax(QK, axis=-1)
    attention = tf.matmul(softmax_QK, V)
    attention = tf.reshape(attention , [batch_size , self.nb_head , seq_len , self.head_dim])

    attention = tf.transpose(attention , [0,2,1,3])
    #The Concat 
    attention = tf.reshape(attention , [batch_size  , seq_len , self.nb_head*self.head_dim])

    out_attention = self.out_proj(attention)


    return out_attention

In [21]:
class EncodeLayer(tf.keras.layers.Layer):
  def __init__(self,units):
    super(EncodeLayer , self).__init__()
    self.mult_head_attention =  MultiHeadAttention(units)
    self.norm = tf.keras.layers.BatchNormalization()
    self.dense = tf.keras.layers.Dense(units)
    
  def call(self ,x):
    attention=  self.mult_head_attention(x)
    post_attention = self.norm(x+attention)
    x =  self.dense(post_attention)
    enc_output = self.norm(x +post_attention)
  
    return enc_output

In [22]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self , units , nb_encoder):
    super(Encoder , self).__init__()
    self.nb_encoder =  nb_encoder
    self.units = units
  
  def build(self , input_shape):
    super().build(input_shape)

    self.encoder_layers =[]

    for nb in range(self.nb_encoder):
      self.encoder_layers.append(EncodeLayer(self.units))
 
  def call(self ,x):

    for encoder_layer in self.encoder_layers:
      x = encoder_layer(x)
       
    return x

In [28]:
class MyModel(tf.keras.Model):
  def __init__(self,units , input_dim,output_dim , nb_encoder):
    super(MyModel ,self ).__init__()
    self.emb = Embedd(input_dim , output_dim)
    self.encoder = Encoder(units , nb_encoder)
    
  def call(self,x):
    embedd =  self.emb(x)
    encoder_output = self.encoder(embedd)

    return encoder_output 


In [29]:
model = MyModel(256,5,256,6)

In [30]:
y =model(input_sequences)

In [31]:
y

<tf.Tensor: shape=(1, 5, 256), dtype=float32, numpy=
array([[[-0.45936322, -1.5659549 , -0.48583964, ...,  1.3093379 ,
          0.3229911 ,  0.8611849 ],
        [-0.30107084, -1.8425404 , -1.0650803 , ...,  0.9380076 ,
          0.64954585,  0.71157306],
        [-0.16058953, -1.5436242 , -0.965643  , ...,  1.154702  ,
          0.49532196,  0.68222964],
        [-0.4332395 , -1.6017998 , -0.6330609 , ...,  1.141011  ,
          0.4485056 ,  1.0755699 ],
        [-0.5741957 , -1.2223752 , -1.0310805 , ...,  1.1801189 ,
          1.0313195 ,  0.8484405 ]]], dtype=float32)>

In [27]:
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedd_2 (Embedd)            multiple                  1280      
_________________________________________________________________
encoder (Encoder)            multiple                  1979904   
Total params: 1,981,184
Trainable params: 1,978,112
Non-trainable params: 3,072
_________________________________________________________________
