In [361]:
import tensorflow as tf
import numpy as np

In [371]:
input_enc=[["salut","comment","ca","va","?"]]
output_dec=[["<start>","how","are","you","?"]]

In [372]:
def get_vocabulary(sequences):
    info={}
    for sequence in sequences:
        for word in sequence:
            if word not in info:
                info[word]=len(info)
    return info           

In [374]:
input_voc=get_vocabulary(input_enc)
output_voc=get_vocabulary(output_dec)

In [375]:
input_voc["<END>"]=len(input_voc)
input_voc["<PAD>"]=len(input_voc)
output_voc["<END>"]=len(input_voc)
output_voc["<PAD>"]=len(input_voc)

In [376]:
input_voc
output_voc

{'<start>': 0, 'how': 1, 'are': 2, 'you': 3, '?': 4, '<END>': 7, '<PAD>': 7}

In [377]:
def sequences_to_int(sequences,voc):
    for sequence in sequences:
        for s, word in enumerate(sequence):
            sequence[s]=voc[word]
    return np.array(sequences)        

In [379]:
input_seq=sequences_to_int(input_enc,input_voc)
output_seq=sequences_to_int(output_dec,output_voc)

In [380]:
class EmbeddingLayer(tf.keras.layers.Layer):
    def __init__(self,nb_token,**kwargs):
        self.nb_token=nb_token
        super(**kwargs).__init__()
        
    def build(self,input_shape):
        self.word_embedding=tf.keras.layers.Embedding(
            self.nb_token,256
        )
        super().build(input_shape)

    def call(self,x):
        embed=self.word_embedding(x)
        return embed

class ScaleDotProductAttention(tf.keras.layers.Layer):
    def __init__(self,**kwargs):
        super(**kwargs).__init__()
        
    def build(self,input_shape):
        self.query_layer=tf.keras.layers.Dense(256)
        self.value_layer=tf.keras.layers.Dense(256)
        self.key_layer=tf.keras.layers.Dense(256)
        super().build(input_shape)

    def call(self,x):
        Q=self.query_layer(x)
        K=self.key_layer(x)
        V=self.value_layer(x)
        QK=tf.matmul(Q,K,transpose_b=True)
        QK=QK/tf.math.sqrt(256.)
        print(QK.shape)
        softmax_QK=tf.nn.softmax(QK,axis=-1)
        attention=tf.matmul(softmax_QK,V)
        print(Q.shape,K.shape,V.shape)
        print(softmax_QK.shape)
        print(attention.shape)
        
        
        return attention
def test():
    layer_input=tf.keras.Input(shape=(5,))
    embedding=EmbeddingLayer(nb_token=5)(layer_input)
    attention=ScaleDotProductAttention()(embedding)
    model=tf.keras.Model(layer_input,attention)
    model.summary()
    return model

In [382]:
m_test=test()
m_test(input_seq)

(None, 5, 5)
(None, 5, 256) (None, 5, 256) (None, 5, 256)
(None, 5, 5)
(None, 5, 256)


(1, 5, 5)
(1, 5, 256) (1, 5, 256) (1, 5, 256)
(1, 5, 5)
(1, 5, 256)


<tf.Tensor: shape=(1, 5, 256), dtype=float32, numpy=
array([[[-0.02105662, -0.0090131 , -0.00352302, ..., -0.01409042,
          0.00509213,  0.00772291],
        [-0.02106722, -0.00902023, -0.0035157 , ..., -0.01407184,
          0.0050868 ,  0.0077153 ],
        [-0.02107277, -0.00901308, -0.00350858, ..., -0.01407661,
          0.00508425,  0.00772282],
        [-0.0210785 , -0.0090243 , -0.0035174 , ..., -0.01406507,
          0.00508105,  0.00771324],
        [-0.02107246, -0.00901838, -0.00353265, ..., -0.01408172,
          0.0050823 ,  0.00770827]]], dtype=float32)>

In [383]:
m_test=test()
m_test(output_seq)

(None, 5, 5)
(None, 5, 256) (None, 5, 256) (None, 5, 256)
(None, 5, 5)
(None, 5, 256)


(1, 5, 5)
(1, 5, 256) (1, 5, 256) (1, 5, 256)
(1, 5, 5)
(1, 5, 256)


<tf.Tensor: shape=(1, 5, 256), dtype=float32, numpy=
array([[[-0.01239456,  0.02060347, -0.02956191, ...,  0.02589235,
          0.00893511,  0.00756682],
        [-0.01239261,  0.02060903, -0.02957984, ...,  0.02590458,
          0.00895069,  0.00756892],
        [-0.01237612,  0.02061018, -0.02957685, ...,  0.02591803,
          0.00894982,  0.00756137],
        [-0.01238872,  0.0206048 , -0.02957831, ...,  0.02590983,
          0.00895107,  0.00757344],
        [-0.01240411,  0.02060894, -0.02959112, ...,  0.02591   ,
          0.00895775,  0.00755713]]], dtype=float32)>

In [476]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self,dim=256,nb_head=8,**kwargs):
        self.head_dim=256//8
        self.nb_head=8
        super(**kwargs).__init__()
        
    def build(self,input_shape):
        self.query_layer=tf.keras.layers.Dense(256)
        self.value_layer=tf.keras.layers.Dense(256)
        self.key_layer=tf.keras.layers.Dense(256)
        self.out_proj=tf.keras.layers.Dense(256)
        super().build(input_shape)

    def mask_softmax(self,x,mask):
        
        x_expe=tf.math.exp(x)
        x_expe_masked=x_expe*mask
        x_expe_sum=tf.reduce_sum(x_expe_masked,axis=-1)
        x_expe_sum=tf.expand_dims(x_expe_sum,axis=-1)
        softmax=x_expe_masked/x_expe_sum
        return softmax
        

    def call(self,x,mask=None):
        in_query,in_key,in_value=x
        Q=self.query_layer(in_query)
        K=self.key_layer(in_key)
        V=self.value_layer(in_value)

        batch_size=tf.shape(Q)[0]
        seq_len=tf.shape(Q)[1]
        Q=tf.reshape(Q,[batch_size,seq_len,self.nb_head,self.head_dim])
        K=tf.reshape(K,[batch_size,seq_len,self.nb_head,self.head_dim])
        V=tf.reshape(V,[batch_size,seq_len,self.nb_head,self.head_dim])
        

        Q=tf.transpose(Q,[0,2,1,3])
        K=tf.transpose(K,[0,2,1,3])
        V=tf.transpose(V,[0,2,1,3])
        

        Q=tf.reshape(Q,[batch_size,self.nb_head,seq_len,self.head_dim])
        K=tf.reshape(K,[batch_size,self.nb_head,seq_len,self.head_dim])
        V=tf.reshape(V,[batch_size,self.nb_head,seq_len,self.head_dim])

        QK=tf.matmul(Q,K,transpose_b=True)
        QK=QK/tf.math.sqrt(256.)
        if mask is  None:
           print("mask is nicht none")
           QK=QK*mask
           softmax_QK=self.mask_softmax(QK,mask)
        else:   
            print("maskist none ")
            softmax_QK=tf.nn.softmax(QK,axis=-1)
        attention=tf.matmul(softmax_QK,V)
       
 
        #attention=tf.reshape(attention,[batch_size,self.nb_head,seq_len,self.head_dim])
        attention=tf.transpose(attention,[0,2,1,3])
        attention=tf.reshape(attention,[batch_size,seq_len,self.nb_head*self.head_dim])
        out_attention=self.out_proj(attention)
        return out_attention
        
def test():
    layer_input=tf.keras.Input(shape=(5,))
    embedding=EmbeddingLayer(nb_token=5)(layer_input)
    mask=tf.sequence_mask(tf.range(5)+1,5)
    mask=tf.cast(mask,tf.float32)
    mask=tf.expand_dims(mask,axis=0)
    print(mask)
    multi_attention=MultiHeadAttention()((embedding,embedding,embedding),mask=mask)
    model=tf.keras.Model(layer_input,multi_attention)
    model.summary()
    return model                

In [477]:
m_test=test()
m_test(output_seq)

tf.Tensor(
[[[1. 0. 0. 0. 0.]
  [1. 1. 0. 0. 0.]
  [1. 1. 1. 0. 0.]
  [1. 1. 1. 1. 0.]
  [1. 1. 1. 1. 1.]]], shape=(1, 5, 5), dtype=float32)
maskist none 


maskist none 


<tf.Tensor: shape=(1, 5, 256), dtype=float32, numpy=
array([[[ 0.01119382, -0.00286951,  0.0020979 , ...,  0.00541455,
          0.00075622, -0.00443292],
        [ 0.01119975, -0.00286778,  0.00209857, ...,  0.00540536,
          0.00075633, -0.00442867],
        [ 0.01119804, -0.0028688 ,  0.00209862, ...,  0.00541878,
          0.00075786, -0.00442984],
        [ 0.01120211, -0.00287401,  0.00209684, ...,  0.0054167 ,
          0.00075027, -0.00442868],
        [ 0.01119722, -0.00287137,  0.00209653, ...,  0.00540912,
          0.00075458, -0.00443327]]], dtype=float32)>

In [478]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self,**kwargs):
        
        super(**kwargs).__init__()
        
    def build(self,input_shape):
        self.Multi_Head_Attention=MultiHeadAttention()
        self.norm=tf.keras.layers.LayerNormalization()
        self.dense_out=tf.keras.layers.Dense(256)
        super().build(input_shape)

    def call(self,x):
        Attention=self.Multi_Head_Attention((x,x,x),mask=None)
        post_attention=self.norm(Attention+x)
        x=self.dense_out(post_attention)
        enc_outpout=self.norm(x)
        return enc_outpout
def test():
    layer_input=tf.keras.Input(shape=(5,))
    embedding=EmbeddingLayer(nb_token=5)(layer_input)
    encoder_output=EncoderLayer()(embedding)
    model=tf.keras.Model(layer_input,encoder_output)
    model.summary()
    return model  

In [479]:
m_test=test()
m_test(input_seq)

maskist none 


maskist none 


<tf.Tensor: shape=(1, 5, 256), dtype=float32, numpy=
array([[[-0.9167078 ,  0.6062617 , -0.40561503, ..., -0.05410853,
          0.18980093, -0.61908853],
        [-0.7759391 , -0.1575514 ,  0.7830168 , ...,  0.29896405,
         -0.7507426 , -0.13852909],
        [-0.7289455 , -1.1257945 , -1.8043578 , ...,  1.4883896 ,
          0.63597053, -0.80819106],
        [-0.6418523 ,  1.2017398 , -0.3853525 , ...,  1.7340305 ,
         -0.29089862,  1.6064694 ],
        [-0.64185834, -0.37207463,  1.1540389 , ..., -0.9990005 ,
         -2.979611  ,  0.697902  ]]], dtype=float32)>

In [480]:

class Encoder(tf.keras.layers.Layer):
    def __init__(self,nb_encoder,**kwargs):
        self.nb_encoder=nb_encoder
        super(**kwargs).__init__()
        
    def build(self,input_shape):
        self.encoder_layer=[]
        for nb in range(self.nb_encoder):
            self.encoder_layer.append(EncoderLayer())
        super().build(input_shape)
    def call(self,x):
        for encoder in self.encoder_layer:
            x=encoder(x)
        
        return x
def test():
    layer_input=tf.keras.Input(shape=(5,))
    embedding=EmbeddingLayer(nb_token=5)(layer_input)
    encoder_output=Encoder(nb_encoder=6)(embedding)
    model=tf.keras.Model(layer_input,encoder_output)
    model.summary()
    return model  

In [481]:
m_test=test()
m_test(output_seq)

maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 


maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 


<tf.Tensor: shape=(1, 5, 256), dtype=float32, numpy=
array([[[ 0.82232475,  0.07334855,  1.2059599 , ...,  1.6438099 ,
         -1.3942479 ,  0.5748099 ],
        [ 0.7830262 ,  0.250861  ,  1.329698  , ...,  0.7860988 ,
         -1.2463847 ,  0.85204244],
        [ 0.40288574,  0.0809675 ,  0.9678191 , ...,  1.0446928 ,
         -1.138141  ,  0.8597219 ],
        [ 0.5745945 , -0.4108683 ,  1.1179868 , ...,  1.304013  ,
         -1.5553342 ,  0.32353407],
        [ 0.4951214 , -0.2518857 ,  1.1717849 , ...,  1.3070445 ,
         -1.408482  ,  0.4200267 ]]], dtype=float32)>

In [563]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self,**kwargs):
        
        super(**kwargs).__init__()
        
    def build(self,input_shape):
        self.Multi_Head_Attention=MultiHeadAttention()
        self.Multi_Head_croise_Attention=MultiHeadAttention()
        self.norm=tf.keras.layers.LayerNormalization()
        self.proj_output=tf.keras.layers.Dense(256)
        
        super().build(input_shape)

    def call(self,x):
        decoder_embedding,encoder_output,mask=x
        self_attention=self.Multi_Head_Attention((decoder_embedding,decoder_embedding,decoder_embedding),mask)
        post_self_attention=self.norm(decoder_embedding+self_attention)
        croise_attention=self.Multi_Head_croise_Attention((post_self_attention,encoder_output,encoder_output))
        proj_out=self.proj_output(croise_attention)
        dec_output=self.norm(proj_out+croise_attention)
        return dec_output
def test():
    layer_input=tf.keras.Input(shape=(5,))
    embedding=EmbeddingLayer(nb_token=5)(layer_input)
    encoder_output=EncoderLayer()(embedding)
    model=tf.keras.Model(layer_input,encoder_output)
    model.summary()
    return model 

In [574]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,nb_decoder,**kwargs):
        self.nb_decoder=nb_decoder
        super(**kwargs).__init__()
        
    def build(self,input_shape):
        self.decoder_layer=[]
        for nb in range(self.nb_decoder):
            self.decoder_layer.append(DecoderLayer())
        super().build(input_shape)
    def call(self,x):
        decoder_embedding,encoder_output,mask=x
        dec_output=encoder_output
        for decoder in self.decoder_layer:
            dec_output= DecoderLayer()((decoder_embedding,dec_output,mask))
        
        return dec_output
def test(output_voc):
    input_token=tf.keras.Input(shape=(5,))
    output_token=tf.keras.Input(shape=(5,))
    
    encoder_pos=EmbeddingLayer(nb_token=5)(tf.range(5))
    decoder_pos=EmbeddingLayer(nb_token=5)(tf.range(5))
    
    encoder_embedding=EmbeddingLayer(nb_token=5)(input_token)
    decoder_embedding=EmbeddingLayer(nb_token=5)(output_token)

    encoder_embedding=encoder_embedding+encoder_pos
    decoder_embedding=encoder_embedding+decoder_pos
    
    encoder_output=Encoder(nb_encoder=6)(encoder_embedding)
    
    mask=tf.sequence_mask(tf.range(5)+1,5)
    mask=tf.cast(mask,tf.float32)
    mask=tf.expand_dims(mask,axis=0)
    
    decoder_output=Decoder(nb_decoder=5)((decoder_embedding,encoder_output,mask))
    
    out=tf.keras.layers.Dense(len(output_voc))( decoder_output)
    model=tf.keras.Model([input_token,output_token],encoder_output)
    model.summary()
    return model

In [575]:
input_seq

<tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[0., 1., 2., 3., 4.]], dtype=float32)>

In [576]:
input_seq = tf.convert_to_tensor(input_seq, dtype=tf.float32)
output_seq = tf.convert_to_tensor(output_seq, dtype=tf.float32)
m_test

<Functional name=functional_101, built=True>

In [577]:
m_test=test(output_voc)


maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 
maskist none 


In [None]:
m_test((input_embedding,output_embedding))

In [568]:
out=tf.keras.layers.Dense(8)(dec_output)

0