In [34]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import sys, os

## preparing data

In [19]:
# df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-fr.csv'), engine='python')
# df.iloc[:1000].to_csv(os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-fr-small.csv'))

In [25]:
df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-fr-small.csv'), engine='python', index_col = 0)

In [26]:
df

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformÃƒÂ© notre vie | Il a transformÃ...
1,Site map,Plan du site
2,Feedback,RÃƒÂ©troaction
3,Credits,CrÃƒÂ©dits
4,FranÃƒÂ§ais,English
...,...,...
995,A nova is a star that absorbs matter from a ne...,La matiÃƒÂ¨re absorbÃƒÂ©e finit par rÃƒÂ©chauf...
996,It is a rare and spectacular event.,Il s'agit d'un phÃƒÂ©nomÃƒÂ¨ne rare et plutÃƒÂ...
997,"In 1977, he became the French editor of the Na...","En 1977, il devient l'ÃƒÂ©diteur francophone d..."
998,"In 1978, Lemay began the daunting task of asse...","En 1978, il entreprend la tÃƒÂ¢che colossale d..."


In [40]:
class PrepareDataset:
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.n_sentences = 1000  # Number of sentences to include in the dataset
        self.train_split = 0.9  # Ratio of the training data split

    # Fit a tokenizer
    def create_tokenizer(self, dataset):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(dataset)

        return tokenizer

    def find_seq_length(self, dataset):
        return max(len(seq.split()) for seq in dataset)

    def find_vocab_size(self, tokenizer, dataset):
        tokenizer.fit_on_texts(dataset)

        return len(tokenizer.word_index) + 1

    def __call__(self, filename = 'en-fr-small.csv', **kwargs):
        # Load a clean dataset
        clean_dataset = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'data', filename), engine='python', index_col = 0).values

        # Reduce dataset size
        dataset = clean_dataset[:self.n_sentences, :]

        # Include start and end of string tokens
        for i in range(dataset[:, 0].size):
            dataset[i, 0] = "<START> " + dataset[i, 0] + " <END>"
            dataset[i, 1] = "<START> " + dataset[i, 1] + " <END>"

        # Random shuffle the dataset
        np.random.shuffle(dataset)

        # Split the dataset
        train = dataset[:int(self.n_sentences * self.train_split)]

        # Prepare tokenizer for the encoder input
        enc_tokenizer = self.create_tokenizer(train[:, 0])
        enc_seq_length = self.find_seq_length(train[:, 0])
        enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])

        # Encode and pad the input sequences
        trainX = enc_tokenizer.texts_to_sequences(train[:, 0])
        trainX = pad_sequences(trainX, maxlen=enc_seq_length, padding='post')
        # trainX = convert_to_tensor(trainX, dtype=int64)
        trainX = tf.constant(trainX)

        # Prepare tokenizer for the decoder input
        dec_tokenizer = self.create_tokenizer(train[:, 1])
        dec_seq_length = self.find_seq_length(train[:, 1])
        dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])

        # Encode and pad the input sequences
        trainY = dec_tokenizer.texts_to_sequences(train[:, 1])
        trainY = pad_sequences(trainY, maxlen=dec_seq_length, padding='post')
        # trainY = convert_to_tensor(trainY, dtype=int64)
        trainY = tf.constant(trainY)

        return dataset, trainX, trainY, train, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size, enc_tokenizer, dec_tokenizer

In [41]:
prep_data = PrepareDataset()
dataset, trainX, trainY, train, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size, enc_tokenizer, dec_tokenizer = prep_data()

In [60]:
trainX[0]

<tf.Tensor: shape=(61,), dtype=int32, numpy=
array([  3,   5, 335,  15, 285,   1,  10,  37,   1,  18,   5,  19,   7,
       397,  28, 220, 398,   2,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])>

In [45]:
trainY[0]

<tf.Tensor: shape=(63,), dtype=int32, numpy=
array([  2,   4, 332,  27, 384,  16,  13,   7,  36,  15,  21,   6, 818,
        51, 385, 333,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])>

In [43]:
dataset[0]

array(['<START> In 1991, for example, the observatory became the first in Canada to use an infrared camera. <END>',
       "<START> En 1991, par exemple, l'observatoire est le premier au Canada ÃƒÂ\xa0 disposer d'une camÃƒÂ©ra infrarouge. <END>"],
      dtype=object)

In [57]:
enc_tokenizer.sequences_to_texts(trainX[tf.newaxis, 0].numpy())

['start in 1991 for example the observatory became the first in canada to use an infrared camera end']

In [59]:
dec_tokenizer.sequences_to_texts(trainY[tf.newaxis, 0].numpy())

["start en 1991 par exemple l'observatoire est le premier au canada ãƒâ\xa0 disposer d'une camãƒâ©ra infrarouge end"]

## step by step calculations for transformer

In [33]:
inputs = tf.constant(np.random.rand(2, 2)) # (bs, in_seq_len)
target = tf.constant(np.random.rand(2, 3)) # (bs, tg_seq_len)
encoder_mask = tf.cast(tf.math.not_equal(inputs, 0), tf.float32) # (bs, in_seq_len)
decoder_mask = tf.linalg.band_part(tf.ones((tf.shape(target)[0], tf.shape(target)[1])), -1, 0) # (bs, tg_seq_len)
encoder_decoder_mask = tf.cast(tf.math.not_equal(inputs, 0), tf.float32) # (bs, tg_seq_len)
print(encoder_decoder_mask.shape)

encoder_mask = encoder_mask[:, tf.newaxis, tf.newaxis, :]
encoder_decoder_mask = encoder_decoder_mask[:, tf.newaxis, tf.newaxis, :]
decoder_mask = decoder_mask[:, tf.newaxis, tf.newaxis, :]

tfm = Transformer(
    num_blocks = 5,
    d_model = 20,
    num_heads = 5,
    hidden_dim = 30,
    source_vocab_size = 40,
    target_vocab_size = 40,
    max_input_len = inputs.shape[1],
    max_target_len = target.shape[1]
)

enc = tfm.encoder
dec = tfm.decoder

(2, 2)


In [34]:
training = True
# inputs.shape = (batch_size, seq_len)
seq_len = tf.shape(inputs)[1]
inputs = enc.embedding(inputs) 
inputs *= tf.math.sqrt(tf.cast(enc.d_model, tf.float32))
inputs += enc.pos_encoding[:, :seq_len, :]
inputs = enc.dropout(inputs, training=training) # inputs.shape = (batch_size, seq_len, embed_dim)

for block in enc.blocks:
    # mha_output, attn_weights = self.mha(inputs, inputs, inputs, mask)
    query = key = value = inputs
    mask = encoder_mask
    # query, key and value have shape (batch_size, inp_seq_len, embed_dim)
    # mask has shape (batch_size, 1, 1, inp_seq_len)
    # if mask has shape (batch_size, inp_seq_len) we can change it to proper shape by writing
    # mask = mask[:, tf.newaxis, tf.newaxis, :]

    query = block.mha.wq(query) # qs, ks and vs have shape (batch_size, inp_seq_len, d_model)
    key = block.mha.wk(key)
    value = block.mha.wv(value)
    
    query = block.mha.split_heads(query) # qs, ks and vs have shape (batch_size, num_heads, inp_seq_len, d_head)
    key = block.mha.split_heads(key)
    value = block.mha.split_heads(value)

    # output, attn_weights = self.scaled_dot_product_attention(query, key, value, mask)
    # query, key and value have shape (batch_size, num_heads, inp_seq_len, d_head)
    # mask has shape (batch_size, 1, 1, inp_seq_len)
    key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
    # scaled_scores has shape (batch_size, num_heads, inp_seq_len, inp_seq_len)
    scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)

    if mask is not None:
        scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)

    softmax = tf.keras.layers.Softmax()
    weights = softmax(scaled_scores) 
    # output of scaled_dot_product_attention
    output = tf.matmul(weights, value)
    
    output = block.mha.merge_heads(output) # output.shape = (batch_size, inp_seq_len, d_model)
    output = block.mha.dense(output) # output.shape = (batch_size, inp_seq_len, d_model)
    mha_output = output
    
    mha_output = block.dropout1(mha_output, training=training)
    mha_output = block.layernorm1(inputs + mha_output)

    ffn_output = block.ffn(mha_output)
    ffn_output = block.dropout2(ffn_output, training=training)
    output = block.layernorm2(mha_output + ffn_output)

    inputs = output
    
encoder_output = inputs

In [35]:
scaled_scores.shape

TensorShape([2, 5, 2, 2])

In [29]:
encoder_output.shape

TensorShape([2, 2, 20])

In [36]:
# encoder_output.shape = (batch_size, inp_seq_len, embed_dim)
# target.shape = (batch_size, targ_seq_len)
# decoder_mask.shape = encoder_decoder_mask.shape = (batch_size, 1, 1, targ_seq_len)
seq_len = tf.shape(target)[1]
target = dec.embedding(target)
target *= tf.math.sqrt(tf.cast(dec.d_model, tf.float32))
target += dec.pos_encoding[:, :seq_len, :]
target = dec.dropout(target, training=training) # .shape = (batch_size, targ_seq_len, embed_dim)

for block in dec.blocks:
    # encoder_output.shape = (batch_size, targ_seq_len, embed_dim)
    # target.shape = (batch_size, tar_seq_len, embed_dim)
    # decoder_mask.shape = encoder_decoder_mask.shape = (batch_size, 1, 1, tar_seq_len)
    # encoder_decoder_mask.shape = encoder_decoder_mask.shape = (batch_size, 1, 1, targ_seq_len)

    # mha_output1, attn_weights = block.mha1(target, target, target, decoder_mask)
    query = key = value = target
    mask = decoder_mask
    # query, key and value have shape (batch_size, targ_seq_len, embed_dim)
    # mask has shape (batch_size, 1, 1, targ_seq_len)

    query = block.mha1.wq(query) # qs, ks and vs have shape (batch_size, targ_seq_len, d_model)
    key = block.mha1.wk(key)
    value = block.mha1.wv(value)
    
    query = block.mha1.split_heads(query) # qs, ks and vs have shape (batch_size, num_heads, targ_seq_len, d_head)
    key = block.mha1.split_heads(key)
    value = block.mha1.split_heads(value)

    # output, attn_weights = self.scaled_dot_product_attention(query, key, value, mask)
    # query, key and value have shape (batch_size, num_heads, targ_seq_len, d_head)
    # mask has shape (batch_size, 1, 1, targ_seq_len)
    key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
    # scaled_scores has shape (batch_size, num_heads, targ_seq_len, targ_seq_len)
    scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)

    if mask is not None:
        scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)

    softmax = tf.keras.layers.Softmax()
    weights = softmax(scaled_scores) 
    # output of scaled_dot_product_attention
    output = tf.matmul(weights, value)
    
    output = block.mha1.merge_heads(output) # output.shape = (batch_size, targ_seq_len, d_model)
    output = block.mha1.dense(output) # output.shape = (batch_size, targ_seq_len, d_model)
    mha_output1 = output
    
    mha_output1 = block.dropout1(mha_output1, training=training)
    mha_output1 = block.layernorm1(mha_output1 + target) # mha_output1.shape = (batch_size, tar_seq_len, embed_dim)

    # mha_output2, attn_weights = block.mha2(mha_output1, encoder_output, encoder_output, encoder_decoder_mask)
    query = mha_output1
    key = value = encoder_output
    mask = encoder_decoder_mask
    # query, key and value have shape (batch_size, targ_seq_len, embed_dim)
    # mask has shape (batch_size, 1, 1, targ_seq_len)

    query = block.mha2.wq(query) # qs, ks and vs have shape (batch_size, targ_seq_len, d_model)
    key = block.mha2.wk(key)
    value = block.mha2.wv(value)
    
    query = block.mha2.split_heads(query) # qs, ks and vs have shape (batch_size, num_heads, targ_seq_len, d_head)
    key = block.mha2.split_heads(key)
    value = block.mha2.split_heads(value)

    # output, attn_weights = self.scaled_dot_product_attention(query, key, value, mask)
    # query, key and value have shape (batch_size, num_heads, targ_seq_len, d_head)
    # mask has shape (batch_size, 1, 1, targ_seq_len)
    key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
    # scaled_scores has shape (batch_size, num_heads, targ_seq_len, targ_seq_len)
    scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)

    if mask is not None:
        scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)

    softmax = tf.keras.layers.Softmax()
    weights = softmax(scaled_scores) 
    # output of scaled_dot_product_attention
    output = tf.matmul(weights, value)
    
    output = block.mha2.merge_heads(output) # output.shape = (batch_size, targ_seq_len, d_model)
    output = block.mha2.dense(output) # output.shape = (batch_size, targ_seq_len, d_model)
    mha_output2 = output
    
    mha_output2 = block.dropout2(mha_output2, training=training)
    mha_output2 = block.layernorm2(mha_output2 + mha_output1) # mha_output2.shape = (batch_size, tar_seq_len, embed_dim)

    ffn_output = block.ffn(mha_output2)
    ffn_output = block.dropout3(ffn_output, training=training)
    output = block.layernorm3(ffn_output + mha_output2) # output.shape = (batch_size, seq_len, embed_dim)

    target = output

decoder_output = target

In [37]:
decoder_output.shape

TensorShape([2, 3, 20])

In [38]:
transformer_output = tfm.output_layer(decoder_output)

In [39]:
transformer_output.shape

TensorShape([2, 3, 40])

## multi head attention

In [2]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        assert(d_model % num_heads == 0)

        self.d_head = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Linear layer to generate the final output.
        self.dense = tf.keras.layers.Dense(d_model)
  
    def split_heads(self, x):
        batch_size = x.shape[0]

        split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_head))
        return tf.transpose(split_inputs, perm=[0, 2, 1, 3])
  
    def merge_heads(self, x):
        batch_size = x.shape[0]

        merged_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
        return tf.reshape(merged_inputs, (batch_size, -1, self.d_model))

    def call(self, query, key, value, mask = None):
        # query, key and value have shape (batch_size, seq_len, embed_dim)
        # mask has shape (batch_size, 1, 1, seq_len)
        # if mask has shape (batch_size, seq_len) we can change it to proper shape by writing
        # mask = mask[:, tf.newaxis, tf.newaxis, :]
        
        qs = self.wq(query) # qs, ks and vs have shape (batch_size, seq_len, d_model)
        ks = self.wk(key)
        vs = self.wv(value)

        qs = self.split_heads(qs) # qs, ks and vs have shape (batch_size, num_heads, seq_len, d_head)
        ks = self.split_heads(ks)
        vs = self.split_heads(vs)

        output, attn_weights = self.scaled_dot_product_attention(qs, ks, vs, mask)
        output = self.merge_heads(output) # output.shape = (batch_size, seq_len, d_model)
        output = self.dense(output) # output.shape = (batch_size, seq_len, d_model)
        
        return output, attn_weights
    
    def scaled_dot_product_attention(self, query, key, value, mask=None):
        # query, key and value have shape (batch_size, num_heads, seq_len, d_head)
        # mask has shape (batch_size, 1, 1, seq_len)
        
        key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
        # scaled_scores has shape (batch_size, num_heads, seq_len, seq_len)
        scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)

        if mask is not None:
            scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)

        softmax = tf.keras.layers.Softmax()
        weights = softmax(scaled_scores) 
        output = tf.matmul(weights, value)
        return output, weights

In [25]:
x = tf.constant([[1,2,0], [1,0,0]])
query = key = value = x[:,:,tf.newaxis]
mask = tf.cast(tf.math.not_equal(x, 0), tf.float32) # mask.shape = (batch_size, seq_len)
mask = mask[:, tf.newaxis, tf.newaxis, :]
mha = MultiHeadAttention(20, 5)

In [26]:
output, _ = mha(query, key, value, mask)

In [45]:
output.shape

TensorShape([2, 3, 20])

## encoder

In [3]:
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = self.feed_forward_network(d_model, hidden_dim)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
  
    def call(self, inputs, training, mask = None):
        # x.shape = (batch_size, seq_len, embed_dim)
        mha_output, attn_weights = self.mha(inputs, inputs, inputs, mask)
        mha_output = self.dropout1(mha_output, training=training)
        mha_output = self.layernorm1(inputs + mha_output)

        ffn_output = self.ffn(mha_output)
        ffn_output = self.dropout2(ffn_output, training=training)
        output = self.layernorm2(mha_output + ffn_output)

        return output, attn_weights
    
    def feed_forward_network(self, d_model, hidden_dim):
        return tf.keras.Sequential([
          tf.keras.layers.Dense(hidden_dim, activation='relu'),
          tf.keras.layers.Dense(d_model)
        ])

In [4]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_blocks, d_model, num_heads, hidden_dim, src_vocab_size,
               max_seq_len, dropout_rate = 0.1):
        super().__init__()

        self.d_model = d_model
        self.max_seq_len = max_seq_len

        self.embedding = tf.keras.layers.Embedding(src_vocab_size, d_model)
        self.pos_encoding = self.positional_encoding(max_seq_len, d_model)

        # The original Attention Is All You Need paper applied dropout to the
        # input before feeding it to the first encoder block.
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        # Create encoder blocks.
        self.blocks = [
            EncoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate)
            for _ in range(num_blocks)
        ]
  
    def call(self, inputs, training, mask = None):
        # inputs.shape = (batch_size, seq_len)
        seq_len = tf.shape(inputs)[1]
        inputs = self.embedding(inputs) 
        inputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        inputs += self.pos_encoding[:, :seq_len, :]
        inputs = self.dropout(inputs, training=training) # inputs.shape = (batch_size, seq_len, embed_dim)

        for block in self.blocks:
            inputs, weights = block(inputs, training, mask = mask)

        return inputs, weights
    
    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis],
                                     np.arange(d_model)[np.newaxis, :],
                                     d_model)
        sines = np.sin(angle_rads[:, 0::2])
        cosines = np.cos(angle_rads[:, 1::2])

        pos_encoding = np.concatenate([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def get_angles(self, positions, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return positions * angle_rates

In [44]:
# x represents 2 sentences
x = tf.constant([[1, 2, 0], [1, 0, 0]]) # x.shape = (batch_size, seq_len)
mask = tf.cast(tf.math.not_equal(x, 0), tf.float32) # mask.shape = (batch_size, seq_len)
mask = mask[:, tf.newaxis, tf.newaxis, :]

encoder = Encoder(
    num_blocks = 2, 
    d_model = 20, 
    num_heads = 5, 
    hidden_dim = 30, 
    src_vocab_size = 10,
    max_seq_len = 3
)
output, weights = encoder(x, False, mask)

In [45]:
output.shape

TensorShape([2, 3, 20])

## decoder

In [5]:
class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
        super().__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = self.feed_forward_network(d_model, hidden_dim)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
        self.layernorm3 = tf.keras.layers.LayerNormalization()
  
    # Note the decoder block takes two masks. One for the first MHA, another
    # for the second MHA.
    def call(self, encoder_output, target, training, decoder_mask, encoder_decoder_mask):
        # encoder_output.shape = (batch_size, inp_seq_len, embed_dim)
        # target.shape = (batch_size, tar_seq_len, embed_dim)
        # decoder_mask.shape = encoder_decoder_mask.shape = (batch_size, 1, 1, tar_seq_len)
        # encoder_decoder_mask.shape = encoder_decoder_mask.shape = (batch_size, 1, 1, inp_seq_len)
        
        mha_output1, attn_weights = self.mha1(target, target, target, decoder_mask)
        mha_output1 = self.dropout1(mha_output1, training=training)
        mha_output1 = self.layernorm1(mha_output1 + target) # mha_output1.shape = (batch_size, tar_seq_len, embed_dim)

        mha_output2, attn_weights = self.mha2(mha_output1, encoder_output, encoder_output, encoder_decoder_mask)
        mha_output2 = self.dropout2(mha_output2, training=training)
        mha_output2 = self.layernorm2(mha_output2 + mha_output1) # mha_output2.shape = (batch_size, tar_seq_len, embed_dim)

        ffn_output = self.ffn(mha_output2)
        ffn_output = self.dropout3(ffn_output, training=training)
        output = self.layernorm3(ffn_output + mha_output2) # output.shape = (batch_size, seq_len, embed_dim)

        return output, attn_weights
    
    def feed_forward_network(self, d_model, hidden_dim):
        return tf.keras.Sequential([
          tf.keras.layers.Dense(hidden_dim, activation='relu'),
          tf.keras.layers.Dense(d_model)
        ])

In [6]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                   max_targ_seq_len, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = self.positional_encoding(max_targ_seq_len, d_model)

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.blocks = [DecoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) for _ in range(num_blocks)]
    
    def call(self, encoder_output, target, training, decoder_mask = None, encoder_decoder_mask = None):
        # encoder_output.shape = (batch_size, inp_seq_len, embed_dim)
        # target.shape = (batch_size, targ_seq_len)
        # decoder_mask.shape = encoder_decoder_mask.shape = (batch_size, 1, 1, targ_seq_len)
        seq_len = tf.shape(target)[1]
        target = self.embedding(target)
        target *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        target += self.pos_encoding[:, :seq_len, :]
        target = self.dropout(target, training=training) # .shape = (batch_size, targ_seq_len, embed_dim)

        for block in self.blocks:
            target, weights = block(encoder_output, target, training, decoder_mask, encoder_decoder_mask)

        return target, weights # x.shape = (batch_size, targ_seq_len, embed_dim)
    
    def positional_encoding(self, max_length, d_model):
        position = tf.range(max_length, dtype=tf.float32)[:, tf.newaxis]
        div_term = tf.pow(10000, (2 * tf.range(d_model // 2, dtype=tf.float32)) / d_model)
        div_term = div_term[tf.newaxis, :]

        # Compute the sine and cosine components
        angles = tf.matmul(position, div_term)
        pos_enc = tf.concat([tf.sin(angles), tf.cos(angles)], axis=-1)
        
        pos_enc = pos_enc[tf.newaxis, :]

        return pos_enc

## transformer

In [7]:
class Transformer(tf.keras.Model):
    def __init__(self, num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
                   target_vocab_size, max_input_len, max_target_len, dropout_rate=0.1):
        super().__init__()

        self.encoder = Encoder(num_blocks, d_model, num_heads, hidden_dim, source_vocab_size, 
                               max_input_len, dropout_rate)

        self.decoder = Decoder(num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                               max_target_len, dropout_rate)

        # The final dense layer to generate logits from the decoder output.
        self.output_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, targets, training, encoder_mask = None,
               decoder_mask = None, encoder_decoder_mask = None):
        encoder_output, encoder_attn_weights = self.encoder(inputs, 
                                                            training, encoder_mask)

        decoder_output, decoder_attn_weights = self.decoder(encoder_output, 
                                                            targets, training,
                                                            decoder_mask, encoder_decoder_mask)
        
        output = self.output_layer(decoder_output) # output.shape = (batch_size, seq_len, target_vocab_size)
        
        return output, encoder_attn_weights, decoder_attn_weights

In [45]:
inputs = tf.constant(np.random.rand(2, 2))
target = tf.constant(np.random.rand(2, 3))
encoder_mask = tf.cast(tf.math.not_equal(inputs, 0), tf.float32)
decoder_mask = tf.linalg.band_part(tf.ones((tf.shape(target)[0], tf.shape(target)[1])), -1, 0)
encoder_decoder_mask = tf.cast(tf.math.not_equal(inputs, 0), tf.float32)

encoder_mask = encoder_mask[:, tf.newaxis, tf.newaxis, :]
encoder_decoder_mask = encoder_decoder_mask[:, tf.newaxis, tf.newaxis, :]
decoder_mask = decoder_mask[:, tf.newaxis, tf.newaxis, :]

transformer = Transformer(
    num_blocks = 5,
    d_model = 20,
    num_heads = 5,
    hidden_dim = 30,
    source_vocab_size = 40,
    target_vocab_size = 40,
    max_input_len = inputs.shape[1],
    max_target_len = target.shape[1]
)

output, _, _ = transformer(inputs, target, True, encoder_mask, decoder_mask, encoder_decoder_mask)

In [46]:
output.shape

TensorShape([2, 3, 40])

## training

In [None]:
predictions = tf.constant([[[0., 1.], [0., 1.]], [[0., 1.], [1., 0.]]])
targets = tf.constant([[1,1], [1,0]])
loss_function(targets, predictions)

In [None]:
predictions = tf.constant([[0., 1.], [1., 0.]])
targets = tf.constant([1, 0])
loss_function(targets, predictions)

In [67]:
def train_step(inputs, targets, transformer, optimizer, loss_function, train_loss, train_accuracy):
    # Initialize the mask variables
    encoder_mask, encoder_decoder_mask, decoder_mask = create_masks(inputs, targets)
    # encoder_mask = decoder_mask = encoder_decoder_mask = None
  
    with tf.GradientTape() as tape:
        # Pass the inputs through the transformer
        # last word in targets is <end> token
        predictions, _, _ = transformer(inputs, targets[:, :-1], True, encoder_mask, decoder_mask, encoder_decoder_mask)
      
        # Calculate the loss
        # first word in targets is the <start> token
        loss = loss_function(targets[:, 1:], predictions)
    
    # Apply the gradients
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
    # Update the loss and accuracy
    train_loss(loss)
    train_accuracy(targets[:, 1:], predictions)

# Function to create masks
def create_masks(inputs, targets):
    encoder_mask = tf.cast(tf.math.not_equal(inputs, 0), tf.float32)
    decoder_mask = tf.linalg.band_part(tf.ones((tf.shape(targets[:, :-1])[0], tf.shape(targets[:, :-1])[1])), -1, 0)
    encoder_decoder_mask = tf.cast(tf.math.not_equal(inputs, 0), tf.float32)
    
    encoder_mask = encoder_mask[:, tf.newaxis, tf.newaxis, :]
    encoder_decoder_mask = encoder_decoder_mask[:, tf.newaxis, tf.newaxis, :]
    decoder_mask = decoder_mask[:, tf.newaxis, tf.newaxis, :]
    # all masks have shape = (batch_size, 1, 1, seq_len)
  
    return encoder_mask, encoder_decoder_mask, decoder_mask

# Initialize the Transformer model, optimizer, and loss function
transformer = Transformer(
    num_blocks = 5,
    d_model = 20,
    num_heads = 5,
    hidden_dim = 30,
    source_vocab_size = enc_vocab_size,
    target_vocab_size = dec_vocab_size,
    max_input_len = enc_seq_length,
    max_target_len = dec_seq_length
)

optimizer = tf.keras.optimizers.Adam()
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Initialize the metrics
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

num_epochs = 10
batch_size = 20

# Training loop
for epoch in range(num_epochs):
    train_loss.reset_states()
    train_accuracy.reset_states()
  
    # Iterate over the training dataset
    for batch_number in range(len(trainX) // batch_size + 1):
        inputs = trainX[batch_number * batch_size : (batch_number + 1) * batch_size]
        targets = trainY[batch_number * batch_size : (batch_number + 1) * batch_size]
        train_step(inputs, targets, transformer, optimizer, loss_function, train_loss, train_accuracy)
      
        # Print training progress every few batches
        if batch_number % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch_number} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

Epoch 1 Batch 0 Loss 8.1697 Accuracy 0.0000
Epoch 1 Batch 0 Loss 7.8966 Accuracy 0.3818
Epoch 1 Batch 0 Loss 7.7377 Accuracy 0.5058
Epoch 1 Batch 0 Loss 7.5915 Accuracy 0.5483
Epoch 1 Batch 0 Loss 7.4422 Accuracy 0.5709
Epoch 2 Batch 0 Loss 6.6324 Accuracy 0.6347
Epoch 2 Batch 0 Loss 6.4889 Accuracy 0.6353
Epoch 2 Batch 0 Loss 6.3185 Accuracy 0.6386
Epoch 2 Batch 0 Loss 6.1489 Accuracy 0.6382
Epoch 2 Batch 0 Loss 5.9737 Accuracy 0.6389
Epoch 3 Batch 0 Loss 5.0490 Accuracy 0.6347
Epoch 3 Batch 0 Loss 4.9094 Accuracy 0.6353
Epoch 3 Batch 0 Loss 4.7314 Accuracy 0.6386
Epoch 3 Batch 0 Loss 4.5672 Accuracy 0.6382
Epoch 3 Batch 0 Loss 4.4019 Accuracy 0.6389
Epoch 4 Batch 0 Loss 3.5771 Accuracy 0.6347
Epoch 4 Batch 0 Loss 3.4898 Accuracy 0.6353
Epoch 4 Batch 0 Loss 3.3693 Accuracy 0.6386
Epoch 4 Batch 0 Loss 3.2916 Accuracy 0.6382
Epoch 4 Batch 0 Loss 3.2270 Accuracy 0.6389
Epoch 5 Batch 0 Loss 2.9886 Accuracy 0.6347
Epoch 5 Batch 0 Loss 3.0109 Accuracy 0.6353
Epoch 5 Batch 0 Loss 2.9793 Accu

In [84]:
enc_tokenizer.word_index.keys()

dict_keys(['the', 'end', 'start', 'of', 'in', 'and', 'to', 'a', 'he', 'observatory', 'was', 'that', 'is', 'at', 'for', 'telescope', 'his', 'first', 'canada', 'it', 'as', 'astronomy', 'astronomers', 'from', 'stars', 'by', 'university', 'an', 'with', 'observatories', 'its', 'radio', 'on', 'canadian', 'space', 'research', 'became', 'which', 'world', 'new', 'are', 'years', 'astronomical', 'work', 'toronto', 'ã‚â«', 'return', 'this', 'light', 'be', 'instruments', 'born', 'one', 'study', 'two', 'national', 'dominion', 'time', 'british', 'degree', 'were', 'also', 'ã¢â‚¬â€œ', 'astrophysical', 'received', 'universe', '1', 'our', 'evolution', 'ontario', 'large', 'columbia', 'department', 'used', 'their', 'star', 'would', 'about', 'year', 'astrophysics', 'during', 'institute', 'largest', 'established', 'images', 'quebec', 'mont', 'mãƒâ©gantic', 'began', 'mirror', 'physics', 'director', 'metre', 'most', 'project', 'several', 'society', 'many', 'other', 'david', 'professor', 'made', 'these', 'teles

In [103]:
# sentence1 = 'astronomers study stars'
# sentence2 = 'astronomical space research'
sentence1 = 'astronomers study stars'
sentence2 = 'columbia large department'

In [104]:
enc_tokenizer.texts_to_sequences([sentence2])

[[72, 71, 73]]

In [105]:
inputs1 = tf.constant(enc_tokenizer.texts_to_sequences([sentence1]))
inputs2 = tf.constant(enc_tokenizer.texts_to_sequences([sentence2]))
output1 = transformer.encoder(inputs1)[0]
output2 = transformer.encoder(inputs2)[0]

In [106]:
output1

<tf.Tensor: shape=(1, 3, 20), dtype=float32, numpy=
array([[[-0.67756903,  1.346677  , -0.5204264 , -0.46391895,
         -0.10949869, -1.0996535 , -1.6299821 , -0.89470893,
          0.85599315, -2.0965676 ,  0.3229088 ,  0.62217706,
          0.00554255,  1.4535601 ,  1.2204055 ,  0.04139756,
          0.5306589 ,  0.62773937,  0.5623679 , -0.1215599 ],
        [-0.6787881 ,  1.3456157 , -0.5213721 , -0.46405748,
         -0.10739103, -1.0955036 , -1.6484089 , -0.88720775,
          0.8426463 , -2.088242  ,  0.30834907,  0.62335247,
          0.01012551,  1.4568517 ,  1.2244682 ,  0.0489862 ,
          0.5317003 ,  0.6378933 ,  0.55352044, -0.11633141],
        [-0.6672824 ,  1.3579967 , -0.513648  , -0.4805963 ,
         -0.11982363, -1.0853963 , -1.6426896 , -0.86946636,
          0.86727303, -2.0912907 ,  0.2852972 ,  0.61864984,
         -0.00210325,  1.4409746 ,  1.2625933 ,  0.04186494,
          0.5309469 ,  0.63348925,  0.5315108 , -0.12131207]]],
      dtype=float32)>

In [107]:
output2

<tf.Tensor: shape=(1, 3, 20), dtype=float32, numpy=
array([[[-0.7490195 ,  1.427113  , -0.68845206, -0.3570645 ,
         -0.13416201, -1.0223509 , -1.5686895 , -0.71513605,
          0.69024277, -2.2789195 ,  0.37676978,  0.644699  ,
          0.14873789,  1.3522933 ,  1.2267828 ,  0.03221232,
          0.40405414,  0.61595446,  0.61161333, -0.04473148],
        [-0.7456242 ,  1.4265598 , -0.68790305, -0.3658358 ,
         -0.13345943, -1.0127387 , -1.566251  , -0.70938325,
          0.6936764 , -2.2849908 ,  0.36748698,  0.6420739 ,
          0.14543633,  1.3456935 ,  1.2424892 ,  0.03324926,
          0.40752795,  0.618866  ,  0.60586077, -0.05030824],
        [-0.74507993,  1.4317414 , -0.68603605, -0.36621174,
         -0.13867924, -1.0073918 , -1.5649184 , -0.7063888 ,
          0.6960495 , -2.286459  ,  0.35911527,  0.63833314,
          0.14775106,  1.3480332 ,  1.2405849 ,  0.02890521,
          0.41022167,  0.6205619 ,  0.6054014 , -0.05310567]]],
      dtype=float32)>

In [99]:
def cos_sim(a, b):
    return np.dot(a, b)/(np.linalg.norm(a) * np.linalg.norm(b))

In [108]:
(cos_sim(output1[0,0], output2[0,0]) + cos_sim(output1[0,1], output2[0,1]) + cos_sim(output1[0,2], output2[0,2])) / 3

0.9938858350118002