# Stage 1: Importing dependencies

In [1]:
import numpy as np
import pandas as pd
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [2]:
tf.__version__

'2.5.0'

# Stage 2: Data preprocessing

## Loading files

We import files from our personal google drive.

In [3]:
with open("europarl-v7.fr-en.en",mode="r",encoding="utf-8") as f:
    europarl_en=f.read()
with open("europarl-v7.fr-en.fr",mode="r",encoding="utf-8") as f:
    europarl_fr=f.read()
with open("P85-Non-Breaking-Prefix.en",mode="r",encoding="utf-8") as f:
    non_breaking_prefix_en=f.read()
with open("P85-Non-Breaking-Prefix.fr",mode="r",encoding="utf-8") as f:
    non_breaking_prefix_fr=f.read()

## Cleaning data

Getting the non_breaking_prefixes as a clean list of words with a point at the end so it is easier to use.

In [4]:
non_breaking_prefix_en = non_breaking_prefix_en.split("\n")
non_breaking_prefix_en = [' ' + pref + '.' for pref in non_breaking_prefix_en]
non_breaking_prefix_fr = non_breaking_prefix_fr.split("\n")
non_breaking_prefix_fr = [' ' + pref + '.' for pref in non_breaking_prefix_fr]

We will need each word and other symbol that we want to keep to be in lower case and separated by spaces so we can "tokenize" them.

In [5]:
corpus_en = europarl_en
# Add $$$ after non ending sentence points
for prefix in non_breaking_prefix_en:
    corpus_en = corpus_en.replace(prefix, prefix + '$$$')
corpus_en = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".$$$", corpus_en)
# Remove $$$ markers
corpus_en = re.sub(r".\$\$\$", '', corpus_en)
# Clear multiple spaces
corpus_en = re.sub(r"  +", " ", corpus_en)
corpus_en = corpus_en.split('\n')

corpus_fr = europarl_fr
for prefix in non_breaking_prefix_fr:
    corpus_fr = corpus_fr.replace(prefix, prefix + '$$$')
corpus_fr = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".$$$", corpus_fr)
corpus_fr = re.sub(r".\$\$\$", '', corpus_fr)
corpus_fr = re.sub(r"  +", " ", corpus_fr)
corpus_fr = corpus_fr.split('\n')

## Tokenizing text

In [6]:
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_en, target_vocab_size=2**13)
tokenizer_fr = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_fr, target_vocab_size=2**13)

In [7]:
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2 # = 8190
VOCAB_SIZE_FR = tokenizer_fr.vocab_size + 2 # = 8171

In [8]:
inputs = [[VOCAB_SIZE_EN-2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_EN-1]
          for sentence in corpus_en]
outputs = [[VOCAB_SIZE_FR-2] + tokenizer_fr.encode(sentence) + [VOCAB_SIZE_FR-1]
           for sentence in corpus_fr]

## Remove too long sentences

In [9]:
MAX_LENGTH = 20
idx_to_remove = [count for count, sent in enumerate(inputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]
idx_to_remove = [count for count, sent in enumerate(outputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

## Inputs/outputs creation

As we train with batches, we need each input to have the same length. We pad with the appropriate token, and we will make sure this padding token doesn't interfere with our training later.

In [10]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [11]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Stage 3: Model building

## Embedding

Positional encoding formulae:

$PE_{(pos,2i)} =\sin(pos/10000^{2i/dmodel})$

$PE_{(pos,2i+1)} =\cos(pos/10000^{2i/dmodel})$

In [12]:
class PositionalEncoding(layers.Layer):

    def __init__(self):
        super(PositionalEncoding, self).__init__()
    
    def get_angles(self, pos, i, d_model):
        angles = 1 / np.power(10000., (2*(i//2)) / np.float32(d_model))
        return pos * angles

    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :],
                                 d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        pos_encoding = angles[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)

## Attention

### Attention computation

$Attention(Q, K, V ) = \text{softmax}\left(\dfrac{QK^T}{\sqrt{d_k}}\right)V $

In [13]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    
    if mask is not None:
        scaled_product += (mask * -1e9)
    
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)
    
    return attention

### Multi-head attention sublayer

In [14]:
class MultiHeadAttention(layers.Layer):
    
    def __init__(self, nb_proj):
        super(MultiHeadAttention, self).__init__()
        self.nb_proj = nb_proj
        
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_proj == 0
        
        self.d_proj = self.d_model // self.nb_proj
        
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)
        
        self.final_lin = layers.Dense(units=self.d_model)
        
    def split_proj(self, inputs, batch_size): # inputs: (batch_size, seq_length, d_model)
        shape = (batch_size,
                 -1,
                 self.nb_proj,
                 self.d_proj)
        splited_inputs = tf.reshape(inputs, shape=shape) # (batch_size, seq_length, nb_proj, d_proj)
        return tf.transpose(splited_inputs, perm=[0, 2, 1, 3]) # (batch_size, nb_proj, seq_length, d_proj)
    
    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]
        
        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)
        
        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)
        
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        
        concat_attention = tf.reshape(attention,
                                      shape=(batch_size, -1, self.d_model))
        
        outputs = self.final_lin(concat_attention)
        
        return outputs

## Encoder

In [15]:
class EncoderLayer(layers.Layer):
    
    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(inputs,
                                              inputs,
                                              inputs,
                                              mask)
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)
        
        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training=training)
        outputs = self.norm_2(outputs + attention)
        
        return outputs

In [16]:
class Encoder(layers.Layer):
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.enc_layers = [EncoderLayer(FFN_units,
                                        nb_proj,
                                        dropout_rate) 
                           for _ in range(nb_layers)]
    
    def call(self, inputs, mask, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask, training)

        return outputs

## Decoder

In [17]:
class DecoderLayer(layers.Layer):
    
    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        
        # Self multi head attention
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        # Multi head attention combined with encoder output
        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
        # Feed foward
        self.dense_1 = layers.Dense(units=self.FFN_units,
                                    activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        attention = self.multi_head_attention_1(inputs,
                                                inputs,
                                                inputs,
                                                mask_1)
        attention = self.dropout_1(attention, training)
        attention = self.norm_1(attention + inputs)
        
        attention_2 = self.multi_head_attention_2(attention,
                                                  enc_outputs,
                                                  enc_outputs,
                                                  mask_2)
        attention_2 = self.dropout_2(attention_2, training)
        attention_2 = self.norm_2(attention_2 + attention)
        
        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training)
        outputs = self.norm_3(outputs + attention_2)
        
        return outputs

In [18]:
class Decoder(layers.Layer):
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        
        self.dec_layers = [DecoderLayer(FFN_units,
                                        nb_proj,
                                        dropout_rate) 
                           for i in range(nb_layers)]
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs,
                                         enc_outputs,
                                         mask_1,
                                         mask_2,
                                         training)

        return outputs

## Transformer

In [19]:
class Transformer(tf.keras.Model):
    
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 name="transformer"):
        super(Transformer, self).__init__(name=name)
        
        self.encoder = Encoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout_rate,
                               vocab_size_enc,
                               d_model)
        self.decoder = Decoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout_rate,
                               vocab_size_dec,
                               d_model)
        self.last_linear = layers.Dense(units=vocab_size_dec, name="lin_ouput")
    
    def create_padding_mask(self, seq):
        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahead_mask
    
    def call(self, enc_inputs, dec_inputs, training):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs)
        )
        dec_mask_2 = self.create_padding_mask(enc_inputs)
        
        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs,
                                   enc_outputs,
                                   dec_mask_1,
                                   dec_mask_2,
                                   training)
        
        outputs = self.last_linear(dec_outputs)
        
        return outputs

# Training

In [20]:
tf.keras.backend.clear_session()

# Hyper-parameters
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT_RATE = 0.1 # 0.1

transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                          vocab_size_dec=VOCAB_SIZE_FR,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FFN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout_rate=DROPOUT_RATE)

In [21]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction="none")

def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

In [22]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

leaning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(leaning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)
        

In [23]:
checkpoint_path = "./TF/ckpt/"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [24]:
EPOCHS = 15
for epoch in range(EPOCHS):
    print("Start of epoch {}".format(epoch+1))
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
        
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)
        
        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(
                epoch+1, batch, train_loss.result(), train_accuracy.result()))
            
    ckpt_save_path = ckpt_manager.save()
    print("Saving checkpoint for epoch {} at {}".format(epoch+1,
                                                        ckpt_save_path))
    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))

Start of epoch 1
Epoch 1 Batch 0 Loss 1.7042 Accuracy 0.4054
Epoch 1 Batch 50 Loss 1.7087 Accuracy 0.3829
Epoch 1 Batch 100 Loss 1.7013 Accuracy 0.3857
Epoch 1 Batch 150 Loss 1.6940 Accuracy 0.3868
Epoch 1 Batch 200 Loss 1.6922 Accuracy 0.3863
Epoch 1 Batch 250 Loss 1.6849 Accuracy 0.3861
Epoch 1 Batch 300 Loss 1.6783 Accuracy 0.3869
Epoch 1 Batch 350 Loss 1.6688 Accuracy 0.3871
Epoch 1 Batch 400 Loss 1.6663 Accuracy 0.3876
Epoch 1 Batch 450 Loss 1.6634 Accuracy 0.3881
Epoch 1 Batch 500 Loss 1.6598 Accuracy 0.3882
Epoch 1 Batch 550 Loss 1.6506 Accuracy 0.3890
Epoch 1 Batch 600 Loss 1.6499 Accuracy 0.3893
Epoch 1 Batch 650 Loss 1.6457 Accuracy 0.3899
Epoch 1 Batch 700 Loss 1.6370 Accuracy 0.3911
Epoch 1 Batch 750 Loss 1.6340 Accuracy 0.3918
Epoch 1 Batch 800 Loss 1.6316 Accuracy 0.3924
Epoch 1 Batch 850 Loss 1.6293 Accuracy 0.3929
Epoch 1 Batch 900 Loss 1.6264 Accuracy 0.3935
Epoch 1 Batch 950 Loss 1.6218 Accuracy 0.3937
Epoch 1 Batch 1000 Loss 1.6175 Accuracy 0.3939
Epoch 1 Batch 1050 

Epoch 2 Batch 2900 Loss 1.2013 Accuracy 0.4587
Epoch 2 Batch 2950 Loss 1.1985 Accuracy 0.4590
Epoch 2 Batch 3000 Loss 1.1960 Accuracy 0.4594
Epoch 2 Batch 3050 Loss 1.1941 Accuracy 0.4597
Epoch 2 Batch 3100 Loss 1.1916 Accuracy 0.4601
Epoch 2 Batch 3150 Loss 1.1889 Accuracy 0.4604
Epoch 2 Batch 3200 Loss 1.1865 Accuracy 0.4607
Epoch 2 Batch 3250 Loss 1.1842 Accuracy 0.4611
Epoch 2 Batch 3300 Loss 1.1814 Accuracy 0.4614
Epoch 2 Batch 3350 Loss 1.1787 Accuracy 0.4617
Epoch 2 Batch 3400 Loss 1.1764 Accuracy 0.4620
Epoch 2 Batch 3450 Loss 1.1739 Accuracy 0.4624
Epoch 2 Batch 3500 Loss 1.1722 Accuracy 0.4628
Epoch 2 Batch 3550 Loss 1.1698 Accuracy 0.4632
Epoch 2 Batch 3600 Loss 1.1673 Accuracy 0.4636
Epoch 2 Batch 3650 Loss 1.1648 Accuracy 0.4639
Epoch 2 Batch 3700 Loss 1.1626 Accuracy 0.4644
Epoch 2 Batch 3750 Loss 1.1606 Accuracy 0.4648
Epoch 2 Batch 3800 Loss 1.1586 Accuracy 0.4651
Epoch 2 Batch 3850 Loss 1.1567 Accuracy 0.4656
Epoch 2 Batch 3900 Loss 1.1548 Accuracy 0.4660
Epoch 2 Batch

Epoch 4 Batch 0 Loss 1.2725 Accuracy 0.4589
Epoch 4 Batch 50 Loss 1.1762 Accuracy 0.4602
Epoch 4 Batch 100 Loss 1.1564 Accuracy 0.4631
Epoch 4 Batch 150 Loss 1.1564 Accuracy 0.4635
Epoch 4 Batch 200 Loss 1.1514 Accuracy 0.4641
Epoch 4 Batch 250 Loss 1.1481 Accuracy 0.4645
Epoch 4 Batch 300 Loss 1.1501 Accuracy 0.4650
Epoch 4 Batch 350 Loss 1.1493 Accuracy 0.4653
Epoch 4 Batch 400 Loss 1.1469 Accuracy 0.4658
Epoch 4 Batch 450 Loss 1.1460 Accuracy 0.4659
Epoch 4 Batch 500 Loss 1.1448 Accuracy 0.4651
Epoch 4 Batch 550 Loss 1.1453 Accuracy 0.4652
Epoch 4 Batch 600 Loss 1.1419 Accuracy 0.4647
Epoch 4 Batch 650 Loss 1.1421 Accuracy 0.4650
Epoch 4 Batch 700 Loss 1.1404 Accuracy 0.4652
Epoch 4 Batch 750 Loss 1.1407 Accuracy 0.4657
Epoch 4 Batch 800 Loss 1.1405 Accuracy 0.4658
Epoch 4 Batch 850 Loss 1.1417 Accuracy 0.4663
Epoch 4 Batch 900 Loss 1.1401 Accuracy 0.4664
Epoch 4 Batch 950 Loss 1.1366 Accuracy 0.4664
Epoch 4 Batch 1000 Loss 1.1335 Accuracy 0.4666
Epoch 4 Batch 1050 Loss 1.1325 Accur

Epoch 5 Batch 2900 Loss 1.0037 Accuracy 0.4907
Epoch 5 Batch 2950 Loss 1.0019 Accuracy 0.4909
Epoch 5 Batch 3000 Loss 0.9997 Accuracy 0.4911
Epoch 5 Batch 3050 Loss 0.9980 Accuracy 0.4914
Epoch 5 Batch 3100 Loss 0.9960 Accuracy 0.4916
Epoch 5 Batch 3150 Loss 0.9941 Accuracy 0.4918
Epoch 5 Batch 3200 Loss 0.9922 Accuracy 0.4921
Epoch 5 Batch 3250 Loss 0.9899 Accuracy 0.4924
Epoch 5 Batch 3300 Loss 0.9878 Accuracy 0.4927
Epoch 5 Batch 3350 Loss 0.9855 Accuracy 0.4930
Epoch 5 Batch 3400 Loss 0.9835 Accuracy 0.4933
Epoch 5 Batch 3450 Loss 0.9817 Accuracy 0.4936
Epoch 5 Batch 3500 Loss 0.9798 Accuracy 0.4939
Epoch 5 Batch 3550 Loss 0.9782 Accuracy 0.4943
Epoch 5 Batch 3600 Loss 0.9761 Accuracy 0.4946
Epoch 5 Batch 3650 Loss 0.9741 Accuracy 0.4949
Epoch 5 Batch 3700 Loss 0.9728 Accuracy 0.4952
Epoch 5 Batch 3750 Loss 0.9712 Accuracy 0.4956
Epoch 5 Batch 3800 Loss 0.9697 Accuracy 0.4959
Epoch 5 Batch 3850 Loss 0.9687 Accuracy 0.4962
Epoch 5 Batch 3900 Loss 0.9671 Accuracy 0.4965
Epoch 5 Batch

Epoch 7 Batch 0 Loss 1.1024 Accuracy 0.4137
Epoch 7 Batch 50 Loss 1.0605 Accuracy 0.4840
Epoch 7 Batch 100 Loss 1.0764 Accuracy 0.4801
Epoch 7 Batch 150 Loss 1.0667 Accuracy 0.4798
Epoch 7 Batch 200 Loss 1.0601 Accuracy 0.4795
Epoch 7 Batch 250 Loss 1.0629 Accuracy 0.4801
Epoch 7 Batch 300 Loss 1.0624 Accuracy 0.4804
Epoch 7 Batch 350 Loss 1.0594 Accuracy 0.4806
Epoch 7 Batch 400 Loss 1.0593 Accuracy 0.4806
Epoch 7 Batch 450 Loss 1.0550 Accuracy 0.4806
Epoch 7 Batch 500 Loss 1.0531 Accuracy 0.4812
Epoch 7 Batch 550 Loss 1.0502 Accuracy 0.4810
Epoch 7 Batch 600 Loss 1.0474 Accuracy 0.4812
Epoch 7 Batch 650 Loss 1.0475 Accuracy 0.4815
Epoch 7 Batch 700 Loss 1.0472 Accuracy 0.4814
Epoch 7 Batch 750 Loss 1.0462 Accuracy 0.4818
Epoch 7 Batch 800 Loss 1.0442 Accuracy 0.4819
Epoch 7 Batch 850 Loss 1.0437 Accuracy 0.4817
Epoch 7 Batch 900 Loss 1.0424 Accuracy 0.4820
Epoch 7 Batch 950 Loss 1.0416 Accuracy 0.4822
Epoch 7 Batch 1000 Loss 1.0404 Accuracy 0.4823
Epoch 7 Batch 1050 Loss 1.0386 Accur

Epoch 8 Batch 2900 Loss 0.9343 Accuracy 0.5022
Epoch 8 Batch 2950 Loss 0.9323 Accuracy 0.5025
Epoch 8 Batch 3000 Loss 0.9306 Accuracy 0.5027
Epoch 8 Batch 3050 Loss 0.9290 Accuracy 0.5030
Epoch 8 Batch 3100 Loss 0.9270 Accuracy 0.5032
Epoch 8 Batch 3150 Loss 0.9253 Accuracy 0.5034
Epoch 8 Batch 3200 Loss 0.9236 Accuracy 0.5036
Epoch 8 Batch 3250 Loss 0.9216 Accuracy 0.5038
Epoch 8 Batch 3300 Loss 0.9196 Accuracy 0.5041
Epoch 8 Batch 3350 Loss 0.9174 Accuracy 0.5043
Epoch 8 Batch 3400 Loss 0.9155 Accuracy 0.5046
Epoch 8 Batch 3450 Loss 0.9137 Accuracy 0.5049
Epoch 8 Batch 3500 Loss 0.9121 Accuracy 0.5052
Epoch 8 Batch 3550 Loss 0.9106 Accuracy 0.5056
Epoch 8 Batch 3600 Loss 0.9087 Accuracy 0.5059
Epoch 8 Batch 3650 Loss 0.9073 Accuracy 0.5062
Epoch 8 Batch 3700 Loss 0.9058 Accuracy 0.5065
Epoch 8 Batch 3750 Loss 0.9044 Accuracy 0.5068
Epoch 8 Batch 3800 Loss 0.9029 Accuracy 0.5072
Epoch 8 Batch 3850 Loss 0.9016 Accuracy 0.5074
Epoch 8 Batch 3900 Loss 0.9001 Accuracy 0.5078
Epoch 8 Batch

Epoch 10 Batch 0 Loss 0.8046 Accuracy 0.4794
Epoch 10 Batch 50 Loss 1.0251 Accuracy 0.4874
Epoch 10 Batch 100 Loss 1.0280 Accuracy 0.4860
Epoch 10 Batch 150 Loss 1.0216 Accuracy 0.4856
Epoch 10 Batch 200 Loss 1.0165 Accuracy 0.4861
Epoch 10 Batch 250 Loss 1.0146 Accuracy 0.4873
Epoch 10 Batch 300 Loss 1.0117 Accuracy 0.4875
Epoch 10 Batch 350 Loss 1.0086 Accuracy 0.4879
Epoch 10 Batch 400 Loss 1.0082 Accuracy 0.4881
Epoch 10 Batch 450 Loss 1.0039 Accuracy 0.4882
Epoch 10 Batch 500 Loss 1.0049 Accuracy 0.4883
Epoch 10 Batch 550 Loss 1.0031 Accuracy 0.4877
Epoch 10 Batch 600 Loss 1.0016 Accuracy 0.4880
Epoch 10 Batch 650 Loss 1.0002 Accuracy 0.4884
Epoch 10 Batch 700 Loss 0.9987 Accuracy 0.4890
Epoch 10 Batch 750 Loss 0.9976 Accuracy 0.4892
Epoch 10 Batch 800 Loss 0.9967 Accuracy 0.4898
Epoch 10 Batch 850 Loss 0.9956 Accuracy 0.4898
Epoch 10 Batch 900 Loss 0.9946 Accuracy 0.4898
Epoch 10 Batch 950 Loss 0.9925 Accuracy 0.4901
Epoch 10 Batch 1000 Loss 0.9906 Accuracy 0.4902
Epoch 10 Batch 

Epoch 11 Batch 2750 Loss 0.9009 Accuracy 0.5080
Epoch 11 Batch 2800 Loss 0.8989 Accuracy 0.5082
Epoch 11 Batch 2850 Loss 0.8971 Accuracy 0.5085
Epoch 11 Batch 2900 Loss 0.8951 Accuracy 0.5087
Epoch 11 Batch 2950 Loss 0.8928 Accuracy 0.5089
Epoch 11 Batch 3000 Loss 0.8909 Accuracy 0.5092
Epoch 11 Batch 3050 Loss 0.8894 Accuracy 0.5095
Epoch 11 Batch 3100 Loss 0.8877 Accuracy 0.5099
Epoch 11 Batch 3150 Loss 0.8860 Accuracy 0.5101
Epoch 11 Batch 3200 Loss 0.8842 Accuracy 0.5104
Epoch 11 Batch 3250 Loss 0.8823 Accuracy 0.5105
Epoch 11 Batch 3300 Loss 0.8804 Accuracy 0.5109
Epoch 11 Batch 3350 Loss 0.8784 Accuracy 0.5112
Epoch 11 Batch 3400 Loss 0.8766 Accuracy 0.5115
Epoch 11 Batch 3450 Loss 0.8751 Accuracy 0.5117
Epoch 11 Batch 3500 Loss 0.8734 Accuracy 0.5120
Epoch 11 Batch 3550 Loss 0.8713 Accuracy 0.5124
Epoch 11 Batch 3600 Loss 0.8695 Accuracy 0.5126
Epoch 11 Batch 3650 Loss 0.8677 Accuracy 0.5129
Epoch 11 Batch 3700 Loss 0.8661 Accuracy 0.5132
Epoch 11 Batch 3750 Loss 0.8643 Accuracy

Epoch 12 Batch 5450 Loss 0.8764 Accuracy 0.5119
Epoch 12 Batch 5500 Loss 0.8774 Accuracy 0.5116
Epoch 12 Batch 5550 Loss 0.8783 Accuracy 0.5114
Epoch 12 Batch 5600 Loss 0.8792 Accuracy 0.5111
Epoch 12 Batch 5650 Loss 0.8803 Accuracy 0.5109
Epoch 12 Batch 5700 Loss 0.8813 Accuracy 0.5106
Saving checkpoint for epoch 12 at ./TF/ckpt/ckpt-13
Time taken for 1 epoch: 1469.0225212574005 secs

Start of epoch 13
Epoch 13 Batch 0 Loss 1.0444 Accuracy 0.4803
Epoch 13 Batch 50 Loss 0.9822 Accuracy 0.4965
Epoch 13 Batch 100 Loss 0.9788 Accuracy 0.4946
Epoch 13 Batch 150 Loss 0.9852 Accuracy 0.4956
Epoch 13 Batch 200 Loss 0.9883 Accuracy 0.4952
Epoch 13 Batch 250 Loss 0.9868 Accuracy 0.4950
Epoch 13 Batch 300 Loss 0.9848 Accuracy 0.4946
Epoch 13 Batch 350 Loss 0.9801 Accuracy 0.4945
Epoch 13 Batch 400 Loss 0.9778 Accuracy 0.4942
Epoch 13 Batch 450 Loss 0.9764 Accuracy 0.4942
Epoch 13 Batch 500 Loss 0.9756 Accuracy 0.4941
Epoch 13 Batch 550 Loss 0.9721 Accuracy 0.4938
Epoch 13 Batch 600 Loss 0.9711 A

Epoch 14 Batch 2300 Loss 0.8957 Accuracy 0.5101
Epoch 14 Batch 2350 Loss 0.8929 Accuracy 0.5104
Epoch 14 Batch 2400 Loss 0.8903 Accuracy 0.5107
Epoch 14 Batch 2450 Loss 0.8873 Accuracy 0.5110
Epoch 14 Batch 2500 Loss 0.8848 Accuracy 0.5113
Epoch 14 Batch 2550 Loss 0.8818 Accuracy 0.5117
Epoch 14 Batch 2600 Loss 0.8794 Accuracy 0.5120
Epoch 14 Batch 2650 Loss 0.8769 Accuracy 0.5122
Epoch 14 Batch 2700 Loss 0.8748 Accuracy 0.5126
Epoch 14 Batch 2750 Loss 0.8726 Accuracy 0.5129
Epoch 14 Batch 2800 Loss 0.8705 Accuracy 0.5131
Epoch 14 Batch 2850 Loss 0.8683 Accuracy 0.5133
Epoch 14 Batch 2900 Loss 0.8664 Accuracy 0.5136
Epoch 14 Batch 2950 Loss 0.8644 Accuracy 0.5139
Epoch 14 Batch 3000 Loss 0.8630 Accuracy 0.5140
Epoch 14 Batch 3050 Loss 0.8613 Accuracy 0.5143
Epoch 14 Batch 3100 Loss 0.8598 Accuracy 0.5146
Epoch 14 Batch 3150 Loss 0.8578 Accuracy 0.5148
Epoch 14 Batch 3200 Loss 0.8562 Accuracy 0.5149
Epoch 14 Batch 3250 Loss 0.8545 Accuracy 0.5151
Epoch 14 Batch 3300 Loss 0.8526 Accuracy

Epoch 15 Batch 5000 Loss 0.8406 Accuracy 0.5187
Epoch 15 Batch 5050 Loss 0.8417 Accuracy 0.5184
Epoch 15 Batch 5100 Loss 0.8429 Accuracy 0.5181
Epoch 15 Batch 5150 Loss 0.8445 Accuracy 0.5179
Epoch 15 Batch 5200 Loss 0.8459 Accuracy 0.5176
Epoch 15 Batch 5250 Loss 0.8474 Accuracy 0.5173
Epoch 15 Batch 5300 Loss 0.8484 Accuracy 0.5170
Epoch 15 Batch 5350 Loss 0.8495 Accuracy 0.5167
Epoch 15 Batch 5400 Loss 0.8507 Accuracy 0.5163
Epoch 15 Batch 5450 Loss 0.8518 Accuracy 0.5161
Epoch 15 Batch 5500 Loss 0.8528 Accuracy 0.5159
Epoch 15 Batch 5550 Loss 0.8537 Accuracy 0.5156
Epoch 15 Batch 5600 Loss 0.8546 Accuracy 0.5154
Epoch 15 Batch 5650 Loss 0.8554 Accuracy 0.5152
Epoch 15 Batch 5700 Loss 0.8564 Accuracy 0.5149
Saving checkpoint for epoch 15 at ./TF/ckpt/ckpt-16
Time taken for 1 epoch: 1526.1194899082184 secs



# Evaluating

In [29]:
def evaluate(inp_sentence):
    inp_sentence=\
        [VOCAB_SIZE_EN-2]+tokenizer_en.encode(inp_sentence)+ [VOCAB_SIZE_EN-1]
    enc_input=tf.expand_dims(inp_sentence,axis=0)
    output=tf.expand_dims([VOCAB_SIZE_FR-2],axis=0)
    
    for _ in range(MAX_LENGTH):
        predictions=transformer(enc_input,output,False)
        prediction=predictions[:,-1:,:]
        predicted_id=tf.cast(tf.argmax(prediction,axis=-1),tf.int32)
        if predicted_id==VOCAB_SIZE_FR-1:
            return tf.squeeze(output,axis=0)
        output=tf.concat([output,predicted_id],axis=-1)
    return (tf.squeeze(output,axis=0))


In [30]:
def translate(sentence):
    output=evaluate(sentence).numpy()
    
    predicted_sentence=tokenizer_fr.decode([i for i in output if i <VOCAB_SIZE_FR-2])
    
    print("Input:{}".format(sentence))
    print("Predicted_translation:{}".format(predicted_sentence))

In [34]:
translate("This is my car")

Input:This is my car
Predicted_translation:Voilà ma voiture.
