In [2]:
import tensorflow_datasets as tfds
import tensorflow as tf
import re
import time
import numpy as np

In [3]:
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

## Data Preprocessing

In [4]:
# Creating a dictionary that maps each line and its id
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [5]:
# Creating a list of all of the conversations
conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))

In [6]:
# Getting separately the questions and the answers
questions = []
answers = []
for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])

In [7]:
# Doing a first cleaning of the texts
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text

In [8]:
# Cleaning the questions
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))

In [9]:
# Cleaning the answers
clean_answers = []
for answer in answers:
    clean_answers.append(clean_text(answer))

In [16]:
clean_questions[:10]

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 'well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'you are asking me out  that is so cute what is your name again',
 "no no it's my fault  we didn't have a proper introduction ",
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'why',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'gosh if only we could find kat a boyfriend']

In [18]:
clean_answers[:10]

['well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 "okay then how 'bout we try out some french cuisine  saturday  night",
 'forget it',
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'seems like she could get a date easy enough',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'that is a shame',
 'let me see what i can do']

In [19]:
#Tokenizing text
tokenizer_qst = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    clean_questions, target_vocab_size=2**13)
tokenizer_ans = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    clean_answers, target_vocab_size=2**13)

In [20]:
VOCAB_SIZE_QST = tokenizer_qst.vocab_size + 2 
VOCAB_SIZE_ANS = tokenizer_ans.vocab_size + 2 

In [22]:
inputs = [[VOCAB_SIZE_QST-2] + tokenizer_qst.encode(sentence) + [VOCAB_SIZE_QST-1]
          for sentence in clean_questions]
outputs = [[VOCAB_SIZE_ANS-2] + tokenizer_ans.encode(sentence) + [VOCAB_SIZE_ANS-1]
           for sentence in clean_answers]

In [24]:
sample_string = 'she used to be really popular when she started high school.'

tokenized_string = tokenizer_qst.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))
original_string = tokenizer_qst.decode(tokenized_string)
print ('The original string: {}'.format(original_string))
assert original_string == sample_string

Tokenized string is [48, 303, 5, 34, 123, 6672, 78, 48, 716, 619, 754, 7901]
The original string: she used to be really popular when she started high school.


In [26]:
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_qst.decode([ts])))

48 ----> she 
303 ----> used 
5 ----> to 
34 ----> be 
123 ----> really 
6672 ----> popular 
78 ----> when 
48 ----> she 
716 ----> started 
619 ----> high 
754 ----> school
7901 ----> .


In [27]:
#Remove too long sentences
MAX_LENGTH = 20
idx_to_remove = [count for count, sent in enumerate(inputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]
idx_to_remove = [count for count, sent in enumerate(outputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

In [28]:
#Inputs/outputs
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [29]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

## Model

### Embedding
### Positional encoding

In [31]:
from tensorflow.keras import layers

class PositionalEncoding(layers.Layer):

    def __init__(self):
        super(PositionalEncoding, self).__init__()
    
    def get_angles(self, pos, i, d_model):
        angles = 1 / np.power(10000., (2*(i//2)) / np.float32(d_model))
        return pos * angles

    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :],
                                 d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        pos_encoding = angles[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)

### Attention 

In [32]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    
    if mask is not None:
        scaled_product += (mask * -1e9)
    
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)
    
    return attention

### Multi-head attention sublayer

In [33]:
class MultiHeadAttention(layers.Layer):
    
    def __init__(self, nb_proj):
        super(MultiHeadAttention, self).__init__()
        self.nb_proj = nb_proj
        
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_proj == 0
        
        self.d_proj = self.d_model // self.nb_proj
        
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)
        
        self.final_lin = layers.Dense(units=self.d_model)
        
    def split_proj(self, inputs, batch_size): # inputs: (batch_size, seq_length, d_model)
        shape = (batch_size,
                 -1,
                 self.nb_proj,
                 self.d_proj)
        splited_inputs = tf.reshape(inputs, shape=shape) # (batch_size, seq_length, nb_proj, d_proj)
        return tf.transpose(splited_inputs, perm=[0, 2, 1, 3]) # (batch_size, nb_proj, seq_length, d_proj)
    
    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]
        
        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)
        
        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)
        
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        
        concat_attention = tf.reshape(attention,
                                      shape=(batch_size, -1, self.d_model))
        
        outputs = self.final_lin(concat_attention)
        
        return outputs

### Encoder

In [34]:
class EncoderLayer(layers.Layer):
    
    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(inputs,
                                              inputs,
                                              inputs,
                                              mask)
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)
        
        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training=training)
        outputs = self.norm_2(outputs + attention)
        
        return outputs

In [35]:
class Encoder(layers.Layer):
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.enc_layers = [EncoderLayer(FFN_units,
                                        nb_proj,
                                        dropout_rate) 
                           for _ in range(nb_layers)]
    
    def call(self, inputs, mask, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask, training)

        return outputs

### Decoder

In [36]:
class DecoderLayer(layers.Layer):
    
    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        
        # Self multi head attention
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        # Multi head attention combined with encoder output
        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
        # Feed foward
        self.dense_1 = layers.Dense(units=self.FFN_units,
                                    activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        attention = self.multi_head_attention_1(inputs,
                                                inputs,
                                                inputs,
                                                mask_1)
        attention = self.dropout_1(attention, training)
        attention = self.norm_1(attention + inputs)
        
        attention_2 = self.multi_head_attention_2(attention,
                                                  enc_outputs,
                                                  enc_outputs,
                                                  mask_2)
        attention_2 = self.dropout_2(attention_2, training)
        attention_2 = self.norm_2(attention_2 + attention)
        
        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training)
        outputs = self.norm_3(outputs + attention_2)
        
        return outputs

In [37]:
class Decoder(layers.Layer):
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        
        self.dec_layers = [DecoderLayer(FFN_units,
                                        nb_proj,
                                        dropout_rate) 
                           for i in range(nb_layers)]
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs,
                                         enc_outputs,
                                         mask_1,
                                         mask_2,
                                         training)

        return outputs

### Transformer

In [38]:
class Transformer(tf.keras.Model):
    
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 name="transformer"):
        super(Transformer, self).__init__(name=name)
        
        self.encoder = Encoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout_rate,
                               vocab_size_enc,
                               d_model)
        self.decoder = Decoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout_rate,
                               vocab_size_dec,
                               d_model)
        self.last_linear = layers.Dense(units=vocab_size_dec, name="lin_ouput")
    
    def create_padding_mask(self, seq):
        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahead_mask
    
    def call(self, enc_inputs, dec_inputs, training):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs)
        )
        dec_mask_2 = self.create_padding_mask(enc_inputs)
        
        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs,
                                   enc_outputs,
                                   dec_mask_1,
                                   dec_mask_2,
                                   training)
        
        outputs = self.last_linear(dec_outputs)
        
        return outputs

## Training

In [39]:
tf.keras.backend.clear_session()

# Hyper-parameters
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT_RATE = 0.1 # 0.1

transformer = Transformer(vocab_size_enc=VOCAB_SIZE_QST,
                          vocab_size_dec=VOCAB_SIZE_ANS,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FFN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout_rate=DROPOUT_RATE)

In [40]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction="none")

def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

In [41]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

leaning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(leaning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)
        

In [42]:
checkpoint_path = "ckpt/"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [43]:
EPOCHS = 10
for epoch in range(EPOCHS):
    print("Start of epoch {}".format(epoch+1))
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
        
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)
        
        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(
                epoch+1, batch, train_loss.result(), train_accuracy.result()))
            
    ckpt_save_path = ckpt_manager.save()
    print("Saving checkpoint for epoch {} at {}".format(epoch+1,
                                                        ckpt_save_path))
    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))

Start of epoch 1
Epoch 1 Batch 0 Loss 3.2165 Accuracy 0.0000
Epoch 1 Batch 50 Loss 3.8895 Accuracy 0.0118
Epoch 1 Batch 100 Loss 3.8377 Accuracy 0.0320
Epoch 1 Batch 150 Loss 3.8086 Accuracy 0.0389
Epoch 1 Batch 200 Loss 3.7606 Accuracy 0.0423
Epoch 1 Batch 250 Loss 3.6878 Accuracy 0.0443
Epoch 1 Batch 300 Loss 3.6177 Accuracy 0.0457
Epoch 1 Batch 350 Loss 3.5276 Accuracy 0.0467
Epoch 1 Batch 400 Loss 3.4452 Accuracy 0.0474
Epoch 1 Batch 450 Loss 3.3655 Accuracy 0.0480
Epoch 1 Batch 500 Loss 3.3015 Accuracy 0.0490
Epoch 1 Batch 550 Loss 3.2460 Accuracy 0.0505
Epoch 1 Batch 600 Loss 3.1905 Accuracy 0.0525
Epoch 1 Batch 650 Loss 3.1419 Accuracy 0.0548
Epoch 1 Batch 700 Loss 3.0934 Accuracy 0.0571
Epoch 1 Batch 750 Loss 3.0472 Accuracy 0.0595
Epoch 1 Batch 800 Loss 3.0049 Accuracy 0.0617
Epoch 1 Batch 850 Loss 2.9638 Accuracy 0.0637
Epoch 1 Batch 900 Loss 2.9272 Accuracy 0.0655
Epoch 1 Batch 950 Loss 2.8922 Accuracy 0.0673
Epoch 1 Batch 1000 Loss 2.8614 Accuracy 0.0689
Epoch 1 Batch 1050 

Epoch 4 Batch 1850 Loss 1.9109 Accuracy 0.1238
Epoch 4 Batch 1900 Loss 1.9112 Accuracy 0.1238
Epoch 4 Batch 1950 Loss 1.9118 Accuracy 0.1238
Epoch 4 Batch 2000 Loss 1.9120 Accuracy 0.1239
Epoch 4 Batch 2050 Loss 1.9127 Accuracy 0.1240
Epoch 4 Batch 2100 Loss 1.9127 Accuracy 0.1240
Epoch 4 Batch 2150 Loss 1.9132 Accuracy 0.1240
Saving checkpoint for epoch 4 at ckpt/ckpt-4
Time taken for 1 epoch: 3612.9534027576447 secs

Start of epoch 5
Epoch 5 Batch 0 Loss 2.1080 Accuracy 0.1308
Epoch 5 Batch 50 Loss 1.9074 Accuracy 0.1253
Epoch 5 Batch 100 Loss 1.8713 Accuracy 0.1254
Epoch 5 Batch 150 Loss 1.8663 Accuracy 0.1257
Epoch 5 Batch 200 Loss 1.8746 Accuracy 0.1265
Epoch 5 Batch 250 Loss 1.8693 Accuracy 0.1264
Epoch 5 Batch 300 Loss 1.8710 Accuracy 0.1265
Epoch 5 Batch 350 Loss 1.8729 Accuracy 0.1262
Epoch 5 Batch 400 Loss 1.8780 Accuracy 0.1263
Epoch 5 Batch 450 Loss 1.8807 Accuracy 0.1261
Epoch 5 Batch 500 Loss 1.8843 Accuracy 0.1261
Epoch 5 Batch 550 Loss 1.8815 Accuracy 0.1261
Epoch 5 Bat

Epoch 8 Batch 1400 Loss 1.7889 Accuracy 0.1326
Epoch 8 Batch 1450 Loss 1.7894 Accuracy 0.1326
Epoch 8 Batch 1500 Loss 1.7892 Accuracy 0.1326
Epoch 8 Batch 1550 Loss 1.7886 Accuracy 0.1326
Epoch 8 Batch 1600 Loss 1.7901 Accuracy 0.1326
Epoch 8 Batch 1650 Loss 1.7916 Accuracy 0.1327
Epoch 8 Batch 1700 Loss 1.7910 Accuracy 0.1327
Epoch 8 Batch 1750 Loss 1.7920 Accuracy 0.1328
Epoch 8 Batch 1800 Loss 1.7926 Accuracy 0.1328
Epoch 8 Batch 1850 Loss 1.7924 Accuracy 0.1328
Epoch 8 Batch 1900 Loss 1.7932 Accuracy 0.1328
Epoch 8 Batch 1950 Loss 1.7933 Accuracy 0.1328
Epoch 8 Batch 2000 Loss 1.7935 Accuracy 0.1327
Epoch 8 Batch 2050 Loss 1.7945 Accuracy 0.1328
Epoch 8 Batch 2100 Loss 1.7947 Accuracy 0.1328
Epoch 8 Batch 2150 Loss 1.7956 Accuracy 0.1329
Saving checkpoint for epoch 8 at ckpt/ckpt-8
Time taken for 1 epoch: 3750.725405216217 secs

Start of epoch 9
Epoch 9 Batch 0 Loss 1.8427 Accuracy 0.1464
Epoch 9 Batch 50 Loss 1.7693 Accuracy 0.1342
Epoch 9 Batch 100 Loss 1.7726 Accuracy 0.1350
Epo

## Evaluate

In [44]:
def evaluate(inp_sentence):
    inp_sentence = \
        [VOCAB_SIZE_QST-2] + tokenizer_qst.encode(inp_sentence) + [VOCAB_SIZE_QST-1]
    enc_input = tf.expand_dims(inp_sentence, axis=0)
    
    output = tf.expand_dims([VOCAB_SIZE_ANS-2], axis=0)
    
    for _ in range(MAX_LENGTH):
        predictions = transformer(enc_input, output, False)
        
        prediction = predictions[:, -1:, :]
        
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)
        
        if predicted_id == VOCAB_SIZE_ANS-1:
            return tf.squeeze(output, axis=0)
        
        output = tf.concat([output, predicted_id], axis=-1)
        
    return tf.squeeze(output, axis=0)

In [58]:
def chatbot(sentence):
    output = evaluate(sentence).numpy()
    
    predicted_sentence = tokenizer_ans.decode(
        [i for i in output if i < VOCAB_SIZE_ANS-2]
    )
    
    print("Input: {}".format(sentence))
    print("Predicted reply: {}".format(predicted_sentence))

## Chatbot

In [59]:
chatbot("hi")

Input: hi
Predicted reply: i am not going to be here until i get back


In [77]:
chatbot("what is your name")

Input: what is your name
Predicted reply: i am not going to be here
