In [1]:
import tensorflow as tf 
import numpy as np
import pickle
import os
import time
import re
from pathlib import Path

In [2]:
# load up the good stuff
conversation_dict = np.load(Path("data") / "conversation_dict.npy").item()
with open(Path("data") / "word_list.txt", "rb") as f:
    word_list = pickle.load(f)

In [3]:
# define constants here
max_length = 20
batch_size = 32
vocab_size = len(word_list)
embedding_dim = 256
hidden_units = 1024
attention_units = 16
epochs = 32

In [4]:
def make_train():
    # make x_train and y_train

    # setup empty ones to be filled
    dataset_size = len(conversation_dict)
    x_train = np.zeros((dataset_size, max_length), dtype='int32')
    y_train = np.zeros((dataset_size, max_length), dtype='int32')

    for index,(k,v) in enumerate(conversation_dict.items()):
        # dummy arrays to be filled
        in_msg = np.full((max_length), word_list.index('<padding>'), dtype='int32')
        out_msg = np.full((max_length), word_list.index('<padding>'), dtype='int32')

        # split up the words
        in_words = k.split()[:max_length-1]
        out_words = v.split()[:max_length-1]

        # throw out empty ones
        if (len(in_words) == 0 or len(out_words) == 0):
            continue

        # integerize the strings
        for i, word in enumerate(in_words):
            in_msg[i] = word_list.index(word)
        in_msg[i+1] = word_list.index("<eos>")

        for i, word in enumerate(out_words):
            out_msg[i] = word_list.index(word)
        out_msg[i+1] = word_list.index("<eos>")

        x_train[index] = in_msg
        y_train[index] = out_msg

    # remove completely 0's lines
    x_train = x_train[~np.all(x_train == 0, axis=1)]
    y_train = y_train[~np.all(y_train == 0, axis=1)]
    return x_train, y_train

In [5]:
data_path = Path("data") / "train.npz"
if data_path.exists():
    with open(data_path, "rb") as f:
        archive = np.load(f)
        x_train, y_train = archive["x_train"], archive["y_train"]
else: 
    x_train, y_train = make_train()
    np.savez(data_path, x_train=x_train, y_train=y_train)

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(len(x_train))
dataset = dataset.batch(batch_size, drop_remainder=True)

In [7]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [8]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, hidden_size)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [9]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [10]:
encoder = Encoder(vocab_size, embedding_dim, hidden_units, batch_size)
decoder = Decoder(vocab_size, embedding_dim, hidden_units, batch_size)

In [11]:
optimizer = tf.optimizers.Adam()
loss_object = lambda x, y: tf.keras.losses.sparse_categorical_crossentropy(x, y, 
                                                                           from_logits=True)
def loss_function(real, pred):
    # ignore 0's
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [12]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [13]:
checkpoint.restore(checkpoint_dir + "/ckpt-16")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2357ff9e860>

In [111]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([word_list.index("<padding>")] * batch_size, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [97]:
steps_per_epoch = len(x_train) // batch_size
for epoch in range(epochs):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 9.0677
Epoch 1 Batch 100 Loss 3.6531
Epoch 1 Batch 200 Loss 3.0089
Epoch 1 Batch 300 Loss 2.4625
Epoch 1 Batch 400 Loss 3.1726
Epoch 1 Loss 3.1939
Time taken for 1 epoch 157.89947128295898 sec

Epoch 2 Batch 0 Loss 3.2448
Epoch 2 Batch 100 Loss 3.2879
Epoch 2 Batch 200 Loss 2.7652
Epoch 2 Batch 300 Loss 2.2220
Epoch 2 Batch 400 Loss 2.8885
Epoch 2 Loss 2.7675
Time taken for 1 epoch 112.83490824699402 sec

Epoch 3 Batch 0 Loss 3.0304
Epoch 3 Batch 100 Loss 3.0578
Epoch 3 Batch 200 Loss 2.6307
Epoch 3 Batch 300 Loss 2.0918
Epoch 3 Batch 400 Loss 2.7009
Epoch 3 Loss 2.6155
Time taken for 1 epoch 109.4363489151001 sec

Epoch 4 Batch 0 Loss 2.9882
Epoch 4 Batch 100 Loss 2.9279
Epoch 4 Batch 200 Loss 2.5166
Epoch 4 Batch 300 Loss 1.9824
Epoch 4 Batch 400 Loss 2.5381
Epoch 4 Loss 2.4909
Time taken for 1 epoch 123.34249019622803 sec

Epoch 5 Batch 0 Loss 2.7588
Epoch 5 Batch 100 Loss 2.7657
Epoch 5 Batch 200 Loss 2.3912
Epoch 5 Batch 300 Loss 1.8846
Epoch 5 Batch 400 Loss 

In [14]:
def preprocess_sentence(msg):
    msg = msg.replace('\n', ' ').lower()
    msg = msg.replace("\xc2\xa0", "")
    msg = re.sub('([\(\).,!?])', "", msg)
    msg = re.sub(" +"," ", msg)
    return msg

def softmax_choose(a, weights):
    exps = np.exp(weights - np.max(weights))
    scaled_exps = exps / np.sum(exps)
    return np.random.choice(a, p=scaled_exps)

In [15]:
class Bot:
    def __init__(self, word_list, encoder, decoder):
        self.encoder = encoder
        self.decoder = decoder
        self.word_list = word_list
        
    def encode(self, sentence):
        inputs = [self.word_list.index(i) for i in sentence.split(' ') if i in self.word_list]
        inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                               maxlen=max_length,
                                                               padding='post')
        inputs = tf.convert_to_tensor(inputs)
        return inputs

    def predict(self, inputs):
        result = []

        hidden = [tf.zeros((1, hidden_units))]
        enc_out, enc_hidden = self.encoder(inputs, hidden)

        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([self.word_list.index('<padding>')], 0)

        for t in range(max_length):
            predictions, dec_hidden, attention_weights = self.decoder(dec_input,
                                                                 dec_hidden,
                                                                 enc_out)
            options = 8 if t == 0 else 1
            top_prediction_indices = tf.argsort(predictions[0])[::-1][:options]
            top_confidences = []
            for index in top_prediction_indices:
                top_confidences.append(float(predictions[0][index]))
            top_confidences = np.array(top_confidences)
            predicted_id = softmax_choose(top_prediction_indices, top_confidences)

            result.append(predicted_id)
            if self.word_list[predicted_id] == '<eos>':
                return result

            # the predicted ID is fed back into the model
            dec_input = tf.expand_dims([predicted_id], 0)

        return result

    def decode(self, outputs):
        return " ".join([self.word_list[w] for w in outputs if self.word_list[w] != "<eos>" and self.word_list[w] != "<padding>"])

    def evaluate(self, sentence):
        sentence = preprocess_sentence(sentence)
        inputs = self.encode(sentence)
        outputs = self.predict(inputs)
        response = self.decode(outputs)

        return sentence, response

In [16]:
AlexBot = Bot(word_list, encoder, decoder)

In [115]:
AlexBot.evaluate("hi")

('hi', 'i love you ❤️')

In [42]:
i = 304
print(decode(x_train[i]))
print(decode(y_train[i]))

i got those later and sure sounds great
aw ok we’re really having a hard time spending time together tho aren’t we


## Experiment

Here is the reversed Alex Botexperiment...

In [17]:
# load up the good stuff
conversation_dict = np.load(Path("data") / "conversation_dict_reverse.npy").item()
with open(Path("data") / "word_list_reverse.txt", "rb") as f:
    word_list = pickle.load(f)

In [18]:
def make_train():
    # make x_train and y_train

    # setup empty ones to be filled
    dataset_size = len(conversation_dict)
    x_train = np.zeros((dataset_size, max_length), dtype='int32')
    y_train = np.zeros((dataset_size, max_length), dtype='int32')

    for index,(k,v) in enumerate(conversation_dict.items()):
        # dummy arrays to be filled
        in_msg = np.full((max_length), word_list.index('<padding>'), dtype='int32')
        out_msg = np.full((max_length), word_list.index('<padding>'), dtype='int32')

        # split up the words
        in_words = k.split()[:max_length-1]
        out_words = v.split()[:max_length-1]

        # throw out empty ones
        if (len(in_words) == 0 or len(out_words) == 0):
            continue

        # integerize the strings
        for i, word in enumerate(in_words):
            in_msg[i] = word_list.index(word)
        in_msg[i+1] = word_list.index("<eos>")

        for i, word in enumerate(out_words):
            out_msg[i] = word_list.index(word)
        out_msg[i+1] = word_list.index("<eos>")

        x_train[index] = in_msg
        y_train[index] = out_msg

    # remove completely 0's lines
    x_train = x_train[~np.all(x_train == 0, axis=1)]
    y_train = y_train[~np.all(y_train == 0, axis=1)]
    return x_train, y_train

In [19]:
data_path = Path("data") / "train_reverse.npz"
if data_path.exists():
    with open(data_path, "rb") as f:
        archive = np.load(f)
        x_train, y_train = archive["x_train"], archive["y_train"]
else: 
    x_train, y_train = make_train()
    np.savez(data_path, x_train=x_train, y_train=y_train)

In [20]:
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(len(x_train))
dataset = dataset.batch(batch_size, drop_remainder=True)

vocab_size = len(word_list)

In [21]:
encoder = Encoder(vocab_size, embedding_dim, hidden_units, batch_size)
decoder = Decoder(vocab_size, embedding_dim, hidden_units, batch_size)

In [22]:
optimizer = tf.optimizers.Adam()
loss_object = lambda x, y: tf.keras.losses.sparse_categorical_crossentropy(x, y, 
                                                                           from_logits=True)
def loss_function(real, pred):
    # ignore 0's
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [23]:
checkpoint_dir = './reverse_training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [24]:
checkpoint.restore(checkpoint_dir + "/ckpt-16")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2355e75c9e8>

In [125]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([word_list.index("<padding>")] * batch_size, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [42]:
steps_per_epoch = len(x_train) // batch_size
for epoch in range(epochs):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.4804
Epoch 1 Batch 100 Loss 2.9065
Epoch 1 Batch 200 Loss 3.1111
Epoch 1 Batch 300 Loss 2.5116
Epoch 1 Batch 400 Loss 2.6553
Epoch 1 Loss 2.8190
Time taken for 1 epoch 113.67062044143677 sec

Epoch 2 Batch 0 Loss 2.4899
Epoch 2 Batch 100 Loss 2.6109
Epoch 2 Batch 200 Loss 2.8259
Epoch 2 Batch 300 Loss 2.2920
Epoch 2 Batch 400 Loss 2.4332
Epoch 2 Loss 2.5038
Time taken for 1 epoch 117.27589726448059 sec

Epoch 3 Batch 0 Loss 2.2936
Epoch 3 Batch 100 Loss 2.4329
Epoch 3 Batch 200 Loss 2.6602
Epoch 3 Batch 300 Loss 2.2026
Epoch 3 Batch 400 Loss 2.3040
Epoch 3 Loss 2.3616
Time taken for 1 epoch 116.63586235046387 sec

Epoch 4 Batch 0 Loss 2.1633
Epoch 4 Batch 100 Loss 2.3184
Epoch 4 Batch 200 Loss 2.5264
Epoch 4 Batch 300 Loss 2.1084
Epoch 4 Batch 400 Loss 2.1919
Epoch 4 Loss 2.2477
Time taken for 1 epoch 127.47735953330994 sec

Epoch 5 Batch 0 Loss 2.0394
Epoch 5 Batch 100 Loss 2.1986
Epoch 5 Batch 200 Loss 2.3757
Epoch 5 Batch 300 Loss 2.0080
Epoch 5 Batch 400 Loss

In [126]:
OtherBot = Bot(word_list, encoder, decoder)

In [131]:
OtherBot.evaluate("oh well drawing and painting has a teacher around")

('oh well drawing and painting has a teacher around',
 "don't think i could teleport that")

In [132]:
AlexBot.evaluate("hi")

('hi',
 'know when he not 2 years and a lot and when he not 2 years and a lot and when')

In [64]:
i = 120
print(decode(x_train[i]))

print(decode(y_train[i]))

oh well drawing and painting has a teacher around
you noticed that okay xd


<__main__.Encoder at 0x2355e9c4d30>