In [1]:
import tensorflow as tf 
import numpy as np
import pickle
import os
import time
import re
from pathlib import Path

In [2]:
# load up the good stuff
conversation_dict = np.load(Path("data") / "conversation_dict.npy").item()
with open(Path("data") / "word_list.txt", "rb") as f:
    word_list = pickle.load(f)

In [3]:
# define constants here
max_length = 20
batch_size = 32
vocab_size = len(word_list)
embedding_dim = 256
hidden_units = 1024
attention_units = 16
epochs = 32

In [4]:
def make_train():
    # make x_train and y_train

    # setup empty ones to be filled
    dataset_size = len(conversation_dict)
    x_train = np.zeros((dataset_size, max_length), dtype='int32')
    y_train = np.zeros((dataset_size, max_length), dtype='int32')

    for index,(k,v) in enumerate(conversation_dict.items()):
        # dummy arrays to be filled
        in_msg = np.full((max_length), word_list.index('<padding>'), dtype='int32')
        out_msg = np.full((max_length), word_list.index('<padding>'), dtype='int32')

        # split up the words
        in_words = k.split()[:max_length-1]
        out_words = v.split()[:max_length-1]

        # throw out empty ones
        if (len(in_words) == 0 or len(out_words) == 0):
            continue

        # integerize the strings
        for i, word in enumerate(in_words):
            in_msg[i] = word_list.index(word)
        in_msg[i+1] = word_list.index("<eos>")

        for i, word in enumerate(out_words):
            out_msg[i] = word_list.index(word)
        out_msg[i+1] = word_list.index("<eos>")

        x_train[index] = in_msg
        y_train[index] = out_msg

    # remove completely 0's lines
    x_train = x_train[~np.all(x_train == 0, axis=1)]
    y_train = y_train[~np.all(y_train == 0, axis=1)]
    return x_train, y_train

In [5]:
data_path = Path("data") / "train.npz"
if data_path.exists():
    with open(data_path, "rb") as f:
        archive = np.load(f)
        x_train, y_train = archive["x_train"], archive["y_train"]
else: 
    x_train, y_train = make_train()
    np.savez(data_path, x_train=x_train, y_train=y_train)

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(len(x_train))
dataset = dataset.batch(batch_size, drop_remainder=True)

In [7]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [8]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, hidden_size)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [9]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [10]:
encoder = Encoder(vocab_size, embedding_dim, hidden_units, batch_size)
decoder = Decoder(vocab_size, embedding_dim, hidden_units, batch_size)

In [11]:
optimizer = tf.optimizers.Adam()
loss_object = lambda x, y: tf.keras.losses.sparse_categorical_crossentropy(x, y, 
                                                                           from_logits=True)
def loss_function(real, pred):
    # ignore 0's
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [12]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [13]:
checkpoint.restore(checkpoint_dir + "/ckpt-17")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x24e4abd1ef0>

In [14]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([word_list.index("<padding>")] * batch_size, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [15]:
steps_per_epoch = len(x_train) // batch_size
for epoch in range(epochs):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.1523
Epoch 1 Batch 100 Loss 0.1163
Epoch 1 Batch 200 Loss 0.1835
Epoch 1 Batch 300 Loss 0.2604
Epoch 1 Batch 400 Loss 0.2600
Epoch 1 Loss 0.1850
Time taken for 1 epoch 165.42896008491516 sec

Epoch 2 Batch 0 Loss 0.1067
Epoch 2 Batch 100 Loss 0.1196
Epoch 2 Batch 200 Loss 0.1207
Epoch 2 Batch 300 Loss 0.1411
Epoch 2 Batch 400 Loss 0.1308
Epoch 2 Loss 0.1234
Time taken for 1 epoch 117.87940430641174 sec

Epoch 3 Batch 0 Loss 0.0787
Epoch 3 Batch 100 Loss 0.0696
Epoch 3 Batch 200 Loss 0.0593
Epoch 3 Batch 300 Loss 0.0942
Epoch 3 Batch 400 Loss 0.0824
Epoch 3 Loss 0.0766
Time taken for 1 epoch 109.97018623352051 sec

Epoch 4 Batch 0 Loss 0.0458
Epoch 4 Batch 100 Loss 0.0472
Epoch 4 Batch 200 Loss 0.0514
Epoch 4 Batch 300 Loss 0.0621
Epoch 4 Batch 400 Loss 0.0746
Epoch 4 Loss 0.0591
Time taken for 1 epoch 111.27615475654602 sec

Epoch 5 Batch 0 Loss 0.0520


KeyboardInterrupt: 

In [16]:
def preprocess_sentence(msg):
    msg = msg.replace('\n', ' ').lower()
    msg = msg.replace("\xc2\xa0", "")
    msg = re.sub('([\(\).,!?])', "", msg)
    msg = re.sub(" +"," ", msg)
    return msg

def softmax_choose(a, weights):
    exps = np.exp(weights - np.max(weights))
    scaled_exps = exps / np.sum(exps)
    return np.random.choice(a, p=scaled_exps)

In [23]:
def encode(sentence):
    inputs = [word_list.index(i) for i in sentence.split(' ') if i in word_list]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)
    return inputs

def predict(inputs):
    result = []

    hidden = [tf.zeros((1, hidden_units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([word_list.index('<padding>')], 0)

    for t in range(max_length):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        options = 8 if t == 0 else 1
        top_prediction_indices = tf.argsort(predictions[0])[::-1][:options]
        top_confidences = []
        for index in top_prediction_indices:
            top_confidences.append(float(predictions[0][index]))
        top_confidences = np.array(top_confidences)
        predicted_id = softmax_choose(top_prediction_indices, top_confidences)
        
        result.append(predicted_id)
        if word_list[predicted_id] == '<eos>':
            return result

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result

def decode(outputs):
    return " ".join([word_list[w] for w in outputs if word_list[w] != "<eos>" and word_list[w] != "<padding>"])

In [24]:
def evaluate(sentence):
    sentence = preprocess_sentence(sentence)
    inputs = encode(sentence)
    outputs = predict(inputs)
    response = decode(outputs)

    return sentence, response

In [54]:
evaluate("hi")

('hi', '❤️ that’s such a little hard to')

In [42]:
i = 304
print(decode(x_train[i]))
print(decode(y_train[i]))

i got those later and sure sounds great
aw ok we’re really having a hard time spending time together tho aren’t we
