<a href="https://colab.research.google.com/github/anthonyhughes/chatbot_sequence2sequence/blob/main/seq_2_seq_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install all necessary dependencies

In [60]:
!pip install tensorflow



In [61]:
!pip install tensorlayer



In [62]:
!pip install numpy



In [63]:
import tensorflow as tf
import tensorlayer as tl
import numpy as np
from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
from tqdm import tqdm
from sklearn.utils import shuffle
from tensorlayer.models.seq2seq import Seq2seq
from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention
import os
import pickle

In [8]:
# Colab only: mount Google Drive at /content/drive.
# NOTE(review): PATH is a relative 'drive/MyDrive/...' path, so the data files are presumably
# reached through this mount with /content as the working directory — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [64]:
# Directory (relative to the current working directory) that holds metadata.pkl, idx_q.npy,
# idx_a.npy and the saved model.npz. Keep the trailing slash: file names are appended to it
# by plain string concatenation elsewhere in the notebook.
PATH='drive/MyDrive/wvh_cl_portfolio_chatbot/'

def load_data(path=None):
    """Load the vocabulary metadata and the index-encoded question/answer arrays.

    Args:
        path: Directory containing ``metadata.pkl``, ``idx_q.npy`` and ``idx_a.npy``.
            When omitted, the notebook-level ``PATH`` is used. A trailing path
            separator is optional.

    Returns:
        A ``(metadata, idx_q, idx_a)`` tuple. ``metadata`` is the unpickled object,
        or ``None`` when the pickle could not be read (the error is printed and
        loading continues, best effort). ``idx_q`` and ``idx_a`` are the arrays
        returned by ``np.load`` for the two ``.npy`` files.

    Raises:
        Whatever ``np.load`` raises when an ``.npy`` file is missing or unreadable;
        only the metadata read is best effort.
    """
    base_dir = PATH if path is None else path

    # Read the data control dictionaries.
    # SECURITY: pickle.load can execute arbitrary code — only load trusted files.
    try:
        with open(os.path.join(base_dir, 'metadata.pkl'), 'rb') as f:
            metadata = pickle.load(f)
    except Exception as e:
        # Deliberately non-fatal: report the problem and let the caller see None.
        print(e)
        metadata = None

    # Read the numpy arrays.
    idx_q = np.load(os.path.join(base_dir, 'idx_q.npy'))
    idx_a = np.load(os.path.join(base_dir, 'idx_a.npy'))
    return metadata, idx_q, idx_a

# Load the raw dataset into notebook-level variables.
# NOTE(review): initial_setup() calls load_data() again, so this call looks redundant — confirm.
metadata, idx_q, idx_a = load_data()

In [10]:
def split_dataset(x, y, ratio=(0.7, 0.3)):
    """Split parallel sequences into contiguous train and test portions.

    No shuffling is performed: the first part of the data becomes the training
    split and the part immediately after it becomes the test split.

    Args:
        x: Sliceable sequence of inputs (list, NumPy array, ...).
        y: Sliceable sequence of targets, parallel to ``x``.
        ratio: Fractions of ``len(x)`` for the train and test splits, in that
            order. Each fraction is turned into a count with ``int()``
            (truncation), so a few trailing samples can end up in neither split.

    Returns:
        The tuple ``((trainX, trainY), (testX, testY))``.
    """
    # number of examples
    data_len = len(x)
    # number of examples in each split
    n_train = int(data_len * ratio[0])
    n_test = int(data_len * ratio[1])
    test_end = n_train + n_test

    # create the data splits necessary for training and testing
    trainX, trainY = x[:n_train], y[:n_train]
    testX, testY = x[n_train:test_end], y[n_train:test_end]

    return (trainX, trainY), (testX, testY)

In [11]:
def initial_setup():
    """Load the dataset, split it into train/test and strip sequence padding.

    Returns:
        ``(metadata, trainX, trainY, testX, testY)`` where the four data items are
        the results of ``tl.prepro.remove_pad_sequences`` applied to each split
        (converted to plain Python lists first).
    """
    metadata, questions, answers = load_data()
    train_pair, test_pair = split_dataset(questions, answers)

    # Same treatment for every split, in the order train X/Y then test X/Y.
    trainX, trainY, testX, testY = [
        tl.prepro.remove_pad_sequences(part.tolist())
        for part in (train_pair[0], train_pair[1], test_pair[0], test_pair[1])
    ]
    return metadata, trainX, trainY, testX, testY
    
# Data preprocessing: load the dataset, split it into train/test and strip the padding
# from every sequence (see initial_setup).
metadata, trainX, trainY, testX, testY = initial_setup()

In [65]:
# Parameters
src_len = len(trainX)
tgt_len = len(trainY)

assert src_len == tgt_len

In [66]:
# Number of examples per optimisation step.
batch_size = 32
# Steps (mini-batches) per epoch, used as the progress-bar total during training.
# tl.iterate.minibatches only yields full batches by default, hence the floor division.
n_step = src_len // batch_size
# Vocabulary size before the special start/end tokens are added.
src_vocab_size = len(metadata['idx2w'])
# Width of the word-embedding vectors.
emb_dim = 1024

In [67]:
word2idx = metadata['w2idx']   # dict: word -> index
idx2word = metadata['idx2w']   # list: index -> word
unk_id = word2idx['unk']   # id of the unknown-word token (original note says 1 — not verifiable here)
pad_id = word2idx['_']   # id of the '_' token, presumably the padding symbol — TODO confirm

In [68]:
# Ids of the two special decoder tokens: the first two indices after the base vocabulary.
# Derived from the untouched metadata list (not from src_vocab_size, which a later cell
# increases) so that re-running this cell always yields the same ids.
start_id = len(metadata['idx2w'])
end_id = start_id + 1

In [80]:
# Register the special start/end tokens in both lookup tables.
# Everything is rebuilt from the base vocabulary in `metadata`, so re-running this cell
# does not append the tokens twice or keep growing the vocabulary size.
word2idx.update({'start_id': start_id, 'end_id': end_id})
idx2word = list(metadata['idx2w']) + ['start_id', 'end_id']
src_vocab_size = tgt_vocab_size = len(idx2word)
# Number of passes over the training data.
num_epochs = 50
vocabulary_size = src_vocab_size

In [81]:
# Decoder sequence length handed to the model; the training loop also pads its target and
# decoder-input sequences with maxlen=decoder_seq_length.
decoder_seq_length = 20
# Seq2seq model with GRU cells for encoder and decoder (n_layer=3, n_units=256) on top of
# a word-embedding layer of shape (vocabulary_size, emb_dim).
model_ = Seq2seq(
        decoder_seq_length = decoder_seq_length,
        cell_enc=tf.keras.layers.GRUCell,
        cell_dec=tf.keras.layers.GRUCell,
        n_layer=3,
        n_units=256,
        embedding_layer=tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim),
        )

[TL] Embedding embedding_6: (8008, 1024)
[TL] RNN rnn_31: cell: GRUCell, n_units: 256
[TL] RNN rnn_32: cell: GRUCell, n_units: 256
[TL] RNN rnn_33: cell: GRUCell, n_units: 256
[TL] RNN rnn_34: cell: GRUCell, n_units: 256
[TL] RNN rnn_35: cell: GRUCell, n_units: 256
[TL] RNN rnn_36: cell: GRUCell, n_units: 256
[TL] Reshape reshape_16
[TL] Dense  dense_6: 8008 No Activation
[TL] Reshape reshape_17
[TL] Reshape reshape_18


In [29]:
def train_chatbot(trainX, trainY, model, optimizer=None):
    """Train the seq2seq model on question/answer pairs and return it.

    For every mini-batch the decoder is fed the answer prefixed with ``start_id``
    and is trained to predict the answer suffixed with ``end_id``; padded
    positions are excluded from the loss through a mask.

    Relies on the notebook-level settings ``num_epochs``, ``batch_size``,
    ``decoder_seq_length``, ``start_id``, ``end_id`` and ``vocabulary_size``.

    Args:
        trainX: List of encoder input id sequences.
        trainY: List of target id sequences, parallel to ``trainX``.
        model: Model invoked as ``model(inputs=[X, decode_sequences])`` and exposing
            ``train()`` and ``all_weights`` (the notebook passes a TensorLayer
            ``Seq2seq``).
        optimizer: Optimizer providing ``apply_gradients``. When omitted, a new
            ``tf.optimizers.Adam(learning_rate=0.001)`` is created, so the function
            no longer depends on a global defined in a later cell.

    Returns:
        The same ``model`` object, after training.
    """
    if optimizer is None:
        optimizer = tf.optimizers.Adam(learning_rate=0.001)

    # minibatches() yields full batches only, so this is the real step count per epoch.
    steps_per_epoch = len(trainX) // batch_size

    for epoch in range(num_epochs):
        model.train()
        # NOTE: the fixed random_state applies the same permutation to the current order
        # on every epoch.
        trainX, trainY = shuffle(trainX, trainY, random_state=0)
        total_loss, n_iter = 0, 0

        batches = tl.iterate.minibatches(
            inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False)
        for X, Y in tqdm(batches, total=steps_per_epoch,
                         desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs), leave=False):

            # pad the encoder inputs of this batch to a common length
            X = tl.prepro.pad_sequences(X)
            # targets: answer + end_id, padded to the decoder length
            target_sequences = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
            target_sequences = tl.prepro.pad_sequences(target_sequences, maxlen=decoder_seq_length)
            # decoder inputs: start_id + answer, padded to the decoder length
            decode_sequences = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
            decode_sequences = tl.prepro.pad_sequences(decode_sequences, maxlen=decoder_seq_length)
            # mask so that padded target positions do not contribute to the loss
            target_mask = tl.prepro.sequences_get_mask(target_sequences)

            with tf.GradientTape() as tape:
                # compute outputs
                output = model(inputs=[X, decode_sequences])

                # flatten to (positions, vocabulary) for the loss computation
                output = tf.reshape(output, [-1, vocabulary_size])

                # masked sequence cross-entropy against the end_id-terminated targets
                loss = cross_entropy_seq_with_mask(
                    logits=output, target_seqs=target_sequences, input_mask=target_mask)

            # Gradients and the weight update are computed outside the tape context so the
            # tape does not record the backward pass itself.
            grad = tape.gradient(loss, model.all_weights)
            optimizer.apply_gradients(zip(grad, model.all_weights))

            total_loss += loss
            n_iter += 1

        # printing average loss after every epoch
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter))
    return model

In [30]:
# Run the training loop; train_chatbot returns the model object it was given.
model_ = train_chatbot(trainX, trainY, model_)



Epoch [1/16]: loss 5.6222




Epoch [2/16]: loss 4.9761




Epoch [3/16]: loss 4.7179




Epoch [4/16]: loss 4.5122




Epoch [5/16]: loss 4.3225




Epoch [6/16]: loss 4.1413




Epoch [7/16]: loss 3.9706




Epoch [8/16]: loss 3.8097




Epoch [9/16]: loss 3.6613




Epoch [10/16]: loss 3.5287




Epoch [11/16]: loss 3.4112




Epoch [12/16]: loss 3.3088




Epoch [13/16]: loss 3.2179




Epoch [14/16]: loss 3.1417




Epoch [15/16]: loss 3.0753


                                                                 

Epoch [16/16]: loss 3.0166




In [88]:
def inference(seed, top_n, model):
    """Generate a reply to ``seed`` and return it as a list of words.

    Args:
        seed: Input sentence; it is split on single spaces and every piece is looked
            up in ``word2idx`` (unknown pieces map to ``unk_id``).
        top_n: Forwarded to the model as its ``top_n`` argument.
        model: Model called in eval mode with ``seq_length=20`` and
            ``start_token=start_id``.

    Returns:
        The generated words, in order, up to (not including) the first ``'end_id'``.
    """
    model.eval()
    token_ids = [word2idx.get(token, unk_id) for token in seed.split(" ")]
    generated = model(inputs=[[token_ids]], seq_length=20, start_token=start_id, top_n = top_n)

    words = []
    for token_id in generated[0]:
        word = idx2word[token_id]
        # stop at the end-of-sentence marker
        if word == 'end_id':
            break
        words.append(word)
    return words

In [31]:
def save_model(model):
    """Save all weights of ``model`` to ``PATH + 'model.npz'`` via ``tl.files.save_npz``."""
    weights_file = PATH + 'model.npz'
    tl.files.save_npz(model.all_weights, name=weights_file)

# Persist the current weights of model_ to model.npz under PATH.
save_model(model_)

[TL] [*] Saving TL weights into drive/MyDrive/wvh_cl_portfolio_chatbot/model.npz
[TL] [*] Saved


  return array(a, dtype, copy=False, order=order, subok=True)


In [82]:
# Adam optimizer with learning rate 0.001.
# NOTE(review): this cell sits after the training call in the notebook; any code that uses
# the global `optimizer` during training needs it to be defined first on a fresh kernel.
optimizer = tf.optimizers.Adam(learning_rate=0.001)
# Put the model in training mode (inference() switches it to eval mode itself).
model_.train()

In [None]:
# Read the weights from the same model.npz file that save_model() writes and assign them
# to the existing model_ instance.
load_weights = tl.files.load_npz(name=PATH + 'model.npz')
tl.files.assign_weights(load_weights, model_)

In [104]:
# Example queries to send to the trained chatbot.
seeds = ["who do you think you are?"]
# For each seed, generate and print top_n (3) responses from the model
for seed in seeds:
    print("Query >", seed)
    # top_n is both the number of replies printed and the top_n value passed to inference()
    top_n = 3
    for i in range(top_n):
        sentence = inference(seed, top_n, model_)
        print(" >", ' '.join(sentence))

Query > who do you think you are?
 > monica you can do it tomorrow
 > monica because i won the debate
 > monica and i think paul is a big boy
