<a href="https://colab.research.google.com/github/anthonyhughes/chatbot_sequence2sequence/blob/main/seq_2_seq_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import all necessary resources

In [1]:
!pip install tensorflow



In [2]:
!pip install tensorlayer

Collecting tensorlayer
  Downloading tensorlayer-2.2.3-py3-none-any.whl (363 kB)
[K     |████████████████████████████████| 363 kB 4.7 MB/s 
Collecting progressbar2>=3.39.3
  Downloading progressbar2-4.0.0-py2.py3-none-any.whl (26 kB)
Collecting imageio>=2.5.0
  Downloading imageio-2.13.5-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 51.3 MB/s 
Collecting pillow>=8.3.2
  Downloading Pillow-9.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 46.0 MB/s 
Installing collected packages: pillow, imageio, progressbar2, tensorlayer
  Attempting uninstall: pillow
    Found existing installation: Pillow 7.1.2
    Uninstalling Pillow-7.1.2:
      Successfully uninstalled Pillow-7.1.2
  Attempting uninstall: imageio
    Found existing installation: imageio 2.4.1
    Uninstalling imageio-2.4.1:
      Successfully uninstalled imageio-2.4.1
  Attempting uninstall: progressbar2
    Found existing insta

In [3]:
!pip install numpy



In [4]:
import tensorflow as tf
import tensorlayer as tl
import numpy as np
from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
from tqdm import tqdm
from sklearn.utils import shuffle
from tensorlayer.models.seq2seq import Seq2seq
from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention
import os
import pickle

In [6]:
# Mount Google Drive at /content/drive (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
PATH='drive/MyDrive/wvh_cl_portfolio_chatbot/'

def load_data(path=None):
    """Load the vocabulary metadata and the encoded question/answer arrays.

    Args:
        path: Directory containing ``metadata.pkl``, ``idx_q.npy`` and
            ``idx_a.npy``. Defaults to the module-level ``PATH``. A trailing
            path separator is optional.

    Returns:
        A ``(metadata, idx_q, idx_a)`` tuple. ``metadata`` is the unpickled
        object, or ``None`` if the pickle could not be read (the error is
        printed). ``idx_q`` and ``idx_a`` are the arrays returned by
        ``np.load``.

    Raises:
        Any exception ``np.load`` raises for a missing or unreadable ``.npy``
        file; only the metadata read is best-effort.
    """
    if path is None:
        path = PATH

    # read data control dictionaries
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point this at a metadata.pkl you created or otherwise trust.
    try:
        with open(os.path.join(path, 'metadata.pkl'), 'rb') as f:
            metadata = pickle.load(f)
    except Exception as e:
        # Best effort: report the problem and carry on without metadata.
        # Callers that index into metadata must handle the None case.
        print(e)
        metadata = None

    # read numpy arrays
    idx_q = np.load(os.path.join(path, 'idx_q.npy'))
    idx_a = np.load(os.path.join(path, 'idx_a.npy'))
    return metadata, idx_q, idx_a

metadata, idx_q, idx_a = load_data()

In [10]:
def split_dataset(x, y, ratio=(0.7, 0.15, 0.15)):
    """Split parallel sequences into train, test and validation parts.

    Args:
        x: Sliceable sequence of inputs (list, numpy array, ...).
        y: Sliceable sequence of targets, aligned with ``x``.
        ratio: Fractions for train, test and validation. The first two
            entries and the last entry are used. Defaults to
            70% / 15% / 15%.

    Returns:
        ``((trainX, trainY), (testX, testY), (validX, validY))``.

    Each part's size is ``int(len(x) * fraction)``. Train and test are taken
    from the front and validation from the end, so truncation can leave a few
    examples between test and validation unused.
    """
    # number of examples
    data_len = len(x)
    lens = [int(data_len * item) for item in ratio]

    train_end = lens[0]
    test_end = train_end + lens[1]
    # Slice from an explicit start index rather than x[-k:]: when k is 0,
    # x[-0:] is the WHOLE sequence, which would make the validation set
    # overlap the training data on small inputs.
    valid_start = max(data_len - lens[-1], 0)

    trainX, trainY = x[:train_end], y[:train_end]
    testX, testY = x[train_end:test_end], y[train_end:test_end]
    validX, validY = x[valid_start:], y[valid_start:]

    return (trainX, trainY), (testX, testY), (validX, validY)

split_set = split_dataset(idx_q, idx_a)

In [11]:
def initial_setup():
    """Load the dataset, split it, and strip padding from every split.

    Returns:
        ``(metadata, trainX, trainY, testX, testY, validX, validY)`` where
        each X/Y entry is the output of ``tl.prepro.remove_pad_sequences``
        applied to that split converted to nested lists.
    """
    metadata, questions, answers = load_data()
    strip_padding = tl.prepro.remove_pad_sequences

    # One (X, Y) pair per split, processed in train -> test -> valid order.
    (trainX, trainY), (testX, testY), (validX, validY) = [
        (strip_padding(part_x.tolist()), strip_padding(part_y.tolist()))
        for part_x, part_y in split_dataset(questions, answers)
    ]
    return metadata, trainX, trainY, testX, testY, validX, validY
    
#data preprocessing
metadata, trainX, trainY, testX, testY, validX, validY = initial_setup()

In [12]:
# Parameters
# Number of training input sequences and training target sequences.
src_len = len(trainX)
tgt_len = len(trainY)

# Inputs and targets must be paired one-to-one.
assert src_len == tgt_len

In [13]:
# Examples per minibatch.
batch_size = 32
# Whole batches per epoch (floor division drops any remainder).
n_step = src_len // batch_size
src_vocab_size = len(metadata['idx2w']) # 8002 (0~8001)
# Embedding width; passed as embedding_size when the Embedding layer is built.
emb_dim = 1024

In [14]:
# Vocabulary lookups taken from the loaded metadata, plus the ids of the
# 'unk' and '_' entries used below as unk_id and pad_id.
word2idx = metadata['w2idx']   # dict  word 2 index
idx2word = metadata['idx2w']   # list index 2 word
unk_id = word2idx['unk']   # 1
pad_id = word2idx['_']     # 0

In [15]:
# Ids for the two special tokens, placed directly after the existing vocabulary.
start_id = src_vocab_size  # 8002
end_id = src_vocab_size + 1  # 8003

In [16]:
# Register the two special tokens in both lookup directions.
# NOTE: word2idx is the very dict stored in metadata['w2idx'], so this update
# also changes metadata; dict.update is safe to repeat.
word2idx.update({'start_id': start_id, 'end_id': end_id})
# Build from the untouched metadata['idx2w'] list instead of extending
# idx2word / src_vocab_size in terms of themselves, so re-running this cell
# does not append the tokens twice or grow the vocabulary size by 2 each time.
idx2word = metadata['idx2w'] + ['start_id', 'end_id']
src_vocab_size = tgt_vocab_size = len(idx2word)
num_epochs = 1
vocabulary_size = src_vocab_size

In [17]:
 def inference(seed, top_n, model):
     """Generate a reply to ``seed`` and return it as a list of words.

     The seed is split on single spaces and each token is mapped through
     ``word2idx`` (unknown tokens become ``unk_id``). The model is switched
     to eval mode, asked for a sequence of length 20 starting from
     ``start_id``, and the result is mapped back through ``idx2word`` up to,
     but not including, the first 'end_id' word.
     """
     model.eval()
     token_ids = [word2idx.get(token, unk_id) for token in seed.split(" ")]
     decoded = model(inputs=[[token_ids]], seq_length=20, start_token=start_id, top_n=top_n)

     words = []
     for token_id in decoded[0]:
         word = idx2word[token_id]
         if word == 'end_id':
             break
         words.append(word)
     return words

In [18]:
# Length the decoder-side sequences are padded to during training.
decoder_seq_length = 20
# TensorLayer Seq2seq model using GRU cells for both encoder and decoder.
# The embedding layer covers vocabulary_size entries (the base vocabulary plus
# the start/end tokens) with emb_dim-wide vectors.
model_ = Seq2seq(
        decoder_seq_length = decoder_seq_length,
        cell_enc=tf.keras.layers.GRUCell,
        cell_dec=tf.keras.layers.GRUCell,
        n_layer=3,
        n_units=256,
        embedding_layer=tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim),
        )

[TL] Embedding embedding_1: (8004, 1024)
[TL] RNN rnn_1: cell: GRUCell, n_units: 256
[TL] RNN rnn_2: cell: GRUCell, n_units: 256
[TL] RNN rnn_3: cell: GRUCell, n_units: 256
[TL] RNN rnn_4: cell: GRUCell, n_units: 256
[TL] RNN rnn_5: cell: GRUCell, n_units: 256
[TL] RNN rnn_6: cell: GRUCell, n_units: 256
[TL] Reshape reshape_1
[TL] Dense  dense_1: 8004 No Activation
[TL] Reshape reshape_2
[TL] Reshape reshape_3


In [19]:
# Adam optimizer; train_chatbot reads this module-level name.
optimizer = tf.optimizers.Adam(learning_rate=0.001)
# Switch the model to training mode.
model_.train()

In [20]:
def train_chatbot(trainX, trainY, model):
    """Train ``model`` on paired sequences for ``num_epochs`` epochs.

    Each epoch puts the model in training mode, shuffles the pairs, runs one
    optimizer step per minibatch and prints the mean batch loss.

    Args:
        trainX: Encoder input sequences (lists of token ids).
        trainY: Target sequences aligned one-to-one with ``trainX``.
        model: Model callable as ``model(inputs=[encoder_ids, decoder_ids])``
            that exposes ``train()`` and ``all_weights``.

    Returns:
        The same ``model`` object, after training.

    Reads the module-level names ``num_epochs``, ``batch_size``,
    ``decoder_seq_length``, ``start_id``, ``end_id``, ``vocabulary_size`` and
    ``optimizer``.
    """
    # Progress-bar total derived from the data actually passed in, rather
    # than from a global computed on some other dataset.
    steps_per_epoch = len(trainX) // batch_size

    for epoch in range(num_epochs):
        model.train()
        trainX, trainY = shuffle(trainX, trainY, random_state=0)
        total_loss, n_iter = 0, 0

        batches = tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False)
        for X, Y in tqdm(batches, total=steps_per_epoch,
                         desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs), leave=False):

            X = tl.prepro.pad_sequences(X)
            # Targets are Y followed by end_id; decoder inputs are Y preceded
            # by start_id. Both are padded to decoder_seq_length.
            _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
            _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=decoder_seq_length)
            _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
            _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=decoder_seq_length)
            _target_mask = tl.prepro.sequences_get_mask(_target_seqs)

            with tf.GradientTape() as tape:
                ## compute outputs
                output = model(inputs=[X, _decode_seqs])
                output = tf.reshape(output, [-1, vocabulary_size])

                ## compute loss
                loss = cross_entropy_seq_with_mask(logits=output, target_seqs=_target_seqs, input_mask=_target_mask)

            ## update model -- done outside the tape context so the gradient
            ## and optimizer ops are not themselves recorded on the tape
            grad = tape.gradient(loss, model.all_weights)
            optimizer.apply_gradients(zip(grad, model.all_weights))

            total_loss += loss
            n_iter += 1

        if n_iter == 0:
            # Nothing was trained this epoch; avoid dividing by zero below.
            print('Epoch [{}/{}]: no batches (batch_size {} exceeds {} examples)'.format(
                epoch + 1, num_epochs, batch_size, len(trainX)))
            continue

        # printing average loss after every epoch
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter))
    return model

In [21]:
seeds = ["how are you?", "donald trump is terrible"]

In [24]:
model_ = train_chatbot(trainX, trainY, model_)

                                                                 

Epoch [1/1]: loss 5.5761




In [25]:
def save_model(model):
    """Save ``model.all_weights`` to ``model.npz`` under ``PATH`` via TensorLayer."""
    weights_file = PATH + 'model.npz'
    tl.files.save_npz(model.all_weights, name=weights_file)

save_model(model_)

[TL] [*] Saving TL weights into drive/MyDrive/wvh_cl_portfolio_chatbot/model.npz


  return array(a, dtype, copy=False, order=order, subok=True)


[TL] [*] Saved
