**Declaration**: Most code of this work is from https://github.com/tensorlayer/seq2seq-chatbot. I changed some of data processing code and modified the model in order to make it work on my laptop. An important goal of this final project is to compare the impacts of pretrained embedding and trained-from-scratch embedding to the results. My work is mainly focused on introducing the GloVe embedding matrix, in which I have to modify the code from data prepratation and totally rewrite the embedding layer part of the model. 

In [3]:
import random
import nltk
import itertools
from collections import defaultdict
import numpy as np
import pickle

In [4]:
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz ' # space is included in whitelist
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''
limit = {'maxq' : 25, 'minq' : 2, 'maxa' : 25, 'mina' : 2}
UNK = 'unk'
VOCAB_SIZE = 8000

# Data Preparation

In [5]:
def get_id2line():
    lines=open('/home/ubuntu/nlp_data/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            id2line[_line[0]] = _line[4]
    return id2line

def get_conversations():
    conv_lines = open('/home/ubuntu/nlp_data/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')
    convs = [ ]
    for line in conv_lines[:-1]:
        _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
        convs.append(_line.split(','))
    return convs

def extract_conversations(convs,id2line,path=''):
    idx = 0
    for conv in convs:
        f_conv = open(path + str(idx)+'.txt', 'w')
        for line_id in conv:
            f_conv.write(id2line[line_id])
            f_conv.write('\n')
        f_conv.close()
        idx += 1
        
def gather_dataset(convs, id2line):
    questions = []; answers = []
    for conv in convs:
        if len(conv) %2 != 0:
            conv = conv[:-1]
        for i in range(len(conv)):
            if i%2 == 0:
                questions.append(id2line[conv[i]])
            else:
                answers.append(id2line[conv[i]])

    return questions, answers

def prepare_seq2seq_files(questions, answers, path='',TESTSET_SIZE = 30000):

    # open files
    train_enc = open(path + 'train.enc','w')
    train_dec = open(path + 'train.dec','w')
    test_enc  = open(path + 'test.enc', 'w')
    test_dec  = open(path + 'test.dec', 'w')

    # choose 30,000 (TESTSET_SIZE) items to put into testset
    test_ids = random.sample([i for i in range(len(questions))],TESTSET_SIZE)

    for i in range(len(questions)):
        if i in test_ids:
            test_enc.write(questions[i]+'\n')
            test_dec.write(answers[i]+ '\n' )
        else:
            train_enc.write(questions[i]+'\n')
            train_dec.write(answers[i]+ '\n' )
        if i%10000 == 0:
            print('\n>> written {} lines'.format(i))

    # close files
    train_enc.close()
    train_dec.close()
    test_enc.close()
    test_dec.close()


def filter_line(line, whitelist):
    return ''.join([ ch for ch in line if ch in whitelist ])

def filter_data(qseq, aseq):
    filtered_q, filtered_a = [], []
    raw_data_len = len(qseq)

    assert len(qseq) == len(aseq)

    for i in range(raw_data_len):
        qlen, alen = len(qseq[i].split(' ')), len(aseq[i].split(' '))
        if qlen >= limit['minq'] and qlen <= limit['maxq']:
            if alen >= limit['mina'] and alen <= limit['maxa']:
                filtered_q.append(qseq[i])
                filtered_a.append(aseq[i])

    # print the fraction of the original data, filtered
    filt_data_len = len(filtered_q)
    filtered = int((raw_data_len - filt_data_len)*100/raw_data_len)
    print(str(filtered) + '% filtered from original data')

    return filtered_q, filtered_a

'''
 read list of words, create index to word,
  word to index dictionaries
    return tuple( vocab->(word, count), idx2w, w2idx )
'''
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist

'''
 filter based on number of unknowns (words not in vocabulary)
  filter out the worst sentences
'''
def filter_unk(qtokenized, atokenized, w2idx):
    data_len = len(qtokenized)

    filtered_q, filtered_a = [], []

    for qline, aline in zip(qtokenized, atokenized):
        unk_count_q = len([ w for w in qline if w not in w2idx ])
        unk_count_a = len([ w for w in aline if w not in w2idx ])
        if unk_count_a <= 2:
            if unk_count_q > 0:
                if unk_count_q/len(qline) > 0.2:
                    pass
            filtered_q.append(qline)
            filtered_a.append(aline)

    # print the fraction of the original data, filtered
    filt_data_len = len(filtered_q)
    filtered = int((data_len - filt_data_len)*100/data_len)
    print(str(filtered) + '% filtered from original data')

    return filtered_q, filtered_a




'''
 create the final dataset :
  - convert list of items to arrays of indices
  - add zero padding
      return ( [array_en([indices]), array_ta([indices]) )
'''
def zero_pad(qtokenized, atokenized, w2idx):
    # num of rows
    data_len = len(qtokenized)

    # numpy arrays to store indices
    idx_q = np.zeros([data_len, limit['maxq']], dtype=np.int32)
    idx_a = np.zeros([data_len, limit['maxa']], dtype=np.int32)

    for i in range(data_len):
        q_indices = pad_seq(qtokenized[i], w2idx, limit['maxq'])
        a_indices = pad_seq(atokenized[i], w2idx, limit['maxa'])

        #print(len(idx_q[i]), len(q_indices))
        #print(len(idx_a[i]), len(a_indices))
        idx_q[i] = np.array(q_indices)
        idx_a[i] = np.array(a_indices)

    return idx_q, idx_a


'''
 replace words with indices in a sequence
  replace with unknown if word not in lookup
    return [list of indices]
'''
def pad_seq(seq, lookup, maxlen):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            indices.append(lookup[UNK])
    return indices + [0]*(maxlen - len(seq))





def process_data():

    id2line = get_id2line()
    print('>> gathered id2line dictionary.\n')
    convs = get_conversations()
    print(convs[121:125])
    print('>> gathered conversations.\n')
    questions, answers = gather_dataset(convs,id2line)

    # change to lower case (just for en)
    questions = [ line.lower() for line in questions ]
    answers = [ line.lower() for line in answers ]

    # filter out unnecessary characters
    print('\n>> Filter lines')
    questions = [ filter_line(line, EN_WHITELIST) for line in questions ]
    answers = [ filter_line(line, EN_WHITELIST) for line in answers ]

    # filter out too long or too short sequences
    print('\n>> 2nd layer of filtering')
    qlines, alines = filter_data(questions, answers)

    for q,a in zip(qlines[141:145], alines[141:145]):
        print('q : [{0}]; a : [{1}]'.format(q,a))

    # convert list of [lines of text] into list of [list of words ]
    print('\n>> Segment lines into words')
    qtokenized = [ [w.strip() for w in wordlist.split(' ') if w] for wordlist in qlines ]
    atokenized = [ [w.strip() for w in wordlist.split(' ') if w] for wordlist in alines ]
    print('\n:: Sample from segmented list of words')

    for q,a in zip(qtokenized[141:145], atokenized[141:145]):
        print('q : [{0}]; a : [{1}]'.format(q,a))

    # indexing -> idx2w, w2idx
    print('\n >> Index words')
    idx2w, w2idx, freq_dist = index_( qtokenized + atokenized, vocab_size=VOCAB_SIZE)

    # filter out sentences with too many unknowns
    print('\n >> Filter Unknowns')
    qtokenized, atokenized = filter_unk(qtokenized, atokenized, w2idx)
    print('\n Final dataset len : ' + str(len(qtokenized)))


    print('\n >> Zero Padding')
    idx_q, idx_a = zero_pad(qtokenized, atokenized, w2idx)

    print('\n >> Save numpy arrays to disk')
    # save them
    np.save('idx_q.npy', idx_q)
    np.save('idx_a.npy', idx_a)

    # let us now save the necessary dictionaries
    metadata = {
            'w2idx' : w2idx,
            'idx2w' : idx2w,
            'limit' : limit,
            'freq_dist' : freq_dist
                }

    # write to disk : data control dictionaries
    with open('metadata.pkl', 'wb') as f:
        pickle.dump(metadata, f)

    # count of unknowns
    unk_count = (idx_q == 1).sum() + (idx_a == 1).sum()
    # count of words
    word_count = (idx_q > 1).sum() + (idx_a > 1).sum()

    print('% unknown : {0}'.format(100 * (unk_count/word_count)))
    print('Dataset count : ' + str(idx_q.shape[0]))


    #print '>> gathered questions and answers.\n'
    #prepare_seq2seq_files(questions,answers)


import numpy as np
from random import sample

'''
 split data into train (70%), test (15%) and valid(15%)
    return tuple( (trainX, trainY), (testX,testY), (validX,validY) )
'''
def split_dataset(x, y, ratio = [0.7, 0.15, 0.15] ):
    # number of examples
    data_len = len(x)
    lens = [ int(data_len*item) for item in ratio ]

    trainX, trainY = x[:lens[0]], y[:lens[0]]
    testX, testY = x[lens[0]:lens[0]+lens[1]], y[lens[0]:lens[0]+lens[1]]
    validX, validY = x[-lens[-1]:], y[-lens[-1]:]

    return (trainX,trainY), (testX,testY), (validX,validY)


'''
 generate batches from dataset
    yield (x_gen, y_gen)
    TODO : fix needed
'''
def batch_gen(x, y, batch_size):
    # infinite while
    while True:
        for i in range(0, len(x), batch_size):
            if (i+1)*batch_size < len(x):
                yield x[i : (i+1)*batch_size ].T, y[i : (i+1)*batch_size ].T

'''
 generate batches, by random sampling a bunch of items
    yield (x_gen, y_gen)
'''
def rand_batch_gen(x, y, batch_size):
    while True:
        sample_idx = sample(list(np.arange(len(x))), batch_size)
        yield x[sample_idx].T, y[sample_idx].T

        
def decode(sequence, lookup, separator=''): # 0 used for padding, is ignored
    return separator.join([ lookup[element] for element in sequence if element ])




def load_data(PATH=''):
    # read data control dictionaries
    with open(PATH + 'metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    # read numpy arrays
    idx_q = np.load(PATH + 'idx_q.npy')
    idx_a = np.load(PATH + 'idx_a.npy')
    return metadata, idx_q, idx_a

In [6]:
process_data()

>> gathered id2line dictionary.

[['L447', 'L448'], ['L490', 'L491'], ['L716', 'L717', 'L718', 'L719', 'L720', 'L721'], ['L750', 'L751', 'L752', 'L753', 'L754', 'L755']]
>> gathered conversations.


>> Filter lines

>> 2nd layer of filtering
28% filtered from original data
q : [you hate me dont you]; a : [i dont really think you warrant that strong an emotion]
q : [then say youll spend dollar night at the track with me]; a : [and why would i do that]
q : [come on  the ponies the flat beer you with money in your eyes me with my hand on your ass]; a : [you  covered in my vomit]
q : [are you following me]; a : [i was in the laundromat i saw your car thought id say hi]

>> Segment lines into words

:: Sample from segmented list of words
q : [['you', 'hate', 'me', 'dont', 'you']]; a : [['i', 'dont', 'really', 'think', 'you', 'warrant', 'that', 'strong', 'an', 'emotion']]
q : [['then', 'say', 'youll', 'spend', 'dollar', 'night', 'at', 'the', 'track', 'with', 'me']]; a : [['and', 'why', 'woul

# Seq2seq Model

In [14]:
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *

import tensorflow as tf
import numpy as np
import time

###============= prepare data
# from data.twitter import data
# metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')                   # Twitter
# from data.cornell_corpus import data
metadata, idx_q, idx_a = load_data(PATH='./')          # Cornell Moive
(trainX, trainY), (testX, testY), (validX, validY) = split_dataset(idx_q, idx_a)

trainX = trainX.tolist()
trainY = trainY.tolist()
testX = testX.tolist()
testY = testY.tolist()
validX = validX.tolist()
validY = validY.tolist()

trainX = tl.prepro.remove_pad_sequences(trainX)
trainY = tl.prepro.remove_pad_sequences(trainY)
testX = tl.prepro.remove_pad_sequences(testX)
testY = tl.prepro.remove_pad_sequences(testY)
validX = tl.prepro.remove_pad_sequences(validX)
validY = tl.prepro.remove_pad_sequences(validY)

###============= parameters
xseq_len = len(trainX)#.shape[-1]
yseq_len = len(trainY)#.shape[-1]
assert xseq_len == yseq_len
batch_size = 32
n_step = int(xseq_len/batch_size)
xvocab_size = len(metadata['idx2w']) # 8002 (0~8001)
emb_dim = 1024

w2idx = metadata['w2idx']   # dict  word 2 index
idx2w = metadata['idx2w']   # list index 2 word

unk_id = w2idx['unk']   # 1
pad_id = w2idx['_']     # 0

start_id = xvocab_size  # 8002
end_id = xvocab_size+1  # 8003

w2idx.update({'start_id': start_id})
w2idx.update({'end_id': end_id})
idx2w = idx2w + ['start_id', 'end_id']

xvocab_size = yvocab_size = xvocab_size + 2

""" A data for Seq2Seq should look like this:
input_seqs : ['how', 'are', 'you', '<PAD_ID'>]
decode_seqs : ['<START_ID>', 'I', 'am', 'fine', '<PAD_ID'>]
target_seqs : ['I', 'am', 'fine', '<END_ID>', '<PAD_ID'>]
target_mask : [1, 1, 1, 1, 0]
"""

print("encode_seqs", [idx2w[id] for id in trainX[10]])
target_seqs = tl.prepro.sequences_add_end_id([trainY[10]], end_id=end_id)[0]
    # target_seqs = tl.prepro.remove_pad_sequences([target_seqs], pad_id=pad_id)[0]
print("target_seqs", [idx2w[id] for id in target_seqs])
decode_seqs = tl.prepro.sequences_add_start_id([trainY[10]], start_id=start_id, remove_last=False)[0]
    # decode_seqs = tl.prepro.remove_pad_sequences([decode_seqs], pad_id=pad_id)[0]
print("decode_seqs", [idx2w[id] for id in decode_seqs])
target_mask = tl.prepro.sequences_get_mask([target_seqs])[0]
print("target_mask", target_mask)
print(len(target_seqs), len(decode_seqs), len(target_mask))

encode_seqs ['you', 'know', 'unk']
target_seqs ['i', 'believe', 'we', 'share', 'an', 'art', 'unk', 'end_id']
decode_seqs ['start_id', 'i', 'believe', 'we', 'share', 'an', 'art', 'unk']
target_mask [1 1 1 1 1 1 1 1]
8 8 8


In [15]:
###============= model
def model(encode_seqs, decode_seqs, is_train=True, reuse=False):
    with tf.variable_scope("model", reuse=reuse):
        # for chatbot, you can use the same embedding layer,
        # for translation, you may want to use 2 seperated embedding layers
        with tf.variable_scope("embedding") as vs:
            net_encode = EmbeddingInputlayer(
                inputs = encode_seqs,
                vocabulary_size = xvocab_size,
                embedding_size = emb_dim,
                name = 'seq_embedding')
            vs.reuse_variables()
            tl.layers.set_name_reuse(True) # remove if TL version == 1.8.0+
            net_decode = EmbeddingInputlayer(
                inputs = decode_seqs,
                vocabulary_size = xvocab_size,
                embedding_size = emb_dim,
                name = 'seq_embedding')
        net_rnn = Seq2Seq(net_encode, net_decode,
                cell_fn = tf.contrib.rnn.BasicLSTMCell,
                n_hidden = emb_dim,
                initializer = tf.random_uniform_initializer(-0.1, 0.1),
                encode_sequence_length = retrieve_seq_length_op2(encode_seqs),
                decode_sequence_length = retrieve_seq_length_op2(decode_seqs),
                initial_state_encode = None,
                dropout = (0.5 if is_train else None),
                n_layer = 3,
                return_seq_2d = True,
                name = 'seq2seq')
        net_out = DenseLayer(net_rnn, n_units=xvocab_size, act=tf.identity, name='output')
    return net_out, net_rnn

# model for training
with tf.device('/device:GPU:0'):
    encode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="encode_seqs")
    decode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="decode_seqs")
    target_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_seqs")
    target_mask = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_mask") # tl.prepro.sequences_get_mask()
net_out, _ = model(encode_seqs, decode_seqs, is_train=True, reuse=False)

# model for inferencing
with tf.device('/device:GPU:0'):
    encode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="encode_seqs")
    decode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="decode_seqs")
net, net_rnn = model(encode_seqs2, decode_seqs2, is_train=False, reuse=True)
y = tf.nn.softmax(net.outputs)

loss = tl.cost.cross_entropy_seq_with_mask(logits=net_out.outputs, target_seqs=target_seqs, input_mask=target_mask, return_details=False, name='cost')

net_out.print_params(False)

lr = 0.0001
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
tl.layers.initialize_global_variables(sess)
tl.files.load_and_assign_npz(sess=sess, name='n.npz', network=net)

###============= train
n_epoch = 50
for epoch in range(n_epoch):
    epoch_time = time.time()
    ## shuffle training data
    from sklearn.utils import shuffle
    trainX, trainY = shuffle(trainX, trainY, random_state=0)
    ## train an epoch
    total_err, n_iter = 0, 0
    for X, Y in tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False):
        step_time = time.time()

        X = tl.prepro.pad_sequences(X)
        _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
        _target_seqs = tl.prepro.pad_sequences(_target_seqs)

        _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
        _decode_seqs = tl.prepro.pad_sequences(_decode_seqs)
        _target_mask = tl.prepro.sequences_get_mask(_target_seqs)

        _, err = sess.run([train_op, loss],
                        {encode_seqs: X,
                        decode_seqs: _decode_seqs,
                        target_seqs: _target_seqs,
                        target_mask: _target_mask})

        if n_iter % 200 == 0:
            print("Epoch[%d/%d] step:[%d/%d] loss:%f took:%.5fs" % (epoch, n_epoch, n_iter, n_step, err, time.time() - step_time))

        total_err += err; n_iter += 1

        ###============= inference
        if n_iter % 1000 == 0:
            seeds = ["happy birthday have a nice day",
                    "how was it going"]
            for seed in seeds:
                print("Query >", seed)
                seed_id = [w2idx[w] for w in seed.split(" ")]
                for _ in range(5):  # 1 Query --> 5 Reply
                    # 1. encode, get state
                    state = sess.run(net_rnn.final_state_encode,
                                    {encode_seqs2: [seed_id]})
                    # 2. decode, feed start_id, get first word
                    #   ref https://github.com/zsdonghao/tensorlayer/blob/master/example/tutorial_ptb_lstm_state_is_tuple.py
                    o, state = sess.run([y, net_rnn.final_state_decode],
                                    {net_rnn.initial_state_decode: state,
                                    decode_seqs2: [[start_id]]})
                    w_id = tl.nlp.sample_top(o[0], top_k=3)
                    w = idx2w[w_id]
                    # 3. decode, feed state iteratively
                    sentence = [w]
                    for _ in range(30): # max sentence length
                        o, state = sess.run([y, net_rnn.final_state_decode],
                                        {net_rnn.initial_state_decode: state,
                                        decode_seqs2: [[w_id]]})
                        w_id = tl.nlp.sample_top(o[0], top_k=2)
                        w = idx2w[w_id]
                        if w_id == end_id:
                            break
                        sentence = sentence + [w]
                    print(" >", ' '.join(sentence))

    print("Epoch[%d/%d] averaged loss:%f took:%.5fs" % (epoch, n_epoch, total_err/n_iter, time.time()-epoch_time))

    tl.files.save_npz(net.all_params, name='n.npz', sess=sess)

encode_seqs ['you', 'know', 'unk']
target_seqs ['i', 'believe', 'we', 'share', 'an', 'art', 'unk', 'end_id']
decode_seqs ['start_id', 'i', 'believe', 'we', 'share', 'an', 'art', 'unk']
target_mask [1 1 1 1 1 1 1 1]
8 8 8
[TL] EmbeddingInputlayer model/embedding/seq_embedding: (8004, 1024)
Instructions for updating: TensorLayer relies on TensorFlow to check name reusing

[TL] EmbeddingInputlayer model/embedding/seq_embedding: (8004, 1024)
[TL] [*] Seq2Seq model/seq2seq: n_hidden: 1024 cell_fn: BasicLSTMCell dropout: 0.5 n_layer: 3
[TL] DynamicRNNLayer model/seq2seq/encode: n_hidden: 1024, in_dim: 3 in_shape: (32, ?, 1024) cell_fn: BasicLSTMCell dropout: 0.5 n_layer: 3
[TL]        batch_size (concurrent processes): 32
[TL] DynamicRNNLayer model/seq2seq/decode: n_hidden: 1024, in_dim: 3 in_shape: (32, ?, 1024) cell_fn: BasicLSTMCell dropout: 0.5 n_layer: 3
[TL]        batch_size (concurrent processes): 32
[TL] DenseLayer  model/output: 8004 No Activation
[TL] EmbeddingInputlayer model/emb

Query > happy birthday have a nice day
 > you dont know that
 > i dont know
 > i know i was unk
 > you know i dont know i was a unk
 > i dont know
Query > how was it going
 > i know
 > i dont want to go
 > i dont know
 > i dont know
 > i dont know
Epoch[3/50] step:[2000/2110] loss:5.047546 took:0.44560s
Epoch[3/50] averaged loss:5.171223 took:946.28060s
[TL] [*] n.npz saved
Epoch[4/50] step:[0/2110] loss:4.750675 took:0.59983s
Epoch[4/50] step:[200/2110] loss:4.885523 took:0.47944s
Epoch[4/50] step:[400/2110] loss:5.154873 took:0.45554s
Epoch[4/50] step:[600/2110] loss:5.111124 took:0.47352s
Epoch[4/50] step:[800/2110] loss:4.935506 took:0.40880s
Query > happy birthday have a nice day
 > i know i know
 > no i dont know
 > yeah you know
 > i dont know
 > yeah i dont know
Query > how was it going
 > i dont know
 > i dont know
 > i dont know i dont know
 > i know
 > i dont know
Epoch[4/50] step:[1000/2110] loss:5.263610 took:0.41745s
Epoch[4/50] step:[1200/2110] loss:5.094879 took:0.48180

 > you know what i was
 > it was a good time
 > i dont know
 > you know what i mean
 > it was a unk unk and a unk
Epoch[10/50] step:[1000/2110] loss:4.881660 took:0.47982s
Epoch[10/50] step:[1200/2110] loss:4.535581 took:0.46348s
Epoch[10/50] step:[1400/2110] loss:4.938489 took:0.43730s
Epoch[10/50] step:[1600/2110] loss:4.856382 took:0.45817s
Epoch[10/50] step:[1800/2110] loss:4.792933 took:0.46189s
Query > happy birthday have a nice day
 > no no no no
 > no i just want to see you
 > i dont know
 > i dont know
 > i dont know what youre talking about
Query > how was it going
 > you know
 > i dont know i dont know
 > i dont know
 > you know i dont know
 > i know
Epoch[10/50] step:[2000/2110] loss:4.532131 took:0.41012s
Epoch[10/50] averaged loss:4.660116 took:946.55879s
[TL] [*] n.npz saved
Epoch[11/50] step:[0/2110] loss:4.822689 took:0.60816s
Epoch[11/50] step:[200/2110] loss:4.423930 took:0.41317s
Epoch[11/50] step:[400/2110] loss:4.539880 took:0.41891s
Epoch[11/50] step:[600/2110] l

Epoch[17/50] step:[200/2110] loss:4.529686 took:0.45110s
Epoch[17/50] step:[400/2110] loss:4.546711 took:0.38889s
Epoch[17/50] step:[600/2110] loss:4.140928 took:0.41571s
Epoch[17/50] step:[800/2110] loss:4.049947 took:0.43345s
Query > happy birthday have a nice day
 > its a unk
 > i know i dont know what youre talking about
 > i know i know
 > i know
 > i know
Query > how was it going
 > i dont know
 > i dont know
 > i dont know
 > it was a unk
 > i dont know i just dont know
Epoch[17/50] step:[1000/2110] loss:4.311115 took:0.44707s
Epoch[17/50] step:[1200/2110] loss:4.441726 took:0.43689s
Epoch[17/50] step:[1400/2110] loss:4.357040 took:0.44105s
Epoch[17/50] step:[1600/2110] loss:4.606152 took:0.41505s
Epoch[17/50] step:[1800/2110] loss:4.484820 took:0.43165s
Query > happy birthday have a nice day
 > i dont know
 > i dont know
 > thank you
 > i dont know
 > i dont know
Query > how was it going
 > it was a long time ago
 > oh i dont know
 > i dont know
 > it was a long time ago
 > oh 

 > i dont know
 > i dont know
Epoch[23/50] step:[2000/2110] loss:3.839619 took:0.42067s
Epoch[23/50] averaged loss:4.073029 took:951.77955s
[TL] [*] n.npz saved
Epoch[24/50] step:[0/2110] loss:3.808164 took:0.53888s
Epoch[24/50] step:[200/2110] loss:3.790416 took:0.44076s
Epoch[24/50] step:[400/2110] loss:4.030349 took:0.44266s
Epoch[24/50] step:[600/2110] loss:3.782204 took:0.46725s
Epoch[24/50] step:[800/2110] loss:4.198504 took:0.45040s
Query > happy birthday have a nice day
 > i dont think so
 > its a unk
 > thank you sir
 > thank you sir
 > its a good time
Query > how was it going
 > not good
 > i dont know
 > i dont know
 > i dont know
 > i dont know
Epoch[24/50] step:[1000/2110] loss:4.618804 took:0.45881s
Epoch[24/50] step:[1200/2110] loss:3.994356 took:0.44412s
Epoch[24/50] step:[1400/2110] loss:4.216904 took:0.42090s
Epoch[24/50] step:[1600/2110] loss:4.257865 took:0.46488s
Epoch[24/50] step:[1800/2110] loss:4.270931 took:0.44217s
Query > happy birthday have a nice day
 > its

Epoch[30/50] step:[1000/2110] loss:3.617737 took:0.41309s
Epoch[30/50] step:[1200/2110] loss:3.753987 took:0.47275s
Epoch[30/50] step:[1400/2110] loss:3.816749 took:0.44454s
Epoch[30/50] step:[1600/2110] loss:4.106244 took:0.45165s
Epoch[30/50] step:[1800/2110] loss:3.934877 took:0.44001s
Query > happy birthday have a nice day
 > thank you
 > thanks you mean
 > i know
 > thank you very much
 > thank you
Query > how was it going
 > it was unk i was unk
 > it was a long time
 > i dont know
 > it wasnt a long time ago
 > it was a long time ago
Epoch[30/50] step:[2000/2110] loss:3.959654 took:0.45703s
Epoch[30/50] averaged loss:3.749950 took:944.50406s
[TL] [*] n.npz saved
Epoch[31/50] step:[0/2110] loss:3.678099 took:0.53443s
Epoch[31/50] step:[200/2110] loss:3.694261 took:0.43996s
Epoch[31/50] step:[400/2110] loss:3.672427 took:0.46825s
Epoch[31/50] step:[600/2110] loss:3.616123 took:0.42846s
Epoch[31/50] step:[800/2110] loss:3.422000 took:0.45871s
Query > happy birthday have a nice day


 > just like a few minutes ago
 > it wasnt a long time
 > i dont know
Epoch[36/50] step:[2000/2110] loss:3.336208 took:0.42671s
Epoch[36/50] averaged loss:3.474585 took:944.01954s
[TL] [*] n.npz saved
Epoch[37/50] step:[0/2110] loss:3.512980 took:0.58408s
Epoch[37/50] step:[200/2110] loss:3.484130 took:0.47610s
Epoch[37/50] step:[400/2110] loss:3.520934 took:0.39605s
Epoch[37/50] step:[600/2110] loss:3.365673 took:0.46666s
Epoch[37/50] step:[800/2110] loss:3.429557 took:0.43754s
Query > happy birthday have a nice day
 > its not a long
 > its not really
 > thank you sir
 > thank you sir
 > its a good idea
Query > how was it going
 > not bad
 > it was the best time to see the unk
 > it was a long time ago
 > i dont know
 > it was a long time ago
Epoch[37/50] step:[1000/2110] loss:3.224291 took:0.45601s
Epoch[37/50] step:[1200/2110] loss:3.331098 took:0.42724s
Epoch[37/50] step:[1400/2110] loss:3.509528 took:0.47373s
Epoch[37/50] step:[1600/2110] loss:3.094489 took:0.47730s
Epoch[37/50] s

 > i was hoping you were in the bathroom
 > just a little unk and a half
 > just a little unk
 > it was a long time ago
Epoch[43/50] step:[1000/2110] loss:2.785696 took:0.38547s
Epoch[43/50] step:[1200/2110] loss:3.376303 took:0.45661s
Epoch[43/50] step:[1400/2110] loss:3.296626 took:0.46909s
Epoch[43/50] step:[1600/2110] loss:3.329867 took:0.45078s
Epoch[43/50] step:[1800/2110] loss:2.984496 took:0.34620s
Query > happy birthday have a nice day
 > thank you very much
 > thank you
 > its not really
 > its a good night
 > thank you very much
Query > how was it going
 > i dont know
 > it was a good time
 > it was a long time ago
 > i dont know
 > i dont know
Epoch[43/50] step:[2000/2110] loss:3.426354 took:0.46537s
Epoch[43/50] averaged loss:3.172437 took:945.47296s
[TL] [*] n.npz saved
Epoch[44/50] step:[0/2110] loss:3.156771 took:0.58777s
Epoch[44/50] step:[200/2110] loss:2.989133 took:0.43573s
Epoch[44/50] step:[400/2110] loss:3.441839 took:0.46443s
Epoch[44/50] step:[600/2110] loss:2.

 > i dont know
 > it was a long time ago
Epoch[49/50] step:[2000/2110] loss:2.651119 took:0.47361s
Epoch[49/50] averaged loss:2.932557 took:944.52380s
[TL] [*] n.npz saved
