# Build a Chatbot

### Use bidirectional LSTM and attention mechanism 
### Dataset: [Movie Dialogue Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html)


In [46]:
import re
import time
import numpy as np
import pandas as pd
import tensorflow as tf

In [47]:
# Load data
movie_lines = open('dataset/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
movie_conversations = open('dataset/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [48]:
movie_lines[:2]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!']

In [49]:
movie_conversations[:2]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']"]

In [50]:
import os
def generate_quesAns(file_dir): 
    '''create a list of questions (inputs) and answers (targets) from data'''
    
    movie_dir = os.path.join(file_dir, 'movie_lines.txt')
    convs_dir = os.path.join(file_dir, 'movie_conversations.txt' )
    movie_lines = open(movie_dir, encoding='utf-8', errors='ignore').read().split('\n')
    movie_conversations = open(convs_dir, encoding='utf-8', errors='ignore').read().split('\n')
    
    id_line = {}
    convs_ids = [ ]
    questions = []
    answers = []
    
    # a dictionary mapping line_ids and its corresponding text
    for line in movie_lines:
        txt = line.split(' +++$+++ ')
        if len(txt) == 5:
            id_line[txt[0]] = txt[4]
    
    # check the id_line dict
    dict_pairs = id_line.items()
    pairs_iterator = iter(dict_pairs)
    first_pair = next(pairs_iterator)
    print(f'first key_value of id_line dictionary: {first_pair}')
    
    
    
    # a list containing all the conversation line_ids
    for line in movie_conversations[:-1]:
        ids = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
        convs_ids.append(ids.split(','))
        
    # check the convs_ids
    print(f'the first two line of convs_ids: {convs_ids[:1]}')
    
    
    
    # create questions and answers given the list of convs_ids and the sentence corresponding to each id
    for conv_id in convs_ids:
        for i in range(len(conv_id)-1):
            questions.append(id_line[conv_id[i]])
            answers.append(id_line[conv_id[i+1]])
            
    return questions, answers
    

In [51]:
base_dir = './dataset'
questions, answers = generate_quesAns(base_dir)

first key_value of id_line dictionary: ('L1045', 'They do not!')
the first two line of convs_ids: [['L194', 'L195', 'L196', 'L197']]


In [52]:
print(f'len(questions): {len(questions)} & len(answers): {len(answers)}\n')

for i in range(2):
    print(f'question{i}: {questions[i]}')
    print(f'answer{i}: {answers[i]}\n')

len(questions): 221616 & len(answers): 221616

question0: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
answer0: Well, I thought we'd start with pronunciation, if that's okay with you.

question1: Well, I thought we'd start with pronunciation, if that's okay with you.
answer1: Not the hacking and gagging and spitting part.  Please.



In [9]:
from pycontractions import Contractions
import gensim.downloader as api
# Choose model accordingly for contractions function
model = api.load("glove-twitter-25")
# model = api.load("glove-twitter-100")
# model = api.load("word2vec-google-news-300")
cont = Contractions(kv_model=model)
cont.load_models()
def clean_data(text):
    """expand shortened words, e.g. don't to do not"""
    text = list(cont.expand_texts([text], precise=True))[0]
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text

In [54]:
questions = [clean_data(ques) for ques in questions]

In [55]:
answers = [clean_data(ans) for ans in answers]

In [56]:
for i in range(2):
    print(f'question{i}: {questions[i]}')
    print(f'answer{i}: {answers[i]}\n')

question0: can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
answer0: well i thought we would start with pronunciation if that is okay with you

question1: well i thought we would start with pronunciation if that is okay with you
answer1: not the hacking and gagging and spitting part  please



In [57]:
#only keep questions and answers within min and max # of words, and remove those shorter than min_words and 
#longer than max_words'''
temp_questions = []
temp_answers = []
short_questions = []
short_answers = []

min_words = 2
max_words = 20

i = 0
j = 0
#remove short/long questions
for question in questions:  
    length = len(question.split())
    if  length>= min_words and length <= max_words:
        temp_questions.append(question)
        temp_answers.append(answers[i])
    i += 1

# remove short/long answers
for answer in temp_answers:
    length = len(answer.split())
    if  length>= min_words and length <= max_words:
        short_answers.append(answer)
        short_questions.append(temp_questions[j])
    j += 1


In [58]:
print(f'{len(short_questions)} out of {len(questions)} questions used')
print(f'{len(short_answers)} out of {len(answers)} questions used')
print(f'{100*round(len(short_questions)/len(questions), 3)} % of data used')

138335 out of 221616 questions used
138335 out of 221616 questions used
62.4 % of data used


In [59]:
def word_count(threshold, questions, answers):
    '''create a vocabulary dictionary representing frequency of each word in corpus'''
    '''then, remove words with counts less than threshold from vocabulary '''
    '''then, map each word in vocabulary to an integer'''
    
    tokens = ['<PAD>','<UNK>','<GO>', '<EOS>']
    vocabulary = {}
    vocab2int = {}
        
    for question in questions:
        for word in question.split():
            vocabulary[word] = vocabulary.get(word, 0) + 1
                
    for answer in answers:
        for word in answer.split():
            vocabulary[word] = vocabulary.get(word, 0) + 1
        answer += ' <EOS>'  #Add EOS token to the end of answer
    
    
    num = 0
    for key, value in vocabulary.items():
        if value >= threshold:
            vocab2int[key] = num
            num += 1
    
    for tok in tokens:
        vocab2int[tok] = len(vocab2int)+1
        
    
    return vocab2int

In [60]:
vocab2int = word_count(10, short_questions, short_answers)
print(f'length of vocab2int dictionary:{len(vocab2int)}')

length of vocab2int dictionary:8096


In [61]:
int2vocab = {value:key for key,value in vocab2int.items()}
print(f'length of int2vocab dictionary:{len(int2vocab)}')

length of int2vocab dictionary:8096


In [62]:
#change words to ints, if word not in vocabulary, consider it as '<UNK>' word
questions2ints, answers2ints = [], []
for question in short_questions:
    ints = [vocab2int[word] if word in vocab2int else vocab2int['<UNK>'] for word in question.split()]    
    questions2ints.append(ints)
    

for answer in short_answers:
    ints = [vocab2int[word] if word in vocab2int else vocab2int['<UNK>'] for word in answer.split()]  
    answers2ints.append(ints)
    
#to reduce the amount of padding during training, & as a result speed up training and reduce the loss, I will
# sort the questions and answers by the length of questions
sorted_questions, sorted_answers = [], []
for i in range(max_words):
    length = i+1
    for idx,val in enumerate(questions2ints):
        if len(val) == length:
            sorted_questions.append(questions2ints[idx])
            sorted_answers.append(answers2ints[idx])

In [63]:
print(f'the length of questions2ints: {len(questions2ints)}, the length of sorted_questions: {len(sorted_questions)}')
print(f'the length of questions2ints: {len(answers2ints)}, the length of sorted_answers: {len(sorted_answers)}')

the length of questions2ints: 138335, the length of sorted_questions: 138335
the length of questions2ints: 138335, the length of sorted_answers: 138335


## Training

In [64]:
split_point = int(len(questions2ints)*0.15)   #train_validation split point

train_questions = sorted_questions[split_point:]
train_answers = sorted_answers[split_point:]

valid_questions = sorted_questions[:split_point]
valid_answers = sorted_answers[:split_point]

print(f'the length of training data set: {len(train_questions)}')
print(f'the length of validation data set: {len(valid_questions)}')

the length of training data set: 117585
the length of validation data set: 20750


In [65]:
def padding(sentences, vocab2int):
    "makes the length of all sentence in the batch, the same"
    max_length = max([len(sentence) for sentence in sentences])
    return [sentence + [vocab2int['<PAD>']] * (max_length - len(sentence)) for sentence in sentences]

In [66]:
def sample(questions, answers, batch_size):
    '''sample a batch of questions and answers from training data'''
    for i in range(len(questions)//batch_size):
        q_batch = questions[i*batch_size : (i+1)*batch_size]
        q_batch2 = padding(q_batch, vocab2int)
        
        a_batch = answers[i*batch_size : (i+1)*batch_size]      
        a_batch2 = padding(a_batch, vocab2int)
        
        yield np.array(q_batch2), np.array((a_batch2))

In [25]:
from model import LSTM_MODEL


lr_decay = 0.9
min_lr = 0.0001
early_stop = 0 
stop = 5 
training_loss = 0 
validation_loss = [] 


model = LSTM_MODEL(max_words, vocab2int)


epoch = 1
epochs = 100
while epoch <= epochs:
    for batch, (questions, answers) in enumerate(sample(train_questions, train_answers, model.batch_size)):
        start_t = time.time()
        loss = model.fit(questions, answers)
        training_loss += loss
        end_t = time.time()
        delta_t = end_t - start_t
        
        if batch % 100 == 0:  #every 100 batch print the training loss                            
            print('Epoch {}/{}} Batch {}/{} - Loss: {:6.3f}, Seconds: {:6.2f}'.format(epoch,epochs, batch, 
                          len(train_questions) // model.batch_size, training_loss / 100, 100*delta_t))
                
            training_loss = 0

        if batch % (((len(train_questions))//model.batch_size//2)-1) == 0:
            valid_loss = 0
            start_tt = time.time()
            for batch_i, (questions, answers) in enumerate(sample(valid_questions, valid_answers, model.batch_size)):
                loss = model.sess.run(model.cost, {model.input_data: questions,
                                                   model.targets: answers,
                                                   model.lr: model.learning_rate,
                                                   model.sequence_length: answers.shape[1],
                                                   model.keep_prob: 1})
                valid_loss += loss
            end_tt = time.time()
            delta_tt = end_tt - start_tt
            avg_loss = valid_loss / (len(valid_questions) / model.batch_size)
            validation_loss.append(avg_loss)
                  
            print('Validation Loss: {:6.3f}, Seconds: {:6.2f}'.format(avg_loss, delta_tt))
            
            model.learning_rate *= lr_decay
            model.learning_rate = min_lr if model.learning_rate < min_lr else model.learning_rate

            
            if avg_loss <= min(validation_loss):
                print('Better model found!') 
                early_stop = 0
                model.save()
            else:
                print("No Improvement.")
                early_stop += 1
                if early_stop == stop:
                    break
        
    if early_stop == stop:
        print("Early Stopping!")
        break
                  
    epoch += 1

Epoch   1/100 Batch    0/918 - Loss:  0.091, Seconds: 192.23
Epoch   1/100 Batch  100/918 - Loss:  3.276, Seconds: 207.23
Epoch   1/100 Batch  200/918 - Loss:  2.404, Seconds: 223.50
Epoch   1/100 Batch  300/918 - Loss:  2.323, Seconds: 221.14
Epoch   1/100 Batch  400/918 - Loss:  2.240, Seconds: 224.46
Validation Loss:  2.178, Seconds: 102.58
Better model found!
Epoch   1/100 Batch  500/918 - Loss:  2.203, Seconds: 231.30
Epoch   1/100 Batch  600/918 - Loss:  2.217, Seconds: 254.84
Epoch   1/100 Batch  700/918 - Loss:  2.197, Seconds: 298.08
Epoch   1/100 Batch  800/918 - Loss:  2.179, Seconds: 323.92
Epoch   1/100 Batch  900/918 - Loss:  2.137, Seconds: 354.85
Validation Loss:  2.118, Seconds: 114.63
Better model found!
Epoch   2/100 Batch    0/918 - Loss:  0.382, Seconds: 223.83
Epoch   2/100 Batch  100/918 - Loss:  2.070, Seconds: 224.46
Epoch   2/100 Batch  200/918 - Loss:  2.059, Seconds: 242.57
Epoch   2/100 Batch  300/918 - Loss:  2.066, Seconds: 253.19
Epoch   2/100 Batch  400