# Build a Chatbot

### Use bidirectional LSTM and attention mechanism 
### Dataset: [Movie Dialogue Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html)

In [44]:
import re
import time
import numpy as np
import pandas as pd
import tensorflow as tf


In [2]:
# Load the two raw corpus files as lists of text lines.
# Context managers close the file handles as soon as the contents are read,
# instead of leaving them open until garbage collection.
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open('dataset/movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
    movie_lines = lines_file.read().split('\n')
with open('dataset/movie_conversations.txt', encoding='utf-8', errors='ignore') as convs_file:
    movie_conversations = convs_file.read().split('\n')

In [3]:
# Peek at the first two raw records; fields are separated by ' +++$+++ '.
movie_lines[:2]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!']

In [4]:
# Peek at the first two conversation records; the last field is the list of
# line ids that make up the conversation.
movie_conversations[:2]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']"]

In [5]:
import os
def generate_quesAns(file_dir):
    '''Build aligned question (input) and answer (target) lists from the corpus.

    Reads ``movie_lines.txt`` and ``movie_conversations.txt`` from ``file_dir``.
    Every pair of consecutive lines inside a conversation yields one
    (question, answer) pair, so a conversation with k lines contributes
    k - 1 pairs.

    Args:
        file_dir: directory that contains the two corpus files.

    Returns:
        (questions, answers): two lists of equal length, where answers[i] is
        the line that directly follows questions[i] in its conversation.

    Raises:
        KeyError: if a conversation references a line id that was not loaded
            from movie_lines.txt (records without exactly 5 fields are skipped).
    '''
    separator = ' +++$+++ '
    movie_dir = os.path.join(file_dir, 'movie_lines.txt')
    convs_dir = os.path.join(file_dir, 'movie_conversations.txt')

    # Read both files inside context managers so the handles are always closed.
    with open(movie_dir, encoding='utf-8', errors='ignore') as lines_file:
        movie_lines = lines_file.read().split('\n')
    with open(convs_dir, encoding='utf-8', errors='ignore') as convs_file:
        movie_conversations = convs_file.read().split('\n')

    # Map each line id to its text; malformed records (not 5 fields) are skipped.
    id_line = {}
    for line in movie_lines:
        txt = line.split(separator)
        if len(txt) == 5:
            id_line[txt[0]] = txt[4]

    # Sanity check: show one entry (None when the dictionary is empty, rather
    # than raising StopIteration).
    first_pair = next(iter(id_line.items()), None)
    print(f'first key_value of id_line dictionary: {first_pair}')

    # Collect the list of line ids for every conversation. Blank lines are
    # skipped explicitly; the previous `[:-1]` slice assumed the file ends with
    # a newline and silently dropped the last conversation when it did not.
    convs_ids = []
    for line in movie_conversations:
        if not line.strip():
            continue
        # Last field looks like "['L194', 'L195']": strip the brackets, the
        # quotes and the spaces, then split on commas.
        ids = line.split(separator)[-1][1:-1].replace("'", "").replace(" ", "")
        convs_ids.append(ids.split(','))

    # Sanity check on the parsed conversation ids.
    print(f'the first two line of convs_ids: {convs_ids[:1]}')

    # Each line is the question for the line that follows it.
    questions = []
    answers = []
    for conv_id in convs_ids:
        for question_id, answer_id in zip(conv_id, conv_id[1:]):
            questions.append(id_line[question_id])
            answers.append(id_line[answer_id])

    return questions, answers
    

In [6]:
# Directory holding the corpus files, relative to the notebook's working directory.
base_dir = './dataset'
# Build the aligned question/answer lists (the call also prints two sanity-check lines).
questions, answers = generate_quesAns(base_dir)

first key_value of id_line dictionary: ('L1045', 'They do not!')
the first two line of convs_ids: [['L194', 'L195', 'L196', 'L197']]


In [7]:
# Both lists must have the same length: answers[i] is the reply to questions[i].
print(f'len(questions): {len(questions)} & len(answers): {len(answers)}\n')

# Show the first two raw (uncleaned) question/answer pairs.
for i in range(2):
    print(f'question{i}: {questions[i]}')
    print(f'answer{i}: {answers[i]}\n')

len(questions): 221616 & len(answers): 221616

question0: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
answer0: Well, I thought we'd start with pronunciation, if that's okay with you.

question1: Well, I thought we'd start with pronunciation, if that's okay with you.
answer1: Not the hacking and gagging and spitting part.  Please.



In [None]:
# ! sudo apt install openjdk-8-jdk
# ! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
# ! pip install language-check
# ! pip install pycontractions

In [None]:
from pycontractions import Contractions
import gensim.downloader as api
# Pick the pretrained embedding model handed to the contraction expander.
# The commented-out lines are alternative models that can be swapped in.
model = api.load("glove-twitter-25")
# model = api.load("glove-twitter-100")
# model = api.load("word2vec-google-news-300")
# `cont` is the module-level expander used by clean_data_1.
cont = Contractions(kv_model=model)
# NOTE(review): presumably prepares the resources expand_texts needs — confirm against the pycontractions docs.
cont.load_models()
def clean_data_1(text):
    """Normalise one sentence for vocabulary building.

    Steps:
      1. expand contractions (e.g. "don't" -> "do not") with the module-level
         ``cont`` expander;
      2. delete the punctuation characters listed in the regex below (the
         characters are removed, not replaced, so surrounding spaces remain);
      3. lower-case the result.

    Args:
        text: a single raw sentence.

    Returns:
        The cleaned, lower-cased sentence.
    """
    text = list(cont.expand_texts([text], precise=True))[0]
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    # The cleaned samples shown later in the notebook are all lower-case, but
    # the text was never lower-cased here; without this step "Can" and "can"
    # would be counted as different vocabulary entries.
    return text.lower()

In [None]:
# Clean every question, preserving order (rebinds `questions` to the cleaned list).
questions = list(map(clean_data_1, questions))

In [None]:
# Clean every answer, preserving order (rebinds `answers` to the cleaned list).
answers = list(map(clean_data_1, answers))

In [10]:
# Show the first two question/answer pairs again, now after cleaning.
for i in range(2):
    print(f'question{i}: {questions[i]}')
    print(f'answer{i}: {answers[i]}\n')

question0: can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
answer0: well i thought we would start with pronunciation if that is okay with you

question1: well i thought we would start with pronunciation if that is okay with you
answer1: not the hacking and gagging and spitting part  please



Let's check the data to make sure it has been formatted correctly.

In [13]:
# Keep only the question/answer pairs in which BOTH sentences contain between
# min_words and max_words words (inclusive); pairs with a shorter or longer
# sentence are dropped.
min_words = 2
max_words = 20

# Pass 1: drop pairs whose question is too short or too long.
# zip keeps each question aligned with its answer without a manual counter.
temp_questions = []
temp_answers = []
for question, answer in zip(questions, answers):
    if min_words <= len(question.split()) <= max_words:
        temp_questions.append(question)
        temp_answers.append(answer)

# Pass 2: of the remaining pairs, drop those whose answer is too short or too long.
short_questions = []
short_answers = []
for question, answer in zip(temp_questions, temp_answers):
    if min_words <= len(answer.split()) <= max_words:
        short_questions.append(question)
        short_answers.append(answer)


In [19]:
# Report how much of the corpus survives the length filter.
print(f'{len(short_questions)} out of {len(questions)} questions used')
# This line reports the answers, so it must say "answers", not "questions".
print(f'{len(short_answers)} out of {len(answers)} answers used')
# Scale to a percentage BEFORE rounding: multiplying an already-rounded
# fraction by 100 can print float artefacts such as 13.799999999999999.
print(f'{round(100 * len(short_questions) / len(questions), 1)} % of data used')

138335 out of 221616 questions used
138335 out of 221616 questions used
62.4 % of data used


In [40]:
def word_count(threshold, questions, answers):
    '''Build a word -> integer-id vocabulary from the question/answer corpus.

    Steps:
      1. count how often each whitespace-separated word occurs across
         ``questions`` and ``answers``;
      2. keep only the words whose count is at least ``threshold``;
      3. give the kept words the ids 0..n-1 (in order of first appearance)
         and the special tokens <PAD>, <UNK>, <GO>, <EOS> the ids n..n+3.

    The ids are contiguous, so every id is strictly smaller than
    ``len(vocab2int)``.

    The input lists are not modified. In particular, no <EOS> token is
    appended to the answers here; the caller must do that when encoding.

    Args:
        threshold: minimum number of occurrences for a word to be kept.
        questions: iterable of question sentences (strings).
        answers: iterable of answer sentences (strings).

    Returns:
        dict mapping each kept word and each special token to a unique int id.
    '''
    tokens = ['<PAD>','<UNK>','<GO>', '<EOS>']

    # Word frequencies over the whole corpus (questions first, then answers).
    vocabulary = {}
    for corpus in (questions, answers):
        for sentence in corpus:
            for word in sentence.split():
                vocabulary[word] = vocabulary.get(word, 0) + 1

    # Frequent words get consecutive ids. A word that collides with a special
    # token is left to the token loop below so no two entries share an id.
    vocab2int = {}
    for word, count in vocabulary.items():
        if count >= threshold and word not in tokens:
            vocab2int[word] = len(vocab2int)

    # Special tokens continue the numbering. Using len(vocab2int) (not +1)
    # avoids skipping an id and keeps the largest id below the vocabulary size.
    for tok in tokens:
        vocab2int[tok] = len(vocab2int)

    return vocab2int