# Chatbot!


- 🎥 [Video](https://www.udemy.com/course/nlp-natural-language-processing-with-python/learn/lecture/13258226#overview)
- 📄 [End-to-End Memory Networks (paper)](https://arxiv.org/pdf/1503.08895.pdf)

In [1]:
import pickle
import numpy as np

In [2]:
# Load the pickled training samples.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open pickle files that come from a trusted source.
with open('../../data/train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [3]:
# Load the pickled test samples.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open pickle files that come from a trusted source.
with open('../../data/test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [4]:
# Container type of each split, one per line.
print(type(train_data), type(test_data), sep='\n')

<class 'list'>
<class 'list'>


In [5]:
# Number of samples in each split, one per line.
print(len(train_data), len(test_data), sep='\n')

10000
1000


In [6]:
# Each sample is a 3-tuple:
# - story:    list of word tokens (punctuation is its own token)
# - question: list of word tokens
# - answer:   a single string ('no' for this sample)
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [7]:
# Story tokens of the first sample joined back into one readable string.
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [8]:
# Question tokens of the first sample joined back into one readable string.
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [9]:
# The answer is stored as a plain string, not as a token list.
train_data[0][2]

'no'

In [10]:
# Combine both splits so the vocabulary and the maximum lengths computed below
# cover every sample, not just the training ones.
all_data = test_data + train_data

In [11]:
# Sanity check: should equal len(test_data) + len(train_data).
len(all_data)

11000

In [12]:
# Collect every distinct token that appears in any story or question.
# The two possible answers are added explicitly: calling set(answer) on an
# answer string would split it into single characters instead of adding the word.
vocab = {'no', 'yes'}
for story, question, answer in all_data:
    # In-place update avoids building a brand-new set on every iteration.
    vocab.update(story)
    vocab.update(question)

# One extra slot is reserved for index 0 (the word indices produced by the
# tokenizer further down start at 1).
vocab_len = len(vocab) + 1

In [13]:
# Number of distinct tokens plus the one extra slot added above.
vocab_len

38

In [14]:
# LONGEST STORY
all_story_lens = [len(data[0]) for data in all_data]
max_story_len = max(all_story_lens)

print(all_story_lens[:10])
print(max_story_len)

[12, 23, 35, 47, 59, 13, 26, 37, 50, 62]
156


In [15]:
# LONGEST QUESTION
all_question_lens = [len(data[1]) for data in all_data]
max_question_len = max(all_question_lens)

print(all_question_lens[:10])
print(max_question_len)

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
6


# Vectorize the data

In [16]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [17]:
# An empty filter list keeps the '.' and '?' tokens in the vocabulary
# (they appear in word_index below).
tokenizer = Tokenizer(filters=[])
# Every vocab entry is fed in as its own one-word text, so each word is counted once.
# NOTE(review): vocab is a set, so the index given to each word presumably follows
# set iteration order, which is not guaranteed to be the same across kernel
# sessions. Consider fit_on_texts(sorted(vocab)) if indices must be reproducible
# (e.g. when reloading saved model weights) - confirm before changing.
tokenizer.fit_on_texts(vocab)

In [18]:
# Word -> integer index mapping; keys are lowercased and indices start at 1.
print(tokenizer.word_index)

{'office': 1, 'grabbed': 2, 'bathroom': 3, 'is': 4, 'football': 5, 'down': 6, 'got': 7, 'john': 8, 'sandra': 9, 'in': 10, 'garden': 11, 'moved': 12, 'up': 13, 'hallway': 14, 'put': 15, 'discarded': 16, 'took': 17, 'kitchen': 18, 'picked': 19, 'yes': 20, 'went': 21, 'to': 22, 'apple': 23, 'left': 24, 'dropped': 25, 'there': 26, '?': 27, 'mary': 28, 'back': 29, 'travelled': 30, 'journeyed': 31, 'milk': 32, 'no': 33, '.': 34, 'daniel': 35, 'the': 36, 'bedroom': 37}


In [19]:
# Occurrence count per word; every count is 1 because the tokenizer was fit
# on the unique vocabulary entries rather than on the stories themselves.
print(tokenizer.word_counts)

OrderedDict([('office', 1), ('grabbed', 1), ('bathroom', 1), ('is', 1), ('football', 1), ('down', 1), ('got', 1), ('john', 1), ('sandra', 1), ('in', 1), ('garden', 1), ('moved', 1), ('up', 1), ('hallway', 1), ('put', 1), ('discarded', 1), ('took', 1), ('kitchen', 1), ('picked', 1), ('yes', 1), ('went', 1), ('to', 1), ('apple', 1), ('left', 1), ('dropped', 1), ('there', 1), ('?', 1), ('mary', 1), ('back', 1), ('travelled', 1), ('journeyed', 1), ('milk', 1), ('no', 1), ('.', 1), ('daniel', 1), ('the', 1), ('bedroom', 1)])


In [20]:
# Parallel lists holding the story, question and answer of each training sample.
train_story_text = []
train_question_text = []
train_answers = []

In [21]:
# Split the training samples into three parallel lists.
# Each list is rebuilt from train_data rather than appended to, so re-running
# this cell does not duplicate the samples.
train_story_text = [story for story, _question, _answer in train_data]
train_question_text = [question for _story, question, _answer in train_data]
train_answers = [answer for _story, _question, answer in train_data]

In [22]:
# Peek at the first three stories as raw tokens (original casing).
print(train_story_text[:3])

[['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Sandra', 'journeyed', 'to', 'the', 'bedroom', '.'], ['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Sandra', 'journeyed', 'to', 'the', 'bedroom', '.', 'Mary', 'went', 'back', 'to', 'the', 'bedroom', '.', 'Daniel', 'went', 'back', 'to', 'the', 'hallway', '.'], ['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Sandra', 'journeyed', 'to', 'the', 'bedroom', '.', 'Mary', 'went', 'back', 'to', 'the', 'bedroom', '.', 'Daniel', 'went', 'back', 'to', 'the', 'hallway', '.', 'Sandra', 'went', 'to', 'the', 'kitchen', '.', 'Daniel', 'went', 'back', 'to', 'the', 'bathroom', '.']]


In [23]:
# Replace every story token with its integer index from the fitted tokenizer.
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [24]:
# The same first three stories, now as integer index sequences.
print(train_story_seq[:3])

[[28, 12, 22, 36, 3, 34, 9, 31, 22, 36, 37, 34], [28, 12, 22, 36, 3, 34, 9, 31, 22, 36, 37, 34, 28, 21, 29, 22, 36, 37, 34, 35, 21, 29, 22, 36, 14, 34], [28, 12, 22, 36, 3, 34, 9, 31, 22, 36, 37, 34, 28, 21, 29, 22, 36, 37, 34, 35, 21, 29, 22, 36, 14, 34, 9, 21, 22, 36, 18, 34, 35, 21, 29, 22, 36, 3, 34]]


In [25]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    """Turn (story, question, answer) samples into padded index arrays.

    Parameters
    ----------
    data : iterable of (story, question, answer)
        ``story`` and ``question`` are sequences of word tokens; ``answer`` is
        a single word. Every word is lowercased before it is looked up.
    word_index : dict
        Maps a lowercase word to its integer index.
    max_story_len : int
        ``maxlen`` passed to ``pad_sequences`` for the stories.
    max_question_len : int
        ``maxlen`` passed to ``pad_sequences`` for the questions.

    Returns
    -------
    tuple
        ``(stories, questions, answers)`` where ``stories`` and ``questions``
        are the outputs of ``pad_sequences`` and ``answers`` is an array with
        one one-hot row of length ``len(word_index) + 1`` per sample.

    Raises
    ------
    KeyError
        If a lowercased word is not a key of ``word_index``.

    Notes
    -----
    The default argument values are captured when this ``def`` is executed, so
    re-run this cell after refitting the tokenizer or changing the max lengths.
    """
    # One extra slot for index 0; the tokenizer's word indices start at 1.
    n_classes = len(word_index) + 1

    X = []   # stories as index lists
    Xq = []  # questions as index lists
    Y = []   # one-hot encoded answers

    for story, question, answer in data:
        X.append([word_index[word.lower()] for word in story])
        Xq.append([word_index[word.lower()] for word in question])

        y = np.zeros(n_classes)
        # Lowercase the answer too, so it is looked up the same way as the
        # story and question tokens.
        y[word_index[answer.lower()]] = 1
        Y.append(y)

    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [26]:
# Vectorize each split; every call returns (stories, questions, one-hot answers).
# The training answers get their own name so they are not overwritten by the
# test answers on the next line.
stories_train, questions_train, answers_train = vectorize_stories(train_data)
stories_test, questions_test, answers_test = vectorize_stories(test_data)

In [27]:
# One-hot answer rows for the test split.
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# Position of the 'yes' class inside each one-hot answer row.
tokenizer.word_index['yes']

20

In [29]:
# Position of the 'no' class inside each one-hot answer row.
tokenizer.word_index['no']

33

In [30]:
# Column-wise totals of the one-hot rows: how many test answers fall on each
# index. Only the 'yes' and 'no' positions should be non-zero.
answers_test.sum(axis=0)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
       503.,   0.,   0.,   0.,   0.])