# Preprocessing

In [1]:
import pickle
import numpy as np

In [2]:
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [7]:
with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [8]:
type(test_data)

list

In [9]:
test_data[0]

(['Mary',
  'got',
  'the',
  'milk',
  'there',
  '.',
  'John',
  'moved',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'John', 'in', 'the', 'kitchen', '?'],
 'no')

In [10]:
len(train_data)

10000

In [11]:
len(test_data)

1000

In [13]:
story = train_data[0][0]
' '.join(story)

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [14]:
question = train_data[0][1]
' '.join(question)

'Is Sandra in the hallway ?'

In [15]:
answer = train_data[0][2]
answer

'no'

In [16]:
# We need to set up a vocabulary of all of our words in the dataset.
all_data = test_data + train_data

In [18]:
vocab = set()

# Use a set for unique words.
for story, question, answer in all_data:
    vocab = vocab.union(set(story)) # .union is a set() function. Find unique between two sets...
    vocab = vocab.union(set(question))
    

In [19]:
vocab.add('no')

In [20]:
vocab.add('yes')

In [21]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [22]:
vocab_length = len(vocab) + 1 # Because of the keras placeholder

In [23]:
vocab_length

38

In [24]:
# How long is the longest story and longest question??
longest_story = max([len(data[0]) for data in all_data])
longest_question = max([len(data[1]) for data in all_data])

In [25]:
longest_story

156

In [26]:
longest_question

6

# Vectorise the data

In [29]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [30]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab) # Vocab object comes into play

In [32]:
tokenizer.word_index # Lower cased for us!

{'office': 1,
 'milk': 2,
 'back': 3,
 'daniel': 4,
 'put': 5,
 'the': 6,
 'got': 7,
 'took': 8,
 'yes': 9,
 'picked': 10,
 'in': 11,
 'up': 12,
 'left': 13,
 'to': 14,
 'sandra': 15,
 'john': 16,
 'grabbed': 17,
 'kitchen': 18,
 '?': 19,
 'is': 20,
 'football': 21,
 'no': 22,
 '.': 23,
 'garden': 24,
 'discarded': 25,
 'went': 26,
 'there': 27,
 'bedroom': 28,
 'bathroom': 29,
 'travelled': 30,
 'apple': 31,
 'mary': 32,
 'moved': 33,
 'journeyed': 34,
 'hallway': 35,
 'down': 36,
 'dropped': 37}

## Tokenize stories, questions and answers

In [34]:
train_story_text = []
train_question_text = []
train_answer_text = []

for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answer_text.append(answer)

In [35]:
# Convert the text arrays to arrays of index calls...
train_story_sequences = tokenizer.texts_to_sequences(train_story_text)

In [37]:
train_story_text

[['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.',
  'Mary',
  'went',
  'back',
  'to',
  'the',
  'bedroom',
  '.',
  'Daniel',
  'went',
  'back',
  'to',
  'the',
  'hallway',
  '.'],
 ['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.',
  'Mary',
  'went',
  'back',
  'to',
  'the',
  'bedroom',
  '.',
  'Daniel',
  'went',
  'back',
  'to',
  'the',
  'hallway',
  '.',
  'Sandra',
  'went',
  'to',
  'the',
  'kitchen',
  '.',
  'Daniel',
  'went',
  'back',
  'to',
  'the',
  'bathroom',
  '.'],
 ['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.',
  'Mary',
  'went',
  'back',
  'to',
  'the',
  'bedroom',
  '.',
  'Daniel',
  'went',
  'back',
  'to

In [38]:
train_story_sequences

[[32, 33, 14, 6, 29, 23, 15, 34, 14, 6, 28, 23],
 [32,
  33,
  14,
  6,
  29,
  23,
  15,
  34,
  14,
  6,
  28,
  23,
  32,
  26,
  3,
  14,
  6,
  28,
  23,
  4,
  26,
  3,
  14,
  6,
  35,
  23],
 [32,
  33,
  14,
  6,
  29,
  23,
  15,
  34,
  14,
  6,
  28,
  23,
  32,
  26,
  3,
  14,
  6,
  28,
  23,
  4,
  26,
  3,
  14,
  6,
  35,
  23,
  15,
  26,
  14,
  6,
  18,
  23,
  4,
  26,
  3,
  14,
  6,
  29,
  23],
 [32,
  33,
  14,
  6,
  29,
  23,
  15,
  34,
  14,
  6,
  28,
  23,
  32,
  26,
  3,
  14,
  6,
  28,
  23,
  4,
  26,
  3,
  14,
  6,
  35,
  23,
  15,
  26,
  14,
  6,
  18,
  23,
  4,
  26,
  3,
  14,
  6,
  29,
  23,
  4,
  10,
  12,
  6,
  21,
  27,
  23,
  4,
  26,
  14,
  6,
  28,
  23],
 [32,
  33,
  14,
  6,
  29,
  23,
  15,
  34,
  14,
  6,
  28,
  23,
  32,
  26,
  3,
  14,
  6,
  28,
  23,
  4,
  26,
  3,
  14,
  6,
  35,
  23,
  15,
  26,
  14,
  6,
  18,
  23,
  4,
  26,
  3,
  14,
  6,
  29,
  23,
  4,
  10,
  12,
  6,
  21,
  27,
  23,
  4,
  26,
  14,

In [41]:
# Functionise...
def vectorise_stories(
    data, 
    word_index_dict=tokenizer.word_index,
    max_story_len=longest_story,
    max_question_len=longest_question
):
    X_stories = []
    X_questions = []
    y_answers = []
    
    for story, question, answer in data:
        # Look like [23, 12, 1, 0, 5, 6, 7, 1, 3]
        x_story = [word_index_dict[word.lower()] for word in story]
        x_question = [word_index_dict[word.lower()] for word in question]
        y = np.zeros(len(word_index_dict) + 1)
        
        y[word_index_dict[answer]] = 1
        X_stories.append(x_story)
        X_questions.append(x_question)
        y_answers.append(y)
    
    # Now the sequences created need to be padded.
    
    return (
        pad_sequences(X_stories, maxlen=max_story_len),
        pad_sequences(X_questions, maxlen=max_question_len),
        np.array(y_answers)
    )

In [85]:
inputs_train, outputs_train, answer_train = vectorise_stories(train_data)

In [86]:
inputs_test, outputs_test, answer_test = vectorise_stories(test_data)

In [44]:
inputs_train

array([[ 0,  0,  0, ...,  6, 28, 23],
       [ 0,  0,  0, ...,  6, 35, 23],
       [ 0,  0,  0, ...,  6, 29, 23],
       ...,
       [ 0,  0,  0, ...,  6, 28, 23],
       [ 0,  0,  0, ...,  2, 27, 23],
       [ 0,  0,  0, ..., 31, 27, 23]], dtype=int32)

In [47]:
inputs_test[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0, 32,  7,  6,  2, 27, 23, 16, 33, 14,
        6, 28, 23], dtype=int32)

In [49]:
tokenizer.word_index['no']

22

In [50]:
tokenizer.word_index['yes']

9

In [52]:
sum(answer_test) # 497 yesses and 503 nos

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
       503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

# Build the model

In [54]:
from keras.models import Sequential, Model

In [56]:
from keras.layers import Embedding

In [57]:
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [58]:
# Input is used to build a keras tensor
# Placeholder = (longest_story, batch_size), we do not know the batch size yet.
input_sequence = Input((longest_story, ))
question = Input((longest_question, ))

# Input encoders.
vocab_size = len(vocab) + 1

"""
Input encoder M.
"""
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
# Dropout layer turns off a percentage of neurons off randomly during training to prevent overfitting.
input_encoder_m.add(Dropout(0.3))
# This layer will output (samples, story_max_length, embedding_dimensions)

In [59]:
"""
Input encoder C.
"""
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=longest_question))
# Dropout layer turns off a percentage of neurons off randomly during training to prevent overfitting.
input_encoder_c.add(Dropout(0.3))
# This layer will output (samples, story_max_length, question_max_length)

In [61]:
"""
Question encoder.
"""
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=longest_question))
# Dropout layer turns off a percentage of neurons off randomly during training to prevent overfitting.
question_encoder.add(Dropout(0.3))
# This layer will output (samples, question_max_length, embedding_dimension)

In [62]:
"""
Encode the sequences.
"""

# Encoded <-- Encoder(input)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [63]:
match = dot([input_encoded_m, question_encoded], axes=(2, 2))

In [64]:
match = Activation('softmax')(match)

In [66]:
response = add([match, input_encoded_c])
response = Permute((2, 1))(response)

In [70]:
answer_tensor = concatenate((response, question_encoded))

In [71]:
answer_tensor

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate_1')>

In [72]:
answer = LSTM(32)(answer_tensor)

In [73]:
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer) # (samples, vocab_size)

In [74]:
# Output probability distribution
answer = Activation('softmax')(answer)

In [76]:
model = Model([input_sequence, question], answer)

In [77]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [78]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 156)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, None, 64)             2432      ['input_1[0][0]']             
                                                                                                  
 sequential_3 (Sequential)   (None, 6, 64)                2432      ['input_2[0][0]']             
                                                                                              

# Train and fit the model

In [87]:
history = model.fit(
    [inputs_train, outputs_train], 
    answer_train, 
    batch_size=32, 
    epochs=100, 
    validation_data=((inputs_test, outputs_test), answer_test)
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [88]:
import matplotlib.pyplot as plt
%matplotlib inline
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


KeyError: 'acc'

In [89]:
model.save('mycrazymodel.keras')

In [92]:
pred_results = model.predict(([inputs_test, outputs_test]))



In [93]:
test_data[0][0]

['Mary',
 'got',
 'the',
 'milk',
 'there',
 '.',
 'John',
 'moved',
 'to',
 'the',
 'bedroom',
 '.']

In [94]:
pred_results

array([[1.11659145e-11, 1.08382400e-11, 1.09521602e-11, ...,
        1.20786679e-11, 1.22734895e-11, 1.06860536e-11],
       [2.22124489e-12, 2.16601043e-12, 1.91340151e-12, ...,
        2.39321475e-12, 2.41088096e-12, 2.23320754e-12],
       [7.03772707e-10, 7.63578201e-10, 7.79901865e-10, ...,
        6.96680824e-10, 7.02694292e-10, 6.99087510e-10],
       ...,
       [5.62074796e-11, 5.61666477e-11, 4.94631038e-11, ...,
        5.94803859e-11, 6.03651226e-11, 5.68877237e-11],
       [1.13910936e-10, 9.88899795e-11, 1.03289675e-10, ...,
        1.06976566e-10, 1.27292982e-10, 1.11885744e-10],
       [7.26567140e-11, 6.69950415e-11, 5.76670343e-11, ...,
        6.19929039e-11, 7.91482574e-11, 7.16600043e-11]], dtype=float32)

In [95]:
pred_results.shape

(1000, 38)

In [96]:
pred = np.argmax(pred_results[0])

In [97]:
pred

22

In [98]:
for key, val in tokenizer.word_index.items():
    if (val == pred): k = key

In [99]:
k

'no'

In [100]:
# How sure are you?
pred_results[0][22]

0.9808879

In [101]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [115]:
my_story = 'John left the kitchen . Sandra dropped the football in the garden . ' # we need spaces around punct

In [116]:
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'the',
 'football',
 'in',
 'the',
 'garden',
 '.']

In [117]:
my_question = 'Is the football in the garden .'
my_question.split()

['Is', 'the', 'football', 'in', 'the', 'garden', '.']

In [121]:
mydata = [
    ((my_story.split()), (my_question.split()), 'yes')
]

In [122]:
mydata

[(['John',
   'left',
   'the',
   'kitchen',
   '.',
   'Sandra',
   'dropped',
   'the',
   'football',
   'in',
   'the',
   'garden',
   '.'],
  ['Is', 'the', 'football', 'in', 'the', 'garden', '.'],
  'yes')]

In [123]:
s, q, a = vectorise_stories(mydata)

In [125]:
# Predict off story and question...
pred_results = model.predict(([s, q]))



In [132]:
pred = np.argmax(pred_results[0])
for key, val in tokenizer.word_index.items():
    if (val == pred): k = key

In [133]:
val

37

In [134]:
k

'yes'

In [136]:
pred_results[0][pred]

0.95464283