In [124]:
from __future__ import print_function
import numpy as np
from rnn_utils import *   #Keras imports done here

import csv

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
#Parse get the data in (story, q, a) format. The story is flattened.
train, test = get_data()


In [4]:
#Build the vocabulary
vocab = set()
for story, q, answer in train + test:  #+ => extend
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)
# Reserve 0 for masking via pad_sequences - Keras requirement
vocab_size = len(vocab) + 1

In [10]:
#For Lookup in Word Embedding Matrix
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
#Needed for padding the story and query to max lengths and vectorizing them
story_maxlen = max(map(len, (x for x, _, _ in train + test)))  #max words in a story
query_maxlen = max(map(len, (x for _, x, _ in train + test)))
print("Max words in a question, story::", story_maxlen,",", query_maxlen)

Max words in a question, story:: 156 , 6


In [12]:
#x is list of supportstory sentences for each example: A sentecnce here is list of word indexes
#y is a on-hot representation over entire vocab space. bAbI vocab space is quite less:37
x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))

x.shape = (1000, 156)
xq.shape = (1000, 6)
y.shape = (1000, 38)


In [72]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 40

In [107]:
#Loading GLOVECS
GLOVE_path = "./raw/glove.6B/glove.6B.50d.txt"
reader = csv.reader(open(GLOVE_path), delimiter=' ', quoting=csv.QUOTE_NONE) 
GLOVES = {line[0]: np.array(list(map(float, line[1: ]))) for line in reader}

In [108]:
glove_embed_matrix = np.random.randn(vocab_size, EMBED_HIDDEN_SIZE)
for word in vocab:
    idx = word_idx[word]
    glove_embed_matrix[idx,:] = GLOVES.get(word, np.random.randn(EMBED_HIDDEN_SIZE))

In [112]:
#Model construction -- Question repr as an input
sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE, weights=[glove_embed_matrix], trainable=True)(sentence)
#encoded_sentence.shape is (Batch_size, story_maxlen, EMBEDDING_SIZE)
encoded_sentence = layers.Dropout(0.3)(encoded_sentence)

question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE, weights=[glove_embed_matrix], trainable=True)(question)
encoded_question = layers.Dropout(0.3)(encoded_question)
encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question)
encoded_question = layers.RepeatVector(story_maxlen)(encoded_question)
# So the example reads everything into a single vector, then uses that vector to reconstruct the original sequence. If you want to iteratively
# generate something but you only have one input, you can repeat the vector. That means each time step will get the same input but a different
# hidden state.

#Encoded question is added(literally) to every word input at every timestep
merged = layers.add([encoded_sentence, encoded_question])
merged = RNN(EMBED_HIDDEN_SIZE)(merged)
merged = layers.Dropout(0.3)(merged)
preds = layers.Dense(vocab_size, activation='softmax')(merged)

model = Model([sentence, question], preds)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [119]:
print('Training')
model.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)
loss, acc = model.evaluate([tx, txq], ty, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Training
Train on 950 samples, validate on 50 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss / test accuracy = 0.6589 / 0.6110


In [97]:
RNN_2 = recurrent.GRU

#Model - Merging two separate state representations of Story and Question
question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
# encoded_question = layers.Dropout(0.3)(encoded_question)
encoded_question1 = RNN(QUERY_HIDDEN_SIZE)(encoded_question)

story = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_story = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(story)
# encoded_question = layers.Dropout(0.3)(encoded_question)
encoded_story1 = RNN(SENT_HIDDEN_SIZE)(encoded_story)

merged_state_op = layers.add([encoded_story1, encoded_question1])
preds = layers.Dense(vocab_size, activation='softmax')(merged_state_op)

model2 = Model([story, question], preds)
model2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [100]:
print("Training")
model2.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)
loss, acc = model.evaluate([tx, txq], ty, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Training
Train on 950 samples, validate on 50 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss / test accuracy = 0.7911 / 0.4970


In [122]:
t_probs = model.predict([tx, txq], batch_size=None, verbose=0, steps=None)
t_preds = np.argmax(t_probs, axis=1)
# vocab[25]
frmt = "Pred:{}-Truth:{}\t"
fail = []
for idx, i_pred in enumerate(t_preds):
    g_id = np.argmax(ty[idx])
    if i_pred != g_id: fail.append(idx)
    print(frmt.format(vocab[i_pred-1], vocab[g_id-1]), end="")


Pred:no-Truth:no	Pred:no-Truth:no	Pred:yes-Truth:yes	Pred:yes-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:no	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:yes-Truth:no	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:no	Pred:no-Truth:no	Pred:yes-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:no	Pred:yes-Truth:yes	Pred:yes-Truth:yes	Pred:yes-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:yes-Truth:yes	Pred:no-Truth:no	Pred:yes-Truth:no	Pred:no-Truth:no	Pred:yes-Truth:no	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:yes-Truth:yes	Pred:yes-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:no	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:yes-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:no	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:yes	Pred:yes-Truth:yes	Pred:yes-Truth:yes	Pred:no-Truth:yes	Pred:no-Truth:no	Pr

In [123]:
print("Total Failed:", len(fail))    
fail = np.array(fail)
idxs = np.random.choice(fail, 3)
for idx in idxs:
    s, q, a = test[idx]
    s = " ".join(_.decode('ascii') for _ in s)
    q = " ".join(_.decode('ascii') for _ in q)
    print("-->", s, q, a)

Total Failed: 389
--> John moved to the office . Daniel journeyed to the garden . John picked up the milk there . Sandra took the football there . Mary went back to the kitchen . Daniel journeyed to the bedroom . John left the milk . Mary journeyed to the bathroom . Mary moved to the bedroom . Daniel went back to the office . Is Daniel in the office ? yes
--> Mary went to the garden . Mary went to the bedroom . Sandra went back to the garden . Daniel grabbed the milk there . Mary journeyed to the hallway . John travelled to the kitchen . John went back to the garden . Sandra went back to the bedroom . Sandra moved to the kitchen . Daniel put down the milk . Is Sandra in the kitchen ? yes
--> Mary went back to the garden . Daniel journeyed to the hallway . Mary went to the bathroom . Daniel went back to the bathroom . Is Daniel in the kitchen ? no
