In [1]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np
import os as os
import json

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers, callbacks, models
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten


from keras import regularizers

Using TensorFlow backend.


In [2]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]


def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.
    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data
            if not max_length or len(flatten(story)) < max_length]
    return data


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
    return (pad_sequences(xs, maxlen=story_maxlen),
            pad_sequences(xqs, maxlen=query_maxlen), np.array(ys))





In [3]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 128
EPOCHS = 40
# Regularization parameter
LAMBDA = 0.01
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

try:
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/'
                           'babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2'
          '.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise




RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


In [4]:
# Default QA1 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
with tarfile.open(path) as tar:
    train = get_stories(tar.extractfile(challenge.format('train')))
    test = get_stories(tar.extractfile(challenge.format('test')))
np.random.shuffle(train)
np.random.shuffle(test)
vocab = set()
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))


  return _compile(pattern, flags).split(string, maxsplit)


vocab = ['.', '?', 'Daniel', 'John', 'Mary', 'Sandra', 'Where', 'apple', 'back', 'bathroom', 'bedroom', 'discarded', 'down', 'dropped', 'football', 'garden', 'got', 'grabbed', 'hallway', 'is', 'journeyed', 'kitchen', 'left', 'milk', 'moved', 'office', 'picked', 'put', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went']
x.shape = (1000, 552)
xq.shape = (1000, 5)
y.shape = (1000, 36)
story_maxlen, query_maxlen = 552, 5


In [5]:
print("Building the embedding matrix...")
GLOVE_PATH = '..\\Embeddings'


f = open(os.path.join(GLOVE_PATH,"glove.6B.{}d.txt".format(EMBED_HIDDEN_SIZE)), 'r', encoding = "ANSI")
embeddings_index = {}
for line in f:
    values = line.split(" ")
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except ValueError:
        print(values[1:])
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((len(word_idx) + 1, EMBED_HIDDEN_SIZE))
for word, i in word_idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Found %s word vectors.' % len(embeddings_index))

Building the embedding matrix...
Found 400000 word vectors.


In [9]:
print('Build model...')



question_Input = layers.Input(shape=xq[0].shape, name='question_Input')
story_Input = layers.Input(shape=x[0].shape, name='story_Input')

#Embed question
q_Embedding = layers.Embedding(input_dim = vocab_size, output_dim = EMBED_HIDDEN_SIZE, \
                               weights = [embedding_matrix], input_length = query_maxlen)(question_Input)
# Bidirectional GRU (optimal dropout approx 0.4 without regularization)
q_Encode = layers.Bidirectional(recurrent.GRU(EMBED_HIDDEN_SIZE, return_sequences=True,\
                                              kernel_regularizer = regularizers.l2(LAMBDA), dropout=0.3))(q_Embedding)
q_Encode = layers.Reshape((query_maxlen, 2*EMBED_HIDDEN_SIZE))(q_Encode)

#Embed story
s_Embedding = layers.Embedding(input_dim = vocab_size, output_dim = EMBED_HIDDEN_SIZE, \
                               weights = [embedding_matrix], input_length = story_maxlen)(story_Input)
# Bidirectional GRU (optimal dropout approx 0.4 without regularization)
s_Encode = layers.Bidirectional(recurrent.GRU(EMBED_HIDDEN_SIZE, return_sequences=True, \
                                              kernel_regularizer = regularizers.l2(LAMBDA), dropout=0.3))(s_Embedding)
s_Encode = layers.Reshape((story_maxlen, 2*EMBED_HIDDEN_SIZE))(s_Encode)

# Attention Layer
# Multiply between context and query to form attention
# Resultant matrix should be MxN, taking in Mxd and Nxd 
# embedded question/answer matrices where d is 2*EMBED_HIDDEN_SIZE
dot_merge = layers.Dot(axes = [2,2])([s_Encode, q_Encode])

# Flatten and compute softmax for each attent distro
flat = Flatten()(dot_merge)
dense = layers.Dense(query_maxlen * story_maxlen, kernel_regularizer = regularizers.l2(LAMBDA))(flat)
# act = layers.Activation("softmax")(dense)
act = layers.Activation("softmax")(dot_merge)


# Reshape back into the original dimensions (MxN)
act_resh = layers.Reshape((story_maxlen, query_maxlen), input_shape=(query_maxlen,))(act)
# Compute attention output as an element-wise multiplication
attn_out = layers.Dot(axes=[2,1])([act_resh, q_Encode])
# Next we concatenate to form a blended representation of the same dimension as an encoded question,
# of which there exists one for every given context hidden state. Should be 4H x 2N
blended = layers.Concatenate(axis=2)([s_Encode, attn_out])
flat2 = Flatten()(blended)
#relu = layers.Activation("relu")(flat2)

#####
# TODO: Finish Logit + fully connected layer for RELU 
relu = layers.Dense(EMBED_HIDDEN_SIZE, activation = "relu")(flat2)
#logit = layers.Dense(1)(relu)
#####
#print(relu.get_shape(), logit.get_shape())


dense2 = layers.Dense(vocab_size, activation = "softmax", kernel_regularizer = regularizers.l2(LAMBDA))(relu)
######
# TODO: Add vanilla softmax at output (no weight vector here, i.e. no Dense) 
# dense2 = layers.Activation("softmax")(logit)
#####

model = Model(inputs=[story_Input, question_Input], outputs = [dense2])
print(model.summary())

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


Build model...
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
story_Input (InputLayer)         (None, 552)           0                                            
____________________________________________________________________________________________________
question_Input (InputLayer)      (None, 5)             0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 552, 50)       1800        story_Input[0][0]                
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 5, 50)         1800        question_Input[0][0]             
____________________________________________________________________________

In [10]:
print('Training')
print(model.summary())
outpath = "../Outputs/baseline.h5"
saverCallback = callbacks.ModelCheckpoint(filepath = outpath, monitor = "val_loss", verbose = 1)
history = model.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=100,
          validation_split=0.05,
          callbacks = [saverCallback])
loss, acc = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)

hist_out = "../Outputs/baseline_history.json"
with open(hist_out, 'w') as histFile:
    json.dump(history.history, histFile)
    
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Training
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
story_Input (InputLayer)         (None, 552)           0                                            
____________________________________________________________________________________________________
question_Input (InputLayer)      (None, 5)             0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 552, 50)       1800        story_Input[0][0]                
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 5, 50)         1800        question_Input[0][0]             
__________________________________________________________________________________

Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100


Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Test loss / test accuracy = 4.5682 / 0.2600


In [28]:
y_pred = model.predict([tx, txq],
                           batch_size=BATCH_SIZE)

In [29]:
print(ty.shape)
print(y_pred.shape)


(1000, 36)
(1000, 36)


In [75]:
y_pred_tar = y_pred
for i in range(len(y_pred_tar)):
    y_pred_tar[i][y_pred_tar[i] >= max(y_pred_tar[i])] = 1
    y_pred_tar[i][y_pred_tar[i] < max(y_pred_tar[i])] = 0
print(y_pred_tar[0])

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [76]:
total = ty.shape[0]
EM = 0
for i, j in zip(ty, y_pred_tar):
    if np.argmax(i) == np.argmax(j):
        EM += 1
print(str(EM) + '/' + str(total))

187/1000


In [51]:
numones = 0
for example in ty:
    for index in example:
        if index == 1:
            numones+=1
print(numones)

1000


In [52]:
print(train)

[(['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Sandra', 'journeyed', 'to', 'the', 'bedroom', '.', 'Mary', 'got', 'the', 'football', 'there', '.', 'John', 'went', 'to', 'the', 'kitchen', '.', 'Mary', 'went', 'back', 'to', 'the', 'kitchen', '.', 'Mary', 'went', 'back', 'to', 'the', 'garden', '.'], ['Where', 'is', 'the', 'football', '?'], 'garden'), (['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Sandra', 'journeyed', 'to', 'the', 'bedroom', '.', 'Mary', 'got', 'the', 'football', 'there', '.', 'John', 'went', 'to', 'the', 'kitchen', '.', 'Mary', 'went', 'back', 'to', 'the', 'kitchen', '.', 'Mary', 'went', 'back', 'to', 'the', 'garden', '.', 'Sandra', 'went', 'back', 'to', 'the', 'office', '.', 'John', 'moved', 'to', 'the', 'office', '.', 'Sandra', 'journeyed', 'to', 'the', 'hallway', '.', 'Daniel', 'went', 'back', 'to', 'the', 'kitchen', '.', 'Mary', 'dropped', 'the', 'football', '.', 'John', 'got', 'the', 'milk', 'there', '.'], ['Where', 'is', 'the', 'football', '?'], 'garden'), ([

In [None]:
#print('Build model...')

# Encode the question and answer layers sequentially
# first embed using the embedding weights, then pass to the LSTM
#encoded_question = Sequential()
#q_embedding = layers.Embedding(input_dim = vocab_size, output_dim = EMBED_HIDDEN_SIZE, weights = [embedding_matrix], input_length = story_maxlen)
#encoded_question.add(q_embedding)
#encoded_question.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=True))
#encoded_question.add(layers.Dropout(0.3))

#encoded_sentence = Sequential()
#s_embedding = layers.Embedding(input_dim = vocab_size, output_dim = EMBED_HIDDEN_SIZE, weights = [embedding_matrix], input_length = story_maxlen)
#encoded_sentence.add(s_embedding)
#encoded_sentence.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=True))
#encoded_sentence.add(layers.Dropout(0.3))


# Attention layer as the dot product of embedded questions and answers
#attn = Sequential()
#attn.add(Merge([encoded_sentence, encoded_question], mode="dot", dot_axes=[1, 1]))
#attn.add(Flatten())
#attn.add(layers.Dense(EMBED_HIDDEN_SIZE * story_maxlen))
#attn.add(layers.Reshape((story_maxlen, EMBED_HIDDEN_SIZE)))

# Compile
#model = Sequential()
#model.add(Merge([encoded_question, attn], mode = "sum"))
#model.add(Flatten())
#model.add(layers.Dense(len(word_idx) + 1, activation = "softmax"))
#model.compile(optimizer='adam',
#              loss='categorical_crossentropy',
#              metrics=['accuracy'])

In [25]:
foo = models.load_model(outpath)

In [32]:
foo.history

AttributeError: 'Model' object has no attribute 'history'

In [30]:
type(model)

keras.engine.training.Model

In [None]:
print(tx)
article = ''
question = ''
print(word_idx)
inv_word_idx = {v: k for k, v in word_idx.items()}
print(inv_word_idx)
for word in tx[4]:
    if word != 0:
        article += inv_word_idx[word] + ' '
for word in txq[4]:
    if word != 0:
        question += inv_word_idx[word] + ' '
print(article)
print(question)
print(inv_word_idx[np.argmax(ty[4])])
print(inv_word_idx[np.argmax(y_pred_tar[4])])