In [1]:
from __future__ import print_function

import collections
import collections.abc
import json
import os as os
import re
import tarfile
from datetime import datetime
from functools import reduce

import numpy as np

import keras.backend as K
from keras import layers, callbacks, models
from keras import regularizers
from keras.layers import Flatten
from keras.layers import recurrent
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
from keras_self_attention import SeqSelfAttention

Using TensorFlow backend.


In [2]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    The sentence is split on runs of non-word characters. The capturing group
    keeps those separators in the result, so punctuation survives as its own
    token, while pieces that are empty or whitespace-only are dropped.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    # The separator group must not be optional: a pattern that can match the
    # empty string makes re.split() cut between every character on
    # Python 3.7+ and produces None for the group that did not participate.
    pieces = re.split(r'(\W+)', sent)
    return [piece.strip() for piece in pieces if piece.strip()]


def parse_stories(lines):
    """Turn raw bAbI lines into (story, question, answer) examples.

    Each element of ``lines`` is a bytes object of the form ``"<id> <text>"``.
    An id of 1 starts a new story. A text containing a tab is a question line
    with three tab-separated fields (question, answer, supporting ids); any
    other text is a statement that is tokenized and remembered.

    For every question, the statements seen so far in the current story are
    emitted as the context, each prefixed with an ``"<position>:"`` token.
    Question lines occupy a position in the story too (as an empty
    placeholder) but are left out of the context.

    Returns:
        A list of ``(context, question_tokens, answer)`` tuples, where
        ``context`` is a list of token lists and ``answer`` is the raw string.
    """
    examples = []
    memory = []
    for raw in lines:
        number, text = raw.decode('utf-8').strip().split(' ', 1)
        if int(number) == 1:
            # Line numbering restarted: forget the previous story.
            memory = []
        if '\t' not in text:
            memory.append(tokenize(text))
            continue
        # The supporting-fact ids (third field) are not used.
        question, answer, _supporting = text.split('\t')
        question_tokens = tokenize(question)
        context = [[str(position) + ":"] + sentence
                   for position, sentence in enumerate(memory)
                   if sentence]
        examples.append((context, question_tokens, answer))
        # Placeholder keeps later statement positions aligned with line order.
        memory.append('')
    return examples


def get_stories(f):
    """Read all lines from the file object ``f`` and parse them.

    Returns:
        A list of ``(story, question, answer)`` tuples as produced by
        ``parse_stories``.
    """
    parsed = parse_stories(f.readlines())
    triples = []
    for story, q, answer in parsed:
        triples.append((story, q, answer))
    return triples


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    """Convert tokenized examples into integer arrays.

    Args:
        data: iterable of ``(story, query, answer)`` where ``story`` is a list
            of token lists, ``query`` a token list and ``answer`` one token.
        word_idx: mapping from token to integer id; every token must be present.
        story_maxlen: ``maxlen`` handed to ``pad_sequences`` for story sentences.
        query_maxlen: ``maxlen`` handed to ``pad_sequences`` for the queries.

    Returns:
        A 3-tuple: a list with one ``pad_sequences`` result per story, the
        ``pad_sequences`` result for all queries, and an array holding a
        single-element id list per answer.
    """
    story_ids, query_ids, answer_ids = [], [], []
    for story, query, answer in data:
        story_ids.append([[word_idx[token] for token in sentence]
                          for sentence in story])
        query_ids.append([word_idx[token] for token in query])
        answer_ids.append([word_idx[answer]])

    # Each story is padded on its own, so stories keep differing row counts.
    padded_stories = [pad_sequences(sentences, maxlen=story_maxlen)
                      for sentences in story_ids]
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))

def do_flatten(el):
    """Return True when ``el`` is a container that should be expanded.

    Any iterable qualifies except ``str`` and ``bytes``, which are treated as
    atomic values.
    """
    # The ABC lives in collections.abc; the bare ``collections.Iterable``
    # alias was removed in Python 3.10.
    return isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes))
def flatten(l):
    """Lazily yield the leaves of an arbitrarily nested iterable, depth first.

    An element is descended into when ``do_flatten`` accepts it; anything else
    is yielded unchanged, in encounter order.
    """
    for item in l:
        if not do_flatten(item):
            yield item
            continue
        for leaf in flatten(item):
            yield leaf
            
def stack_inputs(inputs, maxsents=None, maxlen=None):
    """Zero-pad every story to the same number of sentences and stack them.

    Args:
        inputs: sequence of 2-D integer arrays, one per story, each with
            ``maxlen`` columns and at most ``maxsents`` rows.
        maxsents: number of rows each story is padded to. Defaults to the
            module-level ``story_maxsents``.
        maxlen: number of columns of the padding rows. Defaults to the
            module-level ``story_maxlen``.

    Returns:
        One array of shape ``(len(inputs), maxsents, maxlen)``. All-zero rows
        are appended after the existing rows of each story.

    The ``inputs`` sequence itself is not modified.
    """
    if maxsents is None:
        maxsents = story_maxsents
    if maxlen is None:
        maxlen = story_maxlen
    # Build a new list instead of overwriting the caller's entries in place.
    padded = [
        np.concatenate([it, np.zeros((maxsents - it.shape[0], maxlen), 'int')])
        for it in inputs
    ]
    return np.stack(padded)



In [3]:
# --- Hyper-parameters ---------------------------------------------------
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 100
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
# NOTE(review): the fit() calls visible in this notebook hard-code
# batch_size=32 and 100 epochs instead of reading these two constants.
BATCH_SIZE = 128
EPOCHS = 40
MODEL_NAME = "LSTM_SelfAttention"
# Regularization parameter
LAMBDA = 0.02
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

# --- Dataset download ---------------------------------------------------
# `path` is the local archive location returned by get_file.
try:
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/'
                           'babi_tasks_1-20_v1-2.tar.gz')
except Exception:
    # Only genuine errors get the manual-download hint; a KeyboardInterrupt
    # propagates without being reported as a failed download.
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2'
          '.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise




RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 100, 100, 100


In [4]:
# Choose the bAbI task file. The '{}' placeholder is filled with 'train' or
# 'test' below. Uncomment exactly one alternative.
# Default QA1 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
# Parse both splits straight out of the downloaded archive.
with tarfile.open(path) as tar:
    train_stories = get_stories(tar.extractfile(challenge.format('train')))
    test_stories = get_stories(tar.extractfile(challenge.format('test')))
# In-place shuffles; no random seed is set in this cell, so the order
# differs from run to run.
np.random.shuffle(train_stories)
np.random.shuffle(test_stories)

# Sizes and vocabulary are computed over both splits together.
stories = train_stories + test_stories

# Longest sentence (in tokens, including the "<position>:" prefix token),
# largest number of sentences in a story, and longest question.
story_maxlen = max((len(s) for x, _, _ in stories for s in x))
story_maxsents = max((len(x) for x, _, _ in stories))
query_maxlen = max(len(x) for _, x, _ in stories)

# Every distinct token of stories, questions and answers, sorted.
vocab = sorted(set(flatten(stories)))
# '<PAD>' takes index 0, the value stack_inputs uses for its padding rows.
vocab.insert(0, '<PAD>')
vocab_size = len(vocab)
print(vocab_size)
print(story_maxlen)
print(story_maxsents)
print(query_maxlen)

# Token -> integer id, following the order of `vocab`.
word_idx = dict((c, i) for i, c in enumerate(vocab))

inputs_train, queries_train, answers_train = vectorize_stories(train_stories, 
     word_idx, story_maxlen, query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories, 
     word_idx, story_maxlen, query_maxlen)

# stack_inputs reads the story_maxsents / story_maxlen globals defined above
# and returns one stacked array per split.
inputs_train = stack_inputs(inputs_train)
inputs_test = stack_inputs(inputs_test)

# Model inputs as [stories, questions]; the test split serves as validation data.
inps = [inputs_train, queries_train]
val_inps = [inputs_test, queries_test]


  return _compile(pattern, flags).split(string, maxsplit)


124
8
88
5


In [61]:
print("Building the embedding matrix...")
# Joined with os.path so the relative location also resolves on hosts whose
# path separator is not a backslash.
GLOVE_PATH = os.path.join('..', 'Embeddings')

# Parse the GloVe text file: each line is "<word> <v1> <v2> ... <vN>".
embeddings_index = {}
# GloVe files are UTF-8 text; the previous "ANSI" codec name only exists on
# Windows and decodes non-ASCII tokens incorrectly. The `with` block closes
# the file even if parsing fails half-way.
with open(os.path.join(GLOVE_PATH, "glove.6B.{}d.txt".format(EMBED_HIDDEN_SIZE)),
          'r', encoding='utf-8') as f:
    for line in f:
        values = line.rstrip().split(" ")
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Report the malformed line and skip it, rather than storing the
            # previous word's vector under this word.
            print(values[1:])
            continue
        embeddings_index[word] = coefs

# Row i holds the vector of the token whose id is i.
embedding_matrix = np.zeros((len(word_idx), EMBED_HIDDEN_SIZE))
for word, i in word_idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # glove.6B is an uncased vocabulary while tokenize() keeps capitals
        # ('Where', proper names), so retry with the lowercase form.
        embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Found %s word vectors.' % len(embeddings_index))

Building the embedding matrix...
Found 400000 word vectors.


In [47]:
print('Build model...')

emb_dim = EMBED_HIDDEN_SIZE

question_Input = layers.Input(shape=(query_maxlen,), name='question_Input')
story_Input = layers.Input(shape=(story_maxsents, story_maxlen), name='story_Input')

# Embed the question tokens; the table is initialised from embedding_matrix.
q_Embedding = layers.Embedding(input_dim=vocab_size, output_dim=emb_dim,
                               weights=[embedding_matrix],
                               input_length=query_maxlen)(question_Input)
# Bidirectional LSTM encoder
# (author's note: optimal dropout approx 0.4 without regularization).
q_Encode = layers.Bidirectional(
    recurrent.LSTM(emb_dim, return_sequences=True,
                   kernel_regularizer=regularizers.l2(LAMBDA),
                   dropout=0.3))(q_Embedding)
# Makes the expected shape explicit: 2*emb_dim features per question token.
q_Encode = layers.Reshape((query_maxlen, 2*emb_dim))(q_Encode)

# Embed the story with its own embedding layer, same initial weights.
s_Embedding = layers.Embedding(input_dim=vocab_size, output_dim=emb_dim,
                               weights=[embedding_matrix],
                               input_length=(story_maxsents, story_maxlen))(story_Input)
# Collapse the sentence axis: the whole story becomes one token sequence of
# length story_maxlen * story_maxsents.
s_Embedding = layers.Reshape((story_maxlen * story_maxsents, emb_dim))(s_Embedding)
# Bidirectional LSTM encoder
# (author's note: optimal dropout approx 0.4 without regularization).
s_Encode = layers.Bidirectional(
    recurrent.LSTM(emb_dim, return_sequences=True,
                   kernel_regularizer=regularizers.l2(LAMBDA),
                   dropout=0.3))(s_Embedding)
s_Encode = layers.Reshape((story_maxlen*story_maxsents, 2*emb_dim))(s_Encode)

# Attention layer.
# Dot product between every story position and every question position.
# With M story positions, N question positions and d = 2*emb_dim, this takes
# an (M, d) and an (N, d) matrix and gives an (M, N) score matrix.
dot_merge = layers.Dot(axes=[2, 2])([s_Encode, q_Encode])

# Softmax directly on the score matrix (Keras applies it over the last axis,
# i.e. over the question positions of each story position).
act = layers.Activation("softmax")(dot_merge)

# Keeps the (M, N) shape explicit before the weighted sum.
act_resh = layers.Reshape((story_maxlen*story_maxsents, query_maxlen))(act)
# Attention output: for each story position, the attention-weighted sum of
# the encoded question vectors, giving shape (M, d).
attn_out = layers.Dot(axes=[2, 1])([act_resh, q_Encode])
# Blend each story state with its attended question summary: shape (M, 2*d).
blended = layers.Concatenate(axis=2)([s_Encode, attn_out])
flat2 = Flatten()(blended)

# TODO: Finish Logit + fully connected layer for RELU
relu = layers.Dense(emb_dim, activation="relu")(flat2)

# TODO: Add vanilla softmax at output (no weight vector here, i.e. no Dense)
dense2 = layers.Dense(vocab_size, activation="softmax",
                      kernel_regularizer=regularizers.l2(LAMBDA))(relu)

model = Model(inputs=[story_Input, question_Input], outputs=dense2)
# summary() prints the table itself and returns None.
model.summary()

model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
K.set_value(model.optimizer.lr, 1e-2)
print(answers_train.shape)
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(inps, answers_train, epochs=100, batch_size=32,
                 validation_data=(val_inps, answers_test))

Build model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
story_Input (InputLayer)        (None, 88, 8)        0                                            
__________________________________________________________________________________________________
embedding_66 (Embedding)        (None, 88, 8, 50)    6200        story_Input[0][0]                
__________________________________________________________________________________________________
question_Input (InputLayer)     (None, 5)            0                                            
__________________________________________________________________________________________________
reshape_71 (Reshape)            (None, 704, 50)      0           embedding_66[0][0]               
______________________________________________________________________________________________



Train on 1000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100

KeyboardInterrupt: 

In [38]:
print('Build model...')


emb_dim = 20


def emb_sent_bow(inp):
    """Embed a story as one bag-of-words vector per sentence.

    A new Embedding layer is created on every call and applied to each
    sentence; the token axis (axis 2) is then summed away, leaving
    ``emb_dim`` features per sentence.

    Note: a later cell of this notebook redefines ``emb_sent_bow`` with a
    different return value.
    """
    emb = layers.TimeDistributed(Embedding(vocab_size, emb_dim))(inp)
    return layers.Lambda(lambda x: K.sum(x, 2))(emb)


# Story memories used for matching against the question.
inp_story = layers.Input((story_maxsents, story_maxlen))
emb_story = emb_sent_bow(inp_story)

# Question as a single bag-of-words vector, shaped (1, emb_dim) for the dot.
inp_q = layers.Input((query_maxlen,))
emb_q = layers.Embedding(vocab_size, emb_dim)(inp_q)
emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
emb_q = layers.Reshape((1, emb_dim))(emb_q)

# Attention over sentences: softmax of the sentence/question dot products.
x = layers.Dot(axes=2)([emb_story, emb_q])
x = layers.Reshape((story_maxsents,))(x)
x = layers.Activation('softmax')(x)
match = layers.Reshape((story_maxsents, 1))(x)

# Second, separately embedded copy of the story; the response is its
# attention-weighted sum.
emb_c = emb_sent_bow(inp_story)
x = layers.Dot(axes=1)([match, emb_c])
response = layers.Reshape((emb_dim,))(x)
res = layers.Dense(vocab_size, activation='softmax')(response)

model = Model([inp_story, inp_q], res)
# summary() prints the table itself and returns None.
model.summary()
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
K.set_value(model.optimizer.lr, 1e-2)
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(inps, answers_train, epochs=100, batch_size=32,
                 validation_data=(val_inps, answers_test))

Build model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 88, 8)        0                                            
__________________________________________________________________________________________________
embedding_56 (Embedding)        (None, 5, 20)        2480        input_6[0][0]                    
__________________________________________________________________________________________________
time_distributed_8 (TimeDistrib (None, 88, 8, 20)    2480        input_5[0][0]                    
______________________________________________________________________________________________



Train on 10000 samples, validate on 1000 samples
Epoch 1/100

KeyboardInterrupt: 

In [6]:
emb_dim = 40


def emb_sent_bow(inp):
    """Embed a story as one bag-of-words vector per sentence.

    Creates a new L2-regularised Embedding layer wrapped in TimeDistributed,
    applies dropout to the token embeddings and sums over the token axis
    (axis 2).

    Returns:
        ``(emb, emb_op)``: the per-sentence embedding tensor and the
        TimeDistributed wrapper, so callers can reuse the wrapped Embedding.
    """
    emb_op = layers.TimeDistributed(
        Embedding(vocab_size, emb_dim, embeddings_regularizer=regularizers.l2(0.002)))
    emb = emb_op(inp)
    emb = layers.Dropout(0.1)(emb)
    emb = layers.Lambda(lambda x: K.sum(x, 2))(emb)
    return emb, emb_op


inp_story = layers.Input((story_maxsents, story_maxlen))
inp_q = layers.Input((query_maxlen,))
emb_story, emb_story_op = emb_sent_bow(inp_story)
# The question goes through the same Embedding layer as the first story
# embedding (shared weights), then is summed over its tokens.
emb_q = emb_story_op.layer(inp_q)
emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
# One Dense layer shared by every hop; its L2 factor is currently zero.
h = layers.Dense(emb_dim, kernel_regularizer=regularizers.l2(0.0000))


def one_hop(u, A):
    """Run one memory hop.

    Args:
        u: current query state tensor with ``emb_dim`` features.
        A: per-sentence story embeddings used to score the sentences.

    Returns:
        ``(x, C)``: the new state and the freshly created story embedding
        ``C`` that produced the response; callers pass ``C`` as ``A`` of the
        next hop.
    """
    # Each hop gets a new output embedding of the story.
    C, _ = emb_sent_bow(inp_story)
    x = layers.Reshape((1, emb_dim))(u)
    x = layers.Dot(axes=2)([A, x])
    x = layers.Reshape((story_maxsents,))(x)
    x = layers.Activation('softmax')(x)
    match = layers.Reshape((story_maxsents, 1))(x)

    # Attention-weighted sum of the output embeddings.
    x = layers.Dot(axes=1)([match, C])
    x = layers.Reshape((emb_dim,))(x)
    x = h(x)
    # NOTE(review): the residual adds the original question embedding emb_q,
    # not the incoming state u - confirm this is intended.
    x = layers.Add()([x, emb_q])
    return x, C


# Four hops; each hop's output embedding becomes the next matching embedding.
response, emb_story = one_hop(emb_q, emb_story)
response, emb_story = one_hop(response, emb_story)
response, emb_story = one_hop(response, emb_story)
response, emb_story = one_hop(response, emb_story)
res = layers.Dense(vocab_size, activation='softmax')(response)
answer = Model([inp_story, inp_q], res)
answer.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])
K.set_value(answer.optimizer.lr, 5e-3)
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = answer.fit(inps, answers_train, epochs=100, batch_size=32,
                  validation_data=(val_inps, answers_test))



Train on 10000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100

KeyboardInterrupt: 

In [90]:
n_hidden = 64
max_num_memories = story_maxsents
max_memory_len = story_maxlen
max_ques_len = query_maxlen

mem_input = layers.Input(shape=(max_num_memories, max_memory_len))
query_input = layers.Input(shape=(max_ques_len,))

# ---- Hop 1 ----------------------------------------------------------------
# Input memories: embed each sentence and sum over its tokens.
A1 = Embedding(vocab_size, output_dim=n_hidden)
m_i = layers.TimeDistributed(A1)(mem_input)
m_i = layers.Lambda(lambda x: K.sum(x, 2))(m_i)
B = A1 #as specified
# Question state u1 uses the same embedding as the input memories.
u1 = B(query_input)
u1 = (layers.Lambda(lambda x: K.sum(x, 1)))(u1)
u1 = layers.Reshape((1, n_hidden))(u1)
# Output memories use a separate embedding.
C1 = layers.Embedding(vocab_size, output_dim=n_hidden)
c_i_1 = layers.TimeDistributed(C1)(mem_input)
c_i_1 = layers.Lambda(lambda x: K.sum(x, 2))(c_i_1)
# Attention over memories, then the weighted sum of the output memories.
p1 = layers.dot([m_i, u1], axes=2)
p1 = layers.Reshape((max_num_memories,))(p1)
p1 = layers.Activation(activation='softmax')(p1)
p1 = layers.Reshape((max_num_memories, 1))(p1)
o1 = layers.dot([c_i_1, p1], axes=1)
o1 = layers.Reshape(target_shape=(n_hidden,))(o1)
u1 = layers.Reshape((n_hidden,))(u1)
u2 = layers.add([o1, u1])

# ---- Hop 2 ----------------------------------------------------------------
A2 = C1 #A(k + 1) = C(k)
m_i = layers.TimeDistributed(A2)(mem_input)
m_i = layers.Lambda(lambda x: K.sum(x, 2))(m_i)
C2 = layers.Embedding(vocab_size, output_dim=n_hidden)
c_i_2 = layers.TimeDistributed(C2)(mem_input)
c_i_2 = layers.Lambda(lambda x: K.sum(x, 2))(c_i_2)
u2 = layers.Reshape((1, n_hidden))(u2)
p2 = layers.dot([m_i, u2], axes=2)
p2 = layers.Reshape((max_num_memories,))(p2)
p2 = layers.Activation(activation='softmax')(p2)
p2 = layers.Reshape((max_num_memories, 1))(p2)
o2 = layers.dot([c_i_2, p2], axes=1)
o2 = layers.Reshape(target_shape=(n_hidden,))(o2)
#This is a hack, I was not able to get good results with u3 = u2 + o2
# so the first-hop state u1 (already flattened to (n_hidden,) above) is
# added to the second-hop output instead of u2.
u3 = layers.add([o2, u1])
answer = layers.Dense(vocab_size, activation='softmax')(u3)
babi2 = Model([mem_input, query_input], answer)
babi2.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
K.set_value(babi2.optimizer.lr, 5e-3)
babi2.fit(inps, answers_train, batch_size=32, epochs=100,
          validation_data=(val_inps, answers_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
 2112/10000 [=====>........................] - ETA: 24s - loss: 0.9584 - acc: 0.7486

KeyboardInterrupt: 