In [1]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np
import os as os
import json

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers, callbacks, models
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten
from datetime import datetime
from keras_self_attention import SeqSelfAttention

from keras import regularizers

import keras.backend as K

import collections

Using TensorFlow backend.


In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
RNN = recurrent.LSTM                 # recurrent layer class reported in the banner below
EMBED_HIDDEN_SIZE = 100
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 128
EPOCHS = 40
MODEL_NAME = "LSTM_SelfAttention"
LAMBDA = 0.02                        # regularization parameter

print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(
    RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE))

# `path` is whatever get_file returns for the bAbI archive; it is later handed
# to tarfile.open() when the stories are loaded.
path = get_file(
    'babi-tasks-v1-2.tar.gz',
    origin='https://s3.amazonaws.com/text-datasets/'
           'babi_tasks_1-20_v1-2.tar.gz',
)



RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 100, 100, 100


In [3]:
# Archive member templates for the 20 bAbI tasks (10k-example English set).
# Position k holds task k+1; the `{}` placeholder is filled with 'train' or
# 'test' via str.format.
challenge = [
    'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa3_three-supporting-facts_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa4_two-arg-relations_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa5_three-arg-relations_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa6_yes-no-questions_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa7_counting_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa8_lists-sets_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa9_simple-negation_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa10_indefinite-knowledge_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa11_basic-coreference_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa12_conjunction-fact_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa13_compound-coreference_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa14_time-reasoning_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa15_basic-deduction_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa16_basic-induction_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa17_positional-reasoning_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa18_size-reasoning_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa19_path-finding_{}.txt',
    'tasks_1-20_v1-2/en-10k/qa20_agents-motivations_{}.txt',
]


def extract_stories(text):
    """Parse a bAbI-style task file into (passage, question, answer) triples.

    Each non-blank line is expected to look like ``"<n> <content>"``. A line
    number of 1 starts a new story. A content part containing a tab is a
    question line (``question \\t answer [\\t supporting ids]``); any other
    line is a statement that is added to the running story.

    Args:
        text: A file-like object exposing ``readlines()`` (e.g. the result of
            ``tarfile.extractfile``) or any iterable of lines. Lines may be
            ``bytes`` (decoded as UTF-8) or ``str``.

    Returns:
        list of ``(passage, question, answer)`` tuples, one per question line:
        ``passage`` is a list of token lists, each prefixed with ``"<i>:"``
        where ``i`` is the statement's position in the story so far;
        ``question`` is a token list; ``answer`` is the raw answer string.

    Raises:
        ValueError: if the leading line number is not an integer.
    """
    tokenize = re.compile(r"[\w']+|[.,!?]").findall
    lines = text.readlines() if hasattr(text, 'readlines') else text

    story_out = []
    new_story = []
    for raw in lines:
        line = raw.decode('utf-8') if isinstance(raw, bytes) else raw
        line = line.strip()
        if not line:
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) carry no content and no line number.
            continue
        number, _, line = line.partition(' ')
        if int(number) == 1:
            new_story = []
        if '\t' in line:
            # Only the first two tab-separated fields are needed; the
            # supporting-fact ids (if present) are ignored.
            fields = line.split('\t')
            question = tokenize(fields[0])
            answer = fields[1]
            passage = [[str(i) + ":"] + sentence
                       for i, sentence in enumerate(new_story) if sentence]
            story_out.append((passage, question, answer))
        else:
            new_story.append(tokenize(line))
    return story_out
    
# Read the train and test splits of the first entry in `challenge`
# (the qa1 single-supporting-fact templates) straight out of the archive.
with tarfile.open(path) as tar:
    train_stories = extract_stories(tar.extractfile(challenge[0].format('train')))
    test_stories = extract_stories(tar.extractfile(challenge[0].format('test')))
# Shuffle both lists (return values are ignored, so the lists themselves are reordered).
# NOTE(review): no random seed is set in the visible cells, so the order
# presumably differs between kernel runs - confirm whether that is intended.
np.random.shuffle(train_stories)
np.random.shuffle(test_stories)

# Combined corpus, used below to derive padding sizes and the vocabulary.
stories = train_stories + test_stories

In [4]:
# Padding dimensions, taken over the combined train + test corpus:
#   story_maxlen   - tokens in the longest passage sentence (index prefix included)
#   story_maxsents - sentences in the longest passage
#   query_maxlen   - tokens in the longest question
story_maxlen = max(len(sentence)
                   for sentences, _, _ in stories
                   for sentence in sentences)
story_maxsents = max(len(sentences) for sentences, _, _ in stories)
query_maxlen = max(len(query) for _, query, _ in stories)


def flatten(text):
    """Yield the leaf items of an arbitrarily nested iterable, depth-first.

    Strings and bytes are treated as leaves (they are yielded whole rather
    than being split into characters); every other iterable is descended
    into recursively.

    Args:
        text: An iterable whose items may themselves be iterables.

    Yields:
        Each non-iterable item, or str/bytes item, in left-to-right order.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported here so the function does not depend on
    # that submodule having been loaded elsewhere.
    from collections.abc import Iterable

    for i in text:
        if isinstance(i, Iterable) and not isinstance(i, (str, bytes)):
            yield from flatten(i)
        else:
            yield i

# Vocabulary: every distinct token in the corpus (passages, questions and
# answers), sorted, with '<PAD>' placed at index 0.
vocab = ['<PAD>'] + sorted(set(flatten(stories)))
vocab_size = len(vocab)
print(vocab_size, story_maxlen, story_maxsents, query_maxlen, sep='\n')

# Token -> integer index lookup.
word_idx = {token: position for position, token in enumerate(vocab)}

def sort_corpus(data, word_idx):
    """Translate tokenised stories into integer indices.

    Args:
        data: Iterable of records whose items 0, 1 and 2 are the passage
            (list of token lists), the question (token list) and the answer
            (a single token).
        word_idx: Mapping from token to integer index.

    Returns:
        Tuple ``(passages, questions, answers)`` of parallel lists: each
        passage is a list of index lists, each question an index list, and
        each answer a one-element index list.

    Raises:
        KeyError: if a token is missing from ``word_idx``.
    """
    def encode(tokens):
        # Look up every token of one sentence or question.
        return [word_idx[token] for token in tokens]

    encoded_passages, encoded_questions, encoded_answers = [], [], []
    for record in data:
        sentences, query, reply = record[0], record[1], record[2]
        encoded_passages.append([encode(sentence) for sentence in sentences])
        encoded_questions.append(encode(query))
        encoded_answers.append([word_idx[reply]])

    return (encoded_passages, encoded_questions, encoded_answers)


def vectorize_stories(data, word_idx):
    """Turn tokenised stories into padded integer arrays for the model.

    The same steps were previously written out twice (once for the training
    split, once for the test split); this helper keeps them in one place.

    Args:
        data: List of (passage, question, answer) triples as produced by
            ``extract_stories``.
        word_idx: Mapping from token to integer index.

    Returns:
        Tuple ``(stories, queries, answers)``:
        ``stories`` has shape ``(len(data), story_maxsents, story_maxlen)``;
        each sentence is padded by ``pad_sequences`` and passages with fewer
        than ``story_maxsents`` sentences are followed by all-zero rows;
        ``queries`` is ``pad_sequences(questions, maxlen=query_maxlen)``;
        ``answers`` is ``np.array`` of the one-element answer index lists.
    """
    passages, questions, answers = sort_corpus(data, word_idx)

    # Start from zeros so missing sentences are already padding rows.
    story_grid = np.zeros((len(passages), story_maxsents, story_maxlen), 'int')
    for row, sentences in enumerate(passages):
        if sentences:
            story_grid[row, :len(sentences)] = pad_sequences(sentences, maxlen=story_maxlen)

    query_grid = pad_sequences(questions, maxlen=query_maxlen)
    return story_grid, query_grid, np.array(answers)


inputs_train, queries_train, answers_train = vectorize_stories(train_stories, word_idx)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories, word_idx)

# Model inputs are [stories, queries] pairs.
inps = [inputs_train, queries_train]
val_inps = [inputs_test, queries_test]


32
8
10
4


In [5]:
emb_dim = 40


def emb_sent_bow(inp):
    """Embed a batch of stories and sum the token embeddings of each sentence.

    A fresh Embedding (wrapped in TimeDistributed) is created on every call,
    applied to ``inp``, passed through Dropout(0.1), and summed over axis 2.

    Args:
        inp: Keras tensor of word indices; presumably
            (batch, story_maxsents, story_maxlen) - verify against caller.

    Returns:
        Tuple ``(sentence_vectors, embedder)``: the summed embedding tensor
        and the TimeDistributed wrapper that produced it (its ``.layer`` is
        the underlying Embedding).
    """
    sentence_embedder = layers.TimeDistributed(
        Embedding(input_dim=vocab_size,
                  output_dim=emb_dim,
                  embeddings_regularizer=regularizers.l2(0.002)))
    token_vectors = sentence_embedder(inp)
    token_vectors = layers.Dropout(0.1)(token_vectors)
    sentence_vectors = layers.Lambda(lambda x: K.sum(x, axis=2))(token_vectors)
    return sentence_vectors, sentence_embedder
# Symbolic inputs: a story is a (sentences x tokens) grid of word indices,
# a query is a single row of word indices.
inp_story = layers.Input(shape=(story_maxsents, story_maxlen))
inp_q = layers.Input(shape=(query_maxlen,))

# Story memory, plus the embedding wrapper so the question can reuse the
# very same Embedding layer.
emb_story, emb_story_op = emb_sent_bow(inp_story)
emb_q = emb_story_op.layer(inp_q)
emb_q = layers.Lambda(lambda x: K.sum(x, axis=1))(emb_q)

# Single Dense layer instance shared by every hop (regularization strength 0).
h = layers.Dense(units=emb_dim, kernel_regularizer=regularizers.l2(0.0))
def one_hop(u, A):
    """Perform one attention hop over the story memory.

    The query ``u`` is dotted with every row of ``A``, the scores are
    soft-maxed into weights over the sentences, and those weights are used
    to take a weighted sum of a newly created sentence embedding of
    ``inp_story``. The result goes through the shared Dense layer ``h`` and
    is added to ``emb_q``.

    Args:
        u: Query tensor; reshaped here to (1, emb_dim) per sample.
        A: Memory tensor matched against ``u``; the scores are reshaped to
            (story_maxsents,) per sample.

    Returns:
        Tuple ``(response, out_memory)``: the hop output and the embedding
        created in this hop.
    """
    out_memory, _ = emb_sent_bow(inp_story)

    # Attention scores: one dot product per sentence, normalised by softmax.
    query_row = layers.Reshape((1, emb_dim))(u)
    scores = layers.Dot(axes=2)([A, query_row])
    scores = layers.Reshape((story_maxsents,))(scores)
    weights = layers.Activation('softmax')(scores)
    match = layers.Reshape((story_maxsents, 1))(weights)

    # Weighted sum of the output memory, projected by the shared Dense layer.
    attended = layers.Dot(axes=1)([match, out_memory])
    attended = layers.Reshape((emb_dim,))(attended)
    projected = h(attended)

    # NOTE(review): the residual term is the original question embedding
    # `emb_q`, not the hop input `u` - confirm this is intended.
    return layers.Add()([projected, emb_q]), out_memory
# Stack the attention hops. The first hop is driven by the question embedding;
# every later hop takes the previous response as its query and the previous
# hop's output embedding as the memory to match against.
NUM_HOPS = 4
response = emb_q
for hop_index in range(NUM_HOPS):
    response, emb_story = one_hop(response, emb_story)

# Classify the final response over the whole vocabulary.
res = layers.Dense(vocab_size, activation='softmax')(response)
answer = Model([inp_story, inp_q], res)
answer.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])
# Override the optimizer's learning rate after compilation.
K.set_value(answer.optimizer.lr, 5e-3)
# `epochs` is the Keras 2 name of this argument; `nb_epoch` is the Keras 1
# spelling and is out of step with the Keras 2 API used in the rest of the file.
hist = answer.fit(inps, answers_train, epochs=100, batch_size=32,
                  validation_data=(val_inps, answers_test))



Train on 10000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 