In [1]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np
import os as os
import json

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers, callbacks, models
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten
from datetime import datetime
from keras_self_attention import SeqSelfAttention

from keras import regularizers

import keras.backend as K

import collections

Using TensorFlow backend.


In [2]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]


def parse_stories(lines):
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        if int(nid) == 1: story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            substory = [[str(i)+":"]+x for i,x in enumerate(story) if x]
            data.append((substory, q, a))
            story.append('')
        else: story.append(tokenize(line))
    return data


def get_stories(f):
    data = parse_stories(f.readlines())
    return [(story, q, answer) for story, q, answer in data]


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []; Xq = []; Y = []
    for story, query, answer in data:
        x = [[word_idx[w] for w in s] for s in story]
        xq = [word_idx[w] for w in query]
        y = [word_idx[answer]]
        X.append(x); Xq.append(xq); Y.append(y)
    return ([pad_sequences(x, maxlen=story_maxlen) for x in X],
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))

def do_flatten(el): 
    return isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes))
def flatten(l):
    for el in l:
        if do_flatten(el): yield from flatten(el)
        else: yield el
            
def stack_inputs(inputs):
    for i,it in enumerate(inputs):
        inputs[i] = np.concatenate([it, 
                           np.zeros((story_maxsents-it.shape[0],story_maxlen), 'int')])
    return np.stack(inputs)



In [3]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 128
EPOCHS = 40
MODEL_NAME = "LSTM_SelfAttention"
# Regularization parameter
LAMBDA = 0.02
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

try:
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/'
                           'babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2'
          '.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise




RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


In [4]:
# Default QA1 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
with tarfile.open(path) as tar:
    train_stories = get_stories(tar.extractfile(challenge.format('train')))
    test_stories = get_stories(tar.extractfile(challenge.format('test')))
np.random.shuffle(train_stories)
np.random.shuffle(test_stories)

stories = train_stories + test_stories

story_maxlen = max((len(s) for x, _, _ in stories for s in x))
story_maxsents = max((len(x) for x, _, _ in stories))
query_maxlen = max(len(x) for _, x, _ in stories)

vocab = sorted(set(flatten(stories)))
vocab.insert(0, '<PAD>')
vocab_size = len(vocab)

word_idx = dict((c, i) for i, c in enumerate(vocab))

inputs_train, queries_train, answers_train = vectorize_stories(train_stories, 
     word_idx, story_maxlen, query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories, 
     word_idx, story_maxlen, query_maxlen)

inputs_train = stack_inputs(inputs_train)
inputs_test = stack_inputs(inputs_test)

inps = [inputs_train, queries_train]
val_inps = [inputs_test, queries_test]


  return _compile(pattern, flags).split(string, maxsplit)


In [5]:
print('Build model...')


emb_dim = 20
def emb_sent_bow(inp):
    emb = layers.TimeDistributed(Embedding(vocab_size, emb_dim))(inp)
    return layers.Lambda(lambda x: K.sum(x, 2))(emb)
inp_story = layers.Input((story_maxsents, story_maxlen))
emb_story = emb_sent_bow(inp_story)
inp_story.shape, emb_story.shape
inp_q = layers.Input((query_maxlen,))
emb_q = layers.Embedding(vocab_size, emb_dim)(inp_q)
emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
emb_q = layers.Reshape((1, emb_dim))(emb_q)
inp_q.shape, emb_q.shape
x = layers.Dot(axes=2)([emb_story, emb_q])
x = layers.Reshape((story_maxsents,))(x)
x = layers.Activation('softmax')(x)
match = layers.Reshape((story_maxsents,1))(x)
match.shape
emb_c = emb_sent_bow(inp_story)
x = layers.Dot(axes=1)([match, emb_c])
response = layers.Reshape((emb_dim,))(x)
res = layers.Dense(vocab_size, activation='softmax')(response)
model = Model([inp_story, inp_q], res)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])
K.set_value(model.optimizer.lr, 1e-2)
hist=model.fit(inps, answers_train, nb_epoch=10, batch_size=32,
           validation_data=(val_inps, answers_test))

print(model.summary())


loss, acc = model.evaluate([inputs_test, queries_test], answers_test,
                           batch_size=BATCH_SIZE)

print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Build model...




Train on 10000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [6]:
emb_dim = 30
def emb_sent_bow(inp):
    emb_op = layers.TimeDistributed(Embedding(vocab_size, emb_dim, embeddings_regularizer=regularizers.l2(0.002)))
    emb = emb_op(inp)
    emb = layers.Dropout(0.1)(emb)
    emb = layers.Lambda(lambda x: K.sum(x, 2))(emb)
#     return Elemwise(0, False)(emb), emb_op
    return emb, emb_op
inp_story = layers.Input((story_maxsents, story_maxlen))
inp_q = layers.Input((query_maxlen,))
emb_story, emb_story_op = emb_sent_bow(inp_story)
emb_q = emb_story_op.layer(inp_q)
emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
h = layers.Dense(emb_dim, kernel_regularizer=regularizers.l2(0.0000))
def one_hop(u, A):
    C, _ = emb_sent_bow(inp_story)
    x = layers.Reshape((1, emb_dim))(u)
    x = layers.Dot(axes=2)([A, x])
    x = layers.Reshape((story_maxsents,))(x)
    x = layers.Activation('softmax')(x)
    match = layers.Reshape((story_maxsents,1))(x)

    x = layers.Dot(axes=1)([match, C])
    x = layers.Reshape((emb_dim,))(x)
    x = h(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Add()([x, emb_q])
    return x, C
response, emb_story = one_hop(emb_q, emb_story)
response, emb_story = one_hop(response, emb_story)
# response, emb_story = one_hop(response, emb_story)
res = layers.Dense(vocab_size, activation='softmax')(response)
answer = Model([inp_story, inp_q], res)
answer.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])
K.set_value(answer.optimizer.lr, 5e-3)
hist=answer.fit(inps, answers_train, nb_epoch=100, batch_size=32,
           validation_data=(val_inps, answers_test))



Train on 10000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

KeyboardInterrupt: 

In [8]:
emb_dim = 30
reg1 = 0.002
reg2 = 0.0000
dropout1 = 0.3

regv1 = [0.0000, 0.002, 0.007, 0.01, 0.015, 0.02, 0.025, 0.03, 0.1, 0.5, 0.8]
regv2 = [0.0000, 0.002, 0.007, 0.01, 0.015, 0.02, 0.025, 0.03, 0.1, 0.5, 0.8]
dropv = [0.01, 0.002, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.9, 0.001, 0.005]



def emb_sent_bow(inp, reg):
        emb_op = layers.TimeDistributed(Embedding(vocab_size, emb_dim, embeddings_regularizer=regularizers.l2(reg)))
        emb = emb_op(inp)
        emb = layers.Dropout(0.1)(emb)
        emb = layers.Lambda(lambda x: K.sum(x, 2))(emb)
        #     return Elemwise(0, False)(emb), emb_op
        return emb, emb_op

def one_hop(u, A, reg, dropout):
    C, _ = emb_sent_bow(inp_story, reg)
    x = layers.Reshape((1, emb_dim))(u)
    x = layers.Dot(axes=2)([A, x])
    x = layers.Reshape((story_maxsents,))(x)
    x = layers.Activation('softmax')(x)
    match = layers.Reshape((story_maxsents,1))(x)

    x = layers.Dot(axes=1)([match, C])
    x = layers.Reshape((emb_dim,))(x)
    x = h(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Add()([x, emb_q])
    return x, C
    
    

#hist=answer.fit(inps, answers_train, nb_epoch=1, batch_size=32, validation_split=0.05)


def build_model(reg1,reg2,dropout1):
    #K.clear_session()
    inp_story = layers.Input((story_maxsents, story_maxlen))
    inp_q = layers.Input((query_maxlen,))
    emb_story, emb_story_op = emb_sent_bow(inp_story, reg1)
    emb_q = emb_story_op.layer(inp_q)
    emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
    h = layers.Dense(emb_dim, kernel_regularizer=regularizers.l2(reg2))
    response, emb_story = one_hop(emb_q, emb_story, reg1, dropout1)
    response, emb_story = one_hop(response, emb_story, reg1, dropout1)

    
lossv = []
accv = []
#answer = {}
    
for i in range(len(regv1)):
    for j in range(len(regv2)):
        for k in range(len(dropv)):
            inp_story = layers.Input((story_maxsents, story_maxlen))
            inp_q = layers.Input((query_maxlen,))
            emb_story, emb_story_op = emb_sent_bow(inp_story, reg1)
            emb_q = emb_story_op.layer(inp_q)
            emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
            h = layers.Dense(emb_dim, kernel_regularizer=regularizers.l2(0.0000))

            response, emb_story = one_hop(emb_q, emb_story, reg1, dropout1)
            response, emb_story = one_hop(response, emb_story, reg1, dropout1)
            # response, emb_story = one_hop(response, emb_story)
            res = layers.Dense(vocab_size, activation='softmax')(response)
            answer = Model([inp_story, inp_q], res)
            answer.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])
            K.set_value(answer.optimizer.lr, 5e-3)

            build_model(regv1[i],regv2[i],dropv[i])

            res = layers.Dense(vocab_size, activation='softmax')(response)
            answer = Model([inp_story, inp_q], res)
            answer.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])
            K.set_value(answer.optimizer.lr, 5e-3)
            hist=answer.fit(inps, answers_train, nb_epoch=2, batch_size=32,
                       validation_split=0.05)

            loss, acc = answer.evaluate([inputs_test, queries_test], answers_test,
                               batch_size=BATCH_SIZE)

            #lossv.append(loss)
            #accv.append(acc)


            print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))
            
            prstr = 'Reg1 = ' + str(regv1[i]) + ' Reg2 = ' + str(regv2[j]) + ' Dropout = ' + str(dropv[k])
            print(prstr)
            #print('Reg1 = {:.4f}, Reg2= {:.4f}, Dropout = {:,4f}' .format(regv1[i],regv2[j],dropv[k]))


            K.clear_session()


#print('loss vector = {:.4f}'.format(lossv))
#print('loss vector = {:.4f}'.format(acc))



Train on 9500 samples, validate on 500 samples
Epoch 1/2
Epoch 2/2
Test loss / test accuracy = 1.8107 / 0.1600
Reg1 = 0.0 Reg2 = 0.0 Dropout = 0.01
Train on 9500 samples, validate on 500 samples
Epoch 1/2
1408/9500 [===>..........................] - ETA: 29s - loss: 2.3879 - acc: 0.1612

KeyboardInterrupt: 