In [1]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np
import os as os
import json

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers, callbacks, models
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten
from datetime import datetime
from keras_self_attention import SeqSelfAttention

from keras import regularizers

import keras.backend as K

import collections

Using TensorFlow backend.


In [2]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]


def parse_stories(lines):
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        if int(nid) == 1: story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            substory = [[str(i)+":"]+x for i,x in enumerate(story) if x]
            data.append((substory, q, a))
            story.append('')
        else: story.append(tokenize(line))
    return data


def get_stories(f):
    data = parse_stories(f.readlines())
    return [(story, q, answer) for story, q, answer in data]


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []; Xq = []; Y = []
    for story, query, answer in data:
        x = [[word_idx[w] for w in s] for s in story]
        xq = [word_idx[w] for w in query]
        y = [word_idx[answer]]
        X.append(x); Xq.append(xq); Y.append(y)
    return ([pad_sequences(x, maxlen=story_maxlen) for x in X],
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))

def do_flatten(el): 
    return isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes))
def flatten(l):
    for el in l:
        if do_flatten(el): yield from flatten(el)
        else: yield el
            
def stack_inputs(inputs):
    for i,it in enumerate(inputs):
        inputs[i] = np.concatenate([it, 
                           np.zeros((story_maxsents-it.shape[0],story_maxlen), 'int')])
    return np.stack(inputs)



In [3]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 128
EPOCHS = 40
MODEL_NAME = "LSTM_SelfAttention"
# Regularization parameter
LAMBDA = 0.02
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

try:
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/'
                           'babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2'
          '.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise




RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


In [4]:
challenge = []
challenge.append('tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa3_three-supporting-facts_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa4_two-arg-relations_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa5_three-arg-relations_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa6_yes-no-questions_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa7_counting_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa8_lists-sets_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa9_simple-negation_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa10_indefinite-knowledge_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa11_basic-coreference_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa12_conjunction-fact_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa13_compound-coreference_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa14_time-reasoning_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa15_basic-deduction_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa16_basic-induction_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa17_positional-reasoning_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa18_size-reasoning_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa19_path-finding_{}.txt')
challenge.append('tasks_1-20_v1-2/en-10k/qa20_agents-motivations_{}.txt')
# Default QA1 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
#challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
#challenge = 'tasks_1-20_v1-2/en-10k/qa3_three-supporting-facts_{}.txt'


#with tarfile.open(path) as tar:
    #train_stories = get_stories(tar.extractfile(challenge[1].format('train')))
    #test_stories = get_stories(tar.extractfile(challenge[1].format('test')))
#np.random.shuffle(train_stories)
#np.random.shuffle(test_stories)

#stories = train_stories + test_stories

#story_maxlen = max((len(s) for x, _, _ in stories for s in x))
#story_maxsents = max((len(x) for x, _, _ in stories))
#query_maxlen = max(len(x) for _, x, _ in stories)

#vocab = sorted(set(flatten(stories)))
#vocab.insert(0, '<PAD>')
#vocab_size = len(vocab)

#word_idx = dict((c, i) for i, c in enumerate(vocab))

#inputs_train, queries_train, answers_train = vectorize_stories(train_stories, 
     #word_idx, story_maxlen, query_maxlen)
#inputs_test, queries_test, answers_test = vectorize_stories(test_stories, 
     #word_idx, story_maxlen, query_maxlen)

#inputs_train = stack_inputs(inputs_train)
#inputs_test = stack_inputs(inputs_test)

#inps = [inputs_train, queries_train]
#val_inps = [inputs_test, queries_test]


In [5]:
emb_dim = 30
reg1 = 0.002
reg2 = 0.0000
dropout1 = 0.3


#regv1 = [0.01]
#regv2 = [0.01]
#dropv = [0.01]

regv1 = [0.000, 0.01, 0.05, 0.1]
regv2 = [0.000, 0.01, 0.05, 0.1]
dropv = [0.000, 0.05, 0.1, 0.4]
learn_rate = [5e-3]


def emb_sent_bow(inp, reg):
        emb_op = layers.TimeDistributed(Embedding(vocab_size, emb_dim, embeddings_regularizer=regularizers.l2(reg)))
        emb = emb_op(inp)
        emb = layers.Dropout(0.1)(emb)
        emb = layers.Lambda(lambda x: K.sum(x, 2))(emb)
        #     return Elemwise(0, False)(emb), emb_op
        return emb, emb_op

def one_hop(u, A, reg, dropout):
    C, _ = emb_sent_bow(inp_story, reg)
    x = layers.Reshape((1, emb_dim))(u)
    x = layers.Dot(axes=2)([A, x])
    x = layers.Reshape((story_maxsents,))(x)
    x = layers.Activation('softmax')(x)
    match = layers.Reshape((story_maxsents,1))(x)

    x = layers.Dot(axes=1)([match, C])
    x = layers.Reshape((emb_dim,))(x)
    x = h(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Add()([x, emb_q])
    return x, C
    
    

#hist=answer.fit(inps, answers_train, nb_epoch=1, batch_size=32, validation_split=0.05)


def build_model(reg1,reg2,dropout1):
    
    
    
    #K.clear_session()
    inp_story = layers.Input((story_maxsents, story_maxlen))
    inp_q = layers.Input((query_maxlen,))
    emb_story, emb_story_op = emb_sent_bow(inp_story, reg1)
    emb_q = emb_story_op.layer(inp_q)
    emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
    h = layers.Dense(emb_dim, kernel_regularizer=regularizers.l2(reg2))
    response, emb_story = one_hop(emb_q, emb_story, reg1, dropout1)
    response, emb_story = one_hop(response, emb_story, reg1, dropout1)

    



#answer = {}
    
f = open("hp_results.txt", "w+")

#j = 0
#k = 0

for c in range(len(challenge)):

    with tarfile.open(path) as tar:
        train_stories = get_stories(tar.extractfile(challenge[c].format('train')))
        test_stories = get_stories(tar.extractfile(challenge[c].format('test')))
    np.random.shuffle(train_stories)
    np.random.shuffle(test_stories)

    stories = train_stories + test_stories

    story_maxlen = max((len(s) for x, _, _ in stories for s in x))
    story_maxsents = max((len(x) for x, _, _ in stories))
    query_maxlen = max(len(x) for _, x, _ in stories)

    vocab = sorted(set(flatten(stories)))
    vocab.insert(0, '<PAD>')
    vocab_size = len(vocab)

    word_idx = dict((c, i) for i, c in enumerate(vocab))

    inputs_train, queries_train, answers_train = vectorize_stories(train_stories, 
         word_idx, story_maxlen, query_maxlen)
    inputs_test, queries_test, answers_test = vectorize_stories(test_stories, 
         word_idx, story_maxlen, query_maxlen)

    inputs_train = stack_inputs(inputs_train)
    inputs_test = stack_inputs(inputs_test)

    inps = [inputs_train, queries_train]
    val_inps = [inputs_test, queries_test]

    best_acc = 0
    best_loss = 0
    best_reg1 = 0
    best_reg2 = 0
    best_drop = 0
    best_lr = 0

    for i in range(len(regv1)):
        #for j in range(len(regv2)):
        for k in range(len(dropv)):
            for l in range(len(learn_rate)):
                inp_story = layers.Input((story_maxsents, story_maxlen))
                inp_q = layers.Input((query_maxlen,))
                emb_story, emb_story_op = emb_sent_bow(inp_story, regv1[i])
                emb_q = emb_story_op.layer(inp_q)
                emb_q = layers.Lambda(lambda x: K.sum(x, 1))(emb_q)
                h = layers.Dense(emb_dim, kernel_regularizer=regularizers.l2(regv2[i]))

                response, emb_story = one_hop(emb_q, emb_story, reg1, dropout1)
                response, emb_story = one_hop(response, emb_story, reg1, dropout1)
                # response, emb_story = one_hop(response, emb_story)
                res = layers.Dense(vocab_size, activation='softmax')(response)
                answer = Model([inp_story, inp_q], res)
                answer.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                               metrics=['accuracy'])
                K.set_value(answer.optimizer.lr, learn_rate[l])

                build_model(regv1[i],regv2[i],dropv[i])

                res = layers.Dense(vocab_size, activation='softmax')(response)
                answer = Model([inp_story, inp_q], res)
                answer.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                               metrics=['accuracy'])
                K.set_value(answer.optimizer.lr, learn_rate[l])
                hist=answer.fit(inps, answers_train, nb_epoch=20, batch_size=32,
                           validation_split=0.05)

                loss, acc = answer.evaluate([inputs_test, queries_test], answers_test,
                                   batch_size=BATCH_SIZE)

                #loss.append(loss)
                #accv.append(acc)

                print(challenge[c])
                print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))
                prstr = 'Reg1 = ' + str(regv1[i]) + ' Reg2 = ' + str(regv2[i]) + ' Dropout = ' + str(dropv[k]) + ' Learn Rate = ' + str(learn_rate[l])
                print(prstr)



                if acc > best_acc:
                    best_acc = acc
                    best_loss = loss
                    best_reg1 = regv1[i]
                    best_reg2 = regv2[i] 
                    best_drop = dropv[k]
                    best_lr = learn_rate[l]

                    filestr = prstr + '\n' + 'Test loss = ' + str(round(loss,4)) + ' Test accuracy = ' + str(round(acc,4)) + '\n'
                    f.write(filestr)

                if acc == 1:
                    break

                #prstr = 'Best Acc = ' + str(best_acc) + ' Best Reg1 = ' + str(best_reg1) + ' Best Reg2 = ' + str(best_reg2) + ' Best Drop = ' + str(best_drop) + ' Best Learn Rate = ' + str(learn_rate[l])
                #print(prstr)

                K.clear_session()
            if acc == 1:
                break
        if acc == 1:
            break

    final_filestr = '\n\n\n---------------------------------------------- QA'+str(c+1)+' ----------------------------------------------\n'
    final_filestr = final_filestr + '-------------------------------------------- Results ----------------------------------------------\n\n'
    final_filestr = final_filestr + 'Best Acc = ' + str(round(best_acc,4)) + ' Best Loss = ' + str(round(best_loss,4)) + ' Best Reg1 = ' + str(best_reg1) + ' Best Reg2 = ' + str(best_reg2) + ' Best Drop = ' + str(best_drop) + ' Best Learn Rate = ' + str(learn_rate[l]) + '\n\n'
    final_filestr = final_filestr + '-------------------------------------------- Results ----------------------------------------------\n'
    final_filestr = final_filestr + '---------------------------------------------- QA'+str(c+1)+' ----------------------------------------------\n\n\n'


    f.write(final_filestr)

#beststr = 'Best Results:\n'
#beststr = beststr + 'Reg1 = ' + str(best_reg1) + ' Reg2 = ' + str(best_reg2) + ' Dropout = ' + str(best_drop) + ' Learn Rate = ' + str(best_lr) + '\n' + ' Test loss = ' + str(best_loss) + ' Test accuracy = ' + str(best_acc)
#f.write(beststr)

f.close()

  return _compile(pattern, flags).split(string, maxsplit)


Train on 9500 samples, validate on 500 samples
Epoch 1/20
1696/9500 [====>.........................] - ETA: 9s - loss: 2.1007 - acc: 0.1692 

KeyboardInterrupt: 