In [1]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np
import os as os
import json

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers, callbacks, models
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten
from datetime import datetime

from keras import regularizers

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

Using TensorFlow backend.


In [2]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Recurrent layer class reported in the banner printed below.
RNN = recurrent.LSTM

# Layer widths. EMBED_HIDDEN_SIZE also selects which GloVe file is read and
# sets the GRU width in the model cell.
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100

# Training schedule.
BATCH_SIZE = 128
EPOCHS = 40

# Prefix of the checkpoint, history and figure files written further down.
MODEL_NAME = "Baseline"

# Regularization parameter (strength handed to regularizers.l2 in the model).
LAMBDA = 0.01

print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(
    RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE))

# Local path of the bAbI archive as returned by keras' get_file.
path = get_file(
    'babi-tasks-v1-2.tar.gz',
    origin='https://s3.amazonaws.com/text-datasets/'
           'babi_tasks_1-20_v1-2.tar.gz',
)


RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


In [3]:
# Path templates, inside the bAbI tarball, of the 20 tasks in the en-10k
# split.  The list is ordered by task number, so challenge[k] is task k + 1.
# The trailing '{}' is filled in later with 'train' or 'test' via str.format.
_BABI_TASK_NAMES = [
    'single-supporting-fact',
    'two-supporting-facts',
    'three-supporting-facts',
    'two-arg-relations',
    'three-arg-relations',
    'yes-no-questions',
    'counting',
    'lists-sets',
    'simple-negation',
    'indefinite-knowledge',
    'basic-coreference',
    # The archive member for task 12 is "qa12_conjunction_*.txt"; the former
    # "conjunction-fact" spelling does not match any file in the tarball.
    'conjunction',
    'compound-coreference',
    'time-reasoning',
    'basic-deduction',
    'basic-induction',
    'positional-reasoning',
    'size-reasoning',
    'path-finding',
    'agents-motivations',
]

# '{{}}' survives the first format() call as a literal '{}' placeholder.
challenge = [
    'tasks_1-20_v1-2/en-10k/qa{}_{}_{{}}.txt'.format(task_number, task_name)
    for task_number, task_name in enumerate(_BABI_TASK_NAMES, start=1)
]


def extract_stories(text, max_length=None):
    """Parse a bAbI task file into (passage, question, answer) examples.

    Every non-blank line starts with a line number followed by a space.  A
    line numbered 1 starts a new story.  Lines containing TAB characters are
    questions of the form ``question<TAB>answer<TAB>supporting-ids``; all
    other lines are statements that are accumulated as the story so far.

    Args:
        text: Binary file-like object whose ``readlines()`` yields UTF-8
            encoded lines (e.g. the result of ``TarFile.extractfile``).
        max_length: Optional cap; when given, only examples whose flattened
            passage has fewer than ``max_length`` tokens are kept.  ``None``
            (the default) keeps every example.

    Returns:
        A list of ``(passage, question, answer)`` tuples.  ``passage`` is a
        flat token list in which each statement is preceded by a marker token
        ``"<i>:"`` (``i`` = zero-based position of the statement within the
        current story), ``question`` is a token list and ``answer`` is the raw
        answer string.

    Raises:
        ValueError: If a line has no leading integer, or a question line does
            not consist of exactly three TAB-separated fields.
    """
    # Words (optionally with apostrophes) or single punctuation marks.
    tokenize = re.compile(r"[\w']+|[.,!?]").findall

    examples = []
    statements = []
    for raw_line in text.readlines():
        line = raw_line.decode('utf-8').strip()
        if not line:
            # Tolerate blank lines instead of failing on the split below.
            continue
        number, line = line.split(' ', 1)
        if int(number) == 1:
            # Line numbering restarts at 1 for every new story.
            statements = []
        if '\t' in line:
            question, answer, _ = line.split('\t')
            # Flatten directly; this also copes with a question that appears
            # before any statement (empty passage).
            passage = []
            for position, tokens in enumerate(statements):
                if tokens:
                    passage.append(str(position) + ":")
                    passage.extend(tokens)
            if max_length is None or len(passage) < max_length:
                examples.append((passage, tokenize(question), answer))
        else:
            statements.append(tokenize(line))
    return examples

# Parse the train and test splits of the selected task (challenge[0]) straight
# out of the downloaded archive.
with tarfile.open(path) as tar:
    train = extract_stories(tar.extractfile(challenge[0].format('train')))
    test = extract_stories(tar.extractfile(challenge[0].format('test')))

# Shuffle with a dedicated, seeded generator so the example order is the same
# after every kernel restart.  The training cell uses validation_split, which
# Keras takes from the tail of the arrays, so a fixed order keeps the
# validation subset (and the reported metrics) comparable between runs.
# The global NumPy RNG state is left untouched.
SHUFFLE_SEED = 0
_shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
_shuffle_rng.shuffle(train)
_shuffle_rng.shuffle(test)

# Vocabulary: every token that occurs in a passage, a question or an answer.
vocab = set()
for story, q, answer in train + test:
    vocab.update(story, q, [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = {token: index for index, token in enumerate(vocab, start=1)}

# Longest passage / question over both splits; used as padding lengths.
story_maxlen = max(len(example[0]) for example in train + test)
query_maxlen = max(len(example[1]) for example in train + test)


def sort_corpus(data, word_idx):
    """Turn tokenised examples into index sequences and one-hot answers.

    Args:
        data: Iterable of examples; element 0 is the passage token list,
            element 1 the question token list and element 2 the answer token.
        word_idx: Mapping from token to integer index.

    Returns:
        A tuple ``(passages, questions, answers)`` of three parallel lists:
        index lists for each passage, index lists for each question, and one
        float vector of length ``len(word_idx) + 1`` per answer with a 1 at
        the answer's index.

    Raises:
        KeyError: If any token is missing from ``word_idx``.
    """
    # One slot more than the number of known tokens, so that an index equal
    # to len(word_idx) (1-based numbering) is still addressable.
    one_hot_width = len(word_idx) + 1

    encoded_passages, encoded_questions, encoded_answers = [], [], []
    for example in data:
        passage_tokens, question_tokens, answer_token = example[0], example[1], example[2]

        encoded_passages.append([word_idx[token] for token in passage_tokens])
        encoded_questions.append([word_idx[token] for token in question_tokens])

        target = np.zeros(one_hot_width)
        target[word_idx[answer_token]] = 1
        encoded_answers.append(target)

    return encoded_passages, encoded_questions, encoded_answers


# Training split: index-encode, then bring every sequence to a common length.
passage, question, answer = sort_corpus(train, word_idx)
x, xq, y = (
    pad_sequences(passage, maxlen=story_maxlen),
    pad_sequences(question, maxlen=query_maxlen),
    np.array(answer),
)

# Test split: same treatment, same target lengths as the training split.
passage, question, answer = sort_corpus(test, word_idx)
tx, txq, ty = (
    pad_sequences(passage, maxlen=story_maxlen),
    pad_sequences(question, maxlen=query_maxlen),
    np.array(answer),
)

# Report the vocabulary and the training tensor shapes, one item per line.
print(
    'vocab = {}'.format(vocab),
    'x.shape = {}'.format(x.shape),
    'xq.shape = {}'.format(xq.shape),
    'y.shape = {}'.format(y.shape),
    'story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen),
    sep='\n',
)


vocab = ['.', '0:', '1:', '2:', '3:', '4:', '5:', '6:', '7:', '8:', '9:', '?', 'Daniel', 'John', 'Mary', 'Sandra', 'Where', 'back', 'bathroom', 'bedroom', 'garden', 'hallway', 'is', 'journeyed', 'kitchen', 'moved', 'office', 'the', 'to', 'travelled', 'went']
x.shape = (10000, 78)
xq.shape = (10000, 4)
y.shape = (10000, 32)
story_maxlen, query_maxlen = 78, 4


In [4]:
print("Building the embedding matrix...")
GLOVE_PATH = '../../glove.6B'
glove_file = os.path.join(GLOVE_PATH, "glove.6B.{}d.txt".format(EMBED_HIDDEN_SIZE))

# Read the pre-trained vectors: one "<word> <v1> <v2> ..." record per line.
# An explicit UTF-8 encoding behaves the same on every platform, so no
# per-OS variant of the open() call is needed, and the context manager
# closes the file even if parsing fails.
embeddings_index = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split(" ")
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Report the malformed record and skip it; without the skip the
            # previous word's vector would be stored under this word.
            print(values[1:])
            continue
        embeddings_index[word] = coefs

# Row i holds the vector of the token with index i; row 0 (padding) and the
# rows of tokens without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((len(word_idx) + 1, EMBED_HIDDEN_SIZE))
for word, i in word_idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # glove.6B is distributed with an uncased (lower-case) vocabulary, so
        # retry capitalised corpus tokens such as names in lower case.
        embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('Found %s word vectors.' % len(embeddings_index))

Building the embedding matrix...
Found 400000 word vectors.


In [5]:
print('Build model...')

# One integer index sequence per example for the question and for the story.
question_Input = layers.Input(shape=xq[0].shape, name='question_Input')
story_Input = layers.Input(shape=x[0].shape, name='story_Input')

# --- Question encoder -------------------------------------------------------
# Embedding initialised from the GloVe-based embedding_matrix.
q_Embedding = layers.Embedding(input_dim=vocab_size, output_dim=EMBED_HIDDEN_SIZE,
                               weights=[embedding_matrix], input_length=query_maxlen)(question_Input)
# Bidirectional GRU (optimal dropout approx 0.4 without regularization)
q_Encode = layers.Bidirectional(recurrent.GRU(EMBED_HIDDEN_SIZE, return_sequences=True,
                                              kernel_regularizer=regularizers.l2(LAMBDA),
                                              dropout=0.3))(q_Embedding)
# Both directions together give 2*EMBED_HIDDEN_SIZE features per position.
q_Encode = layers.Reshape((query_maxlen, 2*EMBED_HIDDEN_SIZE))(q_Encode)

# --- Story encoder ----------------------------------------------------------
s_Embedding = layers.Embedding(input_dim=vocab_size, output_dim=EMBED_HIDDEN_SIZE,
                               weights=[embedding_matrix], input_length=story_maxlen)(story_Input)
# Bidirectional GRU (optimal dropout approx 0.4 without regularization)
s_Encode = layers.Bidirectional(recurrent.GRU(EMBED_HIDDEN_SIZE, return_sequences=True,
                                              kernel_regularizer=regularizers.l2(LAMBDA),
                                              dropout=0.3))(s_Embedding)
s_Encode = layers.Reshape((story_maxlen, 2*EMBED_HIDDEN_SIZE))(s_Encode)

# --- Attention --------------------------------------------------------------
# Similarity of every story position with every question position:
# (story_maxlen, d) x (query_maxlen, d) -> (story_maxlen, query_maxlen),
# where d = 2*EMBED_HIDDEN_SIZE.
dot_merge = layers.Dot(axes=[2, 2])([s_Encode, q_Encode])

# Softmax over the last axis, i.e. one attention distribution over the
# question positions for each story position.  (The Flatten + Dense branch
# that used to be built here was never connected to the output and has been
# removed.)
act = layers.Activation("softmax")(dot_merge)

# Shape-preserving reshape, kept to make the (story_maxlen, query_maxlen)
# layout explicit.
act_resh = layers.Reshape((story_maxlen, query_maxlen))(act)

# Attention output: attention-weighted sum of the question encodings for each
# story position (a batched matrix product) -> (story_maxlen, d).
attn_out = layers.Dot(axes=[2, 1])([act_resh, q_Encode])

# Blend each story state with its attended question summary along the feature
# axis -> (story_maxlen, 2*d), then flatten for the dense head.
blended = layers.Concatenate(axis=2)([s_Encode, attn_out])
flat2 = Flatten()(blended)

# --- Output head ------------------------------------------------------------
# TODO: Finish Logit + fully connected layer for RELU
relu = layers.Dense(EMBED_HIDDEN_SIZE, activation="relu")(flat2)

# TODO: Add vanilla softmax at output (no weight vector here, i.e. no Dense)
dense2 = layers.Dense(vocab_size, activation="softmax",
                      kernel_regularizer=regularizers.l2(LAMBDA))(relu)

model = Model(inputs=[story_Input, question_Input], outputs=[dense2])
# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


Build model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
story_Input (InputLayer)        (None, 78)           0                                            
__________________________________________________________________________________________________
question_Input (InputLayer)     (None, 4)            0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 78, 50)       1600        story_Input[0][0]                
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 4, 50)        1600        question_Input[0][0]             
______________________________________________________________________________________________

In [6]:
print('Training')
# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.summary()
now = datetime.now()
currDate = "{}-{}-{}".format(now.month, now.day, now.year)

outpath = "../Outputs"
# Create the output directory up front so neither the per-epoch checkpoint nor
# the history dump fails after training has already been paid for.
os.makedirs(outpath, exist_ok=True)
modelFilename = "{}_{}.h5".format(MODEL_NAME, currDate)
histFilename = "{}_history_{}.json".format(MODEL_NAME, currDate)

# `monitor` only takes effect together with save_best_only; without it the
# single checkpoint file is overwritten after every epoch regardless of
# val_loss.
saverCallback = callbacks.ModelCheckpoint(filepath=os.path.join(outpath, modelFilename),
                                          monitor="val_loss", verbose=1,
                                          save_best_only=True)
# NOTE(review): epochs is hard-coded to 100 although the config cell defines
# EPOCHS = 40; kept as-is to preserve the training length -- confirm which
# value is intended.
history = model.fit([x, xq], y,
                    batch_size=BATCH_SIZE,
                    epochs=100,
                    validation_split=0.05,
                    callbacks=[saverCallback])

# Persist the training curves before evaluating, so they survive a failure in
# the evaluation step.  Values are cast to plain floats because NumPy scalar
# types are not always JSON-serialisable.
serializable_history = {metric: [float(value) for value in values]
                        for metric, values in history.history.items()}
with open(os.path.join(outpath, histFilename), 'w') as histFile:
    json.dump(serializable_history, histFile)

loss, acc = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)

print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Training
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
story_Input (InputLayer)        (None, 78)           0                                            
__________________________________________________________________________________________________
question_Input (InputLayer)     (None, 4)            0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 78, 50)       1600        story_Input[0][0]                
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 4, 50)        1600        question_Input[0][0]             
__________________________________________________________________________________________________
b

KeyboardInterrupt: 

In [None]:
import os, re
import json
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline

In [None]:
outpath = "../Outputs"
# Load the saved training history (metric name -> list of values); the
# context manager closes the file handle once it has been read.
with open(os.path.join(outpath, "Baseline_history_11-17-2018.json"), "r") as hist_file:
    hist = json.load(hist_file)

# Number of recorded points, taken from the first metric in the file.
xlen = len(hist[list(hist.keys())[0]])
print(xlen)
x_pts = list(range(xlen))

# Training loss (left) and training accuracy (right) side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False)
fig.set_figheight(4)
fig.set_figwidth(15)
sns.lineplot(x=x_pts, y=hist["loss"], ax=ax1).set_title("loss")
sns.lineplot(x=x_pts, y=hist["acc"], ax=ax2).set_title("acc")
# Label the shared x quantity so each panel can be read on its own.
ax1.set_xlabel("epoch")
ax2.set_xlabel("epoch")
fig.savefig(os.path.join(outpath, "{}_train_metrics.png".format(MODEL_NAME)))

In [None]:
# Validation loss (left) and validation accuracy (right) side by side, drawn
# from the history and x positions loaded by the previous cell.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False)
fig.set_figheight(4)
fig.set_figwidth(15)
for panel, metric in zip((ax1, ax2), ("val_loss", "val_acc")):
    # Each panel is titled with the name of the metric it shows.
    sns.lineplot(x=x_pts, y=hist[metric], ax=panel).set_title(metric)
fig.savefig(os.path.join(outpath, "{}_val_metrics.png".format(MODEL_NAME)))