In [1]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        #y = np.zeros(len(word_idx) + 1)
        #y[word_idx[answer]] = 1
        y = np.zeros((2, story_maxlen+3))
        if(answer == ['yes']):
            y[0][story_maxlen] = 1
            y[1][story_maxlen] = 1
        elif(answer == ['no']):
            y[0][story_maxlen+1] = 1
            y[1][story_maxlen+1] = 1
        elif(answer == ['null']):
            y[0][story_maxlen+2] = 1
            y[1][story_maxlen+2] = 1
        else:
            y[0][answer[0]] = 1
            y[1][answer[1]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
    return (pad_sequences(xs, maxlen=story_maxlen),
            pad_sequences(xqs, maxlen=query_maxlen), np.array(ys))

RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 5
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

#Read in CMU Question Answer Dataset
print('Reading in CMU dataset....')
print('Reading in corpus...')
corpus = {}
for sets in range(1, 5):
    for a in range(1, 11):
        f = open("..//Data//Question_Answer_Dataset_v1.2//S08//data//set{}//a{}.txt.clean".format(sets, a), 'r', encoding="ANSI")
        content = f.read()
        f.close()
        content = content.replace('\n', ' ').replace('\r', '')
        content = re.sub(r'[^a-zA-Z ]', '', content).lower().split()
        corpus["data/set{}/a{}".format(sets, a)] = content
#print(corpus['data/set1/a1'])

print('Reading in questions...')
f = open("..//Data//Question_Answer_Dataset_v1.2//S08//question_answer_pairs.txt", 'r', encoding="ANSI")
content = f.readlines()
f.close()
content = [x.strip() for x in content]
content = [re.split(r'\t+', x) for x in content]
questions = []
for line in content[1:]:
    if len(line) > 5:
        article = corpus[line[5]]
        q = re.sub(r'[^a-zA-Z ]', '', line[1]).lower().split()
        ans = re.sub(r'[^a-zA-Z ]', '', line[2]).lower().split()
        y = [-1, -1]
        #print(ans)
        for i in range(len(article)-len(ans)):
            #print(article[i:i+len(ans)])
            if(article[i:i+len(ans)] == ans):
                y = [i, i+len(ans)]
                #print('Found one!')
        if(y == [-1, -1]):
            if(ans[0].lower() == 'yes'):
                y = ['yes']
            if(ans[0].lower() == 'no'):
                y = ['no']
            if(ans[0].lower() == 'null'):
                y = ['null']
                
        questions.append((article, q, y))
np.random.shuffle(questions)
train, test = questions[:1020], questions[1020:]


vocab = set()
for story, q, answer in train + test:
    words = story + q
    vocab |= set(words)
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)



print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))





RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100
Reading in CMU dataset....
Reading in corpus...
Reading in questions...
x.shape = (1020, 11728)
xq.shape = (1020, 99)
y.shape = (1020, 2, 11731)
story_maxlen, query_maxlen = 11728, 99


In [48]:
xq.shape

(1020, 99)

In [None]:
print('Build model...')

sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)
encoded_sentence = layers.Dropout(0.3)(encoded_sentence)

question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = layers.Dropout(0.3)(encoded_question)
encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question)
encoded_question = layers.RepeatVector(story_maxlen)(encoded_question)

merged = layers.add([encoded_sentence, encoded_question])
merged = RNN(EMBED_HIDDEN_SIZE)(merged)
merged = layers.Dropout(0.3)(merged)
preds_beg = layers.Dense(story_maxlen+3, activation='softmax')(merged)

merged = layers.add([encoded_sentence, encoded_question])
merged = RNN(EMBED_HIDDEN_SIZE)(merged)
merged = layers.Dropout(0.3)(merged)
preds_end = layers.Dense(story_maxlen+3, activation='softmax')(merged)



model = Model([sentence, question], [preds_beg, preds_end])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print('Training')
model.fit([x, xq], [y[:, 0], y[:, 1]],
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05, verbose=1)


Build model...
Training
Train on 969 samples, validate on 51 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [5]:
loss, acc = model.evaluate([tx, txq], [ty[:, 0], ty[:, 1]],
                           batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))



ValueError: too many values to unpack (expected 2)

In [52]:
tpreds = model.predict([tx, txq], verbose=1)



In [76]:
len(tpreds)
ty[0]
sum([1 for i in ty if i[0][11730] == 1 and i[1][11730] == 1])



155

In [63]:
y_pred = []
y_ans = []
for i in range(len(tpreds[0])):
    y0 = np.argmax(tpreds[0][i])
    y1 = np.argmax(tpreds[1][i])
    y_pred.append([y0, y1])
    ty0 = np.argmax(ty[i, 0])
    ty1 = np.argmax(ty[i, 1])
    y_ans.append([ty0, ty1])
print(y_pred)
print(y_ans)

[[11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730, 11730], [11730,

In [72]:
taccs = []
for i in range(len(y_pred)):
    tx_add = list(tx[i]) + ['yes', 'no', 'null']
    taccs.append(len(set(tx_add[y_pred[i][0]:y_pred[i][1]+1]).intersection(set(tx_add[y_ans[i][0]:y_ans[i][1]+1])))/float(len(set(tx_add[y_ans[i][0]:y_ans[i][1]+1]))))
tav_acc = np.mean(taccs)

print('Average test accuracy was ' + str(tav_acc))

Average test accuracy was 0.230998509687


In [73]:
print(ty[100])
print(tpreds[104])

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


IndexError: list index out of range

In [20]:
for i in range(len(tpreds)):
    print(float(sum([1 for j in ty[i] if j != 0])))

1.0
1.0
27.0
3.0
4.0
1.0
1.0
7.0
2.0
6.0
1.0
1.0
10.0
1.0
1.0
11.0
1.0
5.0
2.0
1.0
1.0
1.0
1.0
14.0
8.0
1.0
6.0
1.0
1.0
7.0
11.0
1.0
9.0
1.0
1.0
1.0
9.0
1.0
1.0
13.0
2.0
1.0
1.0
1.0
1.0
14.0
1.0
1.0
1.0
3.0
1.0
6.0
1.0
1.0
2.0
1.0
2.0
1.0
4.0
1.0
3.0
6.0
1.0
2.0
1.0
2.0
1.0
2.0
2.0
1.0
5.0
2.0
1.0
1.0
0.0
2.0
1.0
7.0
1.0
1.0
5.0
1.0
1.0
7.0
8.0
7.0
1.0
1.0
1.0
1.0
1.0
8.0
2.0
3.0
4.0
1.0
2.0
14.0
4.0
2.0
2.0
1.0
1.0
1.0
1.0
17.0
1.0
2.0
1.0
13.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
10.0
1.0
11.0
1.0
8.0
1.0
1.0
1.0
1.0
5.0
6.0
1.0
1.0
1.0
7.0
2.0
1.0
1.0
1.0
3.0
22.0
1.0
1.0
10.0
2.0
0.0
14.0
13.0
1.0
1.0
6.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
8.0
10.0
9.0
1.0
1.0
5.0
1.0
2.0
9.0
1.0
14.0
9.0
8.0
1.0
2.0
6.0
1.0
1.0
1.0
8.0
6.0
1.0
1.0
5.0
1.0
1.0
1.0
10.0
22.0
10.0
5.0
1.0
5.0
1.0
9.0
1.0
8.0
1.0
4.0
1.0
1.0
2.0
2.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
7.0
1.0
1.0
1.0
1.0
1.0
8.0
7.0
1.0
1.0
29.0
1.0
3.0
1.0
1.0
10.0
1.0
1.0
10.0
12.0
11.0
19.0
1.0
1.0
1.0
26.0
7.0
0.0
1.0
1.0
1.0
7.0
1.0
1.0


In [24]:
tpreds[0]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.51808214,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.56078947,  0.        ,
        2.16251731,  3.01238513,  3.13349175,  2.27461553,  2.97050619,
        0.        ,  1.03558469,  2.80897641,  3.94874954,  3.95648623,
        3.16882467,  4.30839443,  2.70530438,  3.31122398,  3.62634492,
        4.56028986,  2.56542516,  5.79891109,  5.04376698,  4.05345392,
        8.01044941], dtype=float32)