In [1]:
import numpy as np
import tensorflow as tf
import datetime
import os
import re
import tarfile
import functools

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense, LSTM, Input, Add, Dot, concatenate, Embedding, Permute

In [2]:
device_name = tf.test.gpu_device_name()
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


### import data

In [3]:
def tokenize(line):
	line = re.sub(r'[^a-zA-Z]', ' ', line)
	line = line.replace('   ', ' ').replace('  ', ' ')

	return line

def parse_stories(lines, only_supporting=False):
	data = []
	story = []

	for line in lines:
		line = line.decode('utf-8')
		line = line.strip()

		nid, line = line.split(' ', 1)
		nid = int(nid)

		if nid == 1:
			story = []
		if '\t' in line:
			q, a, sp = line.split('\t')
			q = tokenize(q)

			substory = None

			if only_supporting:
				sp = map(int, sp.split())
				substory = [story[i - 1] for i in sp]

			else:
				substory = [x for x in story if x]

			data.append((substory, q, a))
			story.append('')

		else:
			sent = tokenize(line)
			story.append(sent)

	return data

def get_stories(path, only_supporting=False):

    data = parse_stories(path.readlines(), only_supporting=only_supporting)
    flatten = lambda data: functools.reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, a) for story, q, a in data]
    return data

In [4]:
path = tf.keras.utils.get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')

tar = tarfile.open(path)

challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
}
challenge_type = 'single_supporting_fact_10k'
challenge = challenges[challenge_type]

print('Extracting stories for the challenge:', challenge_type)
# train_inputs, train_queries, train_answers = get_stories(tar.extractfile(challenge.format('train')))
# test_inputs, test_queries, test_answers = get_stories(tar.extractfile(challenge.format('test')))

train = get_stories(tar.extractfile(challenge.format('train')), only_supporting=True)
test = get_stories(tar.extractfile(challenge.format('test')), only_supporting=True)

print('train size: {}, test size: {}'.format(len(train), len(test)))

Extracting stories for the challenge: single_supporting_fact_10k
train size: 10000, test size: 1000


In [5]:
for i in np.random.randint(0, 10000, size=3):
    print('+'*30)
    print(train[i])

++++++++++++++++++++++++++++++
('Mary went back to the kitchen ', 'Where is Mary ', 'kitchen')
++++++++++++++++++++++++++++++
('Sandra travelled to the kitchen ', 'Where is Sandra ', 'kitchen')
++++++++++++++++++++++++++++++
('Daniel went to the hallway ', 'Where is Daniel ', 'hallway')


In [6]:
def tokenize_data(data):
    corpus = [' '.join((s, q, a)) for s,q,a in data]

    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(corpus)

    return tokenizer

tokenizer = tokenize_data(train+test)

In [7]:
vocab_size = len(tokenizer.word_index) + 1
story_maxlen = max(map(len, (x for x,_,_ in train+test)))
query_maxlen = max(map(len, (x for _,x,_ in train+test)))
# answer_maxlen = max(map(len, (x for _,_,x in train+test)))
answer_maxlen = vocab_size

print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('answer max length:', answer_maxlen, 'words')

Vocab size: 20 unique words
Story max length: 33 words
Query max length: 16 words
answer max length: 20 words


In [8]:
def padding_data(data, story_maxlen, query_maxlen, answer_maxlen, tokenizer):
    story, ques, ans = [], [], []

    word_idx = tokenizer.word_index
    
    for s, q, a in data:
        story.append(s)
        ques.append(q)
        
        y = np.zeros(len(word_idx)+1)
        y[word_idx[a]] = 1
        ans.append(y)

    story_seq = tokenizer.texts_to_sequences(story)
    ques_seq = tokenizer.texts_to_sequences(ques)
    ans_seq = np.array(ans)
    
    return pad_sequences(story_seq, maxlen=story_maxlen), pad_sequences(ques_seq, maxlen=query_maxlen), ans_seq

train_inputs, train_queries, train_answers = padding_data(train, story_maxlen, query_maxlen, answer_maxlen, tokenizer)
test_inputs, test_queries, test_answers = padding_data(test, story_maxlen, query_maxlen, answer_maxlen, tokenizer)
print(train_inputs.shape, train_queries.shape, train_answers.shape)
print(test_inputs.shape, test_queries.shape, test_answers.shape)

(10000, 33) (10000, 16) (10000, 20)
(1000, 33) (1000, 16) (1000, 20)


In [9]:
with tf.device('/device:GPU:0'):
    input_sequence = Input((story_maxlen,))
    question = Input((query_maxlen,))

    #HyperParameters
    BATCH_SIZE = 32
    EPOCHS = 10

    input_encoder_m = Sequential()

    input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
    input_encoder_m.add(Dropout(0.3))

    input_encoder_c = Sequential()
    input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=query_maxlen))
    input_encoder_c.add(Dropout(0.3))

    question_encoder = Sequential()
    question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=query_maxlen))
    question_encoder.add(Dropout(0.3))

    # encoder the input sequence
    input_encoded_m = input_encoder_m(input_sequence)
    input_encoded_c = input_encoder_c(input_sequence)
    question_encoded = question_encoder(question)

    # print(input_encoded_m.shape, input_encoded_c.shape, question_encoded.shape)

    # compute the match between input sequence and question sequence
    match = Dot(axes=(2,2))([input_encoded_m, question_encoded])
    match = Activation('softmax')(match)

    # print('match shape:', match.shape)

    response = Add()([match, input_encoded_c])
    response = Permute((2,1))(response)

    # print('response shape:', response.shape)

    answer = concatenate([response, question_encoded])

    # print('after concat:', answer.shape)

    answer = LSTM(64, activation='relu')(answer)
    answer = Dropout(0.3)(answer)

    answer = Dense(answer_maxlen)(answer)
    answer = Activation('softmax')(answer)

    # print('after core of model:', answer.shape)

    model = Model([input_sequence, question], answer)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, None, 64)     1280        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 16, 64)       1280        input_2[0][0]                    
_______________________________________________________________________________________

In [10]:
with tf.device('/device:GPU:0'):
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    history = model.fit([train_inputs, train_queries], train_answers,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=([test_inputs, test_queries], test_answers)
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
def convert(tensor, tokenizer):
    idx_word = tokenizer.index_word
    string = ''

    for i in tensor[0].numpy():
        if i == 0:
            continue
        string += idx_word[i] + ' '

    return string

In [12]:
for i in np.random.randint(0, len(test), size=10):
    query_text, story_text = tf.expand_dims(test_queries[i], 0), tf.expand_dims(test_inputs[i], 0)
    pre = model.predict((story_text, query_text))

    valuemax = pre.argmax()

    print('+'*50)
    print('story:', convert(story_text, tokenizer))
    print('question:', convert(query_text, tokenizer))
    print('answer:', tokenizer.index_word[valuemax])
    print('probability of the answer:', pre[0][valuemax])

++++++++++++++++++++++++++++++++++++++++++++++++++
story: daniel went to the office 
question: where is daniel 
answer: office
probability of the answer: 1.0
++++++++++++++++++++++++++++++++++++++++++++++++++
story: john went back to the office 
question: where is john 
answer: office
probability of the answer: 1.0
++++++++++++++++++++++++++++++++++++++++++++++++++
story: mary travelled to the garden 
question: where is mary 
answer: garden
probability of the answer: 0.9999763
++++++++++++++++++++++++++++++++++++++++++++++++++
story: mary journeyed to the bathroom 
question: where is mary 
answer: bathroom
probability of the answer: 1.0
++++++++++++++++++++++++++++++++++++++++++++++++++
story: daniel journeyed to the kitchen 
question: where is daniel 
answer: kitchen
probability of the answer: 1.0
++++++++++++++++++++++++++++++++++++++++++++++++++
story: mary went back to the kitchen 
question: where is mary 
answer: kitchen
probability of the answer: 1.0
+++++++++++++++++++++++++++++