In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import re
import time

# Config

In [2]:
vocab_size = 400000
embedding_dim = 50
hidden_layer_size = 100
num_rows = 1000
num_steps = 3
num_epochs = 3

# Preprocessing

In [3]:
def read_glove(filename):
    file = open(filename)    
    embedding = np.ndarray([vocab_size, embedding_dim])
    word_id_dict = {}
    id = 0
    for line in file:
        items = line.split(' ')
        word_id_dict[items[0]] = id
        embedding[id,:] = np.array([float(i) for i in items[1:]])
        id += 1
    file.close()
    return(embedding, word_id_dict)

embedding, word_id_dict = read_glove("../../datasets/glove.6B/glove.6B.50d.txt")

def read_data(filename):
    file = open(filename)
    chapter_input = []
    data = []
    for line in file:
        items = re.sub('[?.]', '', line).lower().split()
        if items[0] == '1':
            chapter_input = items[1:] + ['.']
        elif items[-1].isdigit():
            data.append({'I': chapter_input,
                         'Q': items[1:-2],
                         'A': [items[-2]]})
        else:
            chapter_input = chapter_input + items[1:] + ['.']
    file.close()
    return(data)

def max_len(data, iqa):
    max_len = 0
    for i in data:
        max_len = max(max_len, len(i[iqa]))
    return(max_len)

def embed_and_pad(data):
    inputs = np.zeros([len(data), max_len(data, 'I'), embedding_dim])
    questions = np.zeros([len(data), max_len(data, 'Q'), embedding_dim])
    for index, row in enumerate(data):
        inputs[index,0:len(row['I']),:] = embedding[[word_id_dict[token] for token in row['I']]]
        questions[index,0:len(row['Q']),:] = embedding[[word_id_dict[token] for token in row['Q']]]
    return((inputs, questions))

def get_answer_index(data):
    answers = np.zeros(num_rows)
    for index, row in enumerate(data):
        answers[index] = word_id_dict[row['A'][0]]
    return(answers)

def get_input_sequence_lengths(data):
    input_sequence_lengths = []
    for i in data:
        input_sequence_lengths.append(len(i['I']))
    return(input_sequence_lengths)

def get_input_period_boolean(data):
    input_period_boolean = np.zeros((num_rows, max_input_len), dtype=bool)
    for index, row in enumerate(data):
        input_period_boolean[index, [i for i, j in enumerate(row['I']) if j=='.']] = True
    return(input_period_boolean)

def get_max_facts(input_period_boolean):
    max_facts = max([sum(i) for i in input_period_boolean])
    return(max_facts)

# DMN Implementation

In [4]:
## Placeholders
inputs = tf.placeholder(tf.float32, shape=[num_rows, max_input_len, embedding_dim])
questions = tf.placeholder(tf.float32, shape=[num_rows, max_question_len, embedding_dim])
answers = tf.placeholder(tf.int32, shape=[num_rows])
periods = tf.placeholder(tf.bool, shape=[num_rows, max_input_len])

gru_cell = tf.contrib.rnn.GRUCell(hidden_layer_size)

## Question module
with tf.variable_scope('question_module'):
    _, q = tf.nn.dynamic_rnn(gru_cell,
                                  questions,
                                  dtype=tf.float32)
    
## Input module
with tf.variable_scope('input_module'):
    i_output, _ = tf.nn.dynamic_rnn(gru_cell,
                                          inputs,
                                          dtype=tf.float32,
                                          sequence_length=input_sequence_lengths)
c = []
for index in range(num_rows):
    states_at_periods = tf.boolean_mask(i_output[index,:,:], periods[index,:])
    padding = tf.zeros([max_facts - tf.shape(states_at_periods)[0], hidden_layer_size])
    c.append(tf.concat([states_at_periods, padding], 0))
c = tf.unstack(tf.transpose(tf.stack(c), perm=[1,0,2]), num = max_facts)

## Episodic Memory module
with tf.variable_scope('episodic_memory_module') as scope:
    m_i = q
    for step in range(num_steps):
        h_t = tf.zeros_like(c[0])
        e_i = tf.zeros_like(c[0])
        for c_t in c:
            # calculate g
            z = tf.concat([c_t, m_i, q, 
                           tf.multiply(c_t, q), 
                           tf.multiply(c_t, m_i),
                           tf.abs(tf.subtract(c_t, q)),
                           tf.abs(tf.subtract(c_t, m_i))], 1) # need to add 2 more terms in there (V2)
            layer1 = tf.contrib.layers.fully_connected(inputs = z,
                                                      num_outputs = hidden_layer_size,
                                                      activation_fn = tf.nn.tanh,
                                                      reuse = True,
                                                      scope = 'g_layer_1')
            g = tf.contrib.layers.fully_connected(inputs = layer1,
                                                      num_outputs = 1,
                                                      activation_fn = tf.nn.sigmoid,
                                                      reuse = True,
                                                      scope = 'g_layer_2')
            # from section 4.1
            e_i = tf.add(e_i, tf.multiply(tf.nn.softmax(g), c_t))
    #             # compute episode for pass i
    #             h_t = tf.multiply(g, gru_cell(c_t, h_t)) + tf.multiply(tf.subtract(1, g), h_t)
    #         # episode is the last hidden state
    #         e_i = h_t
        m_i = gru_cell(e_i, m_i)[0]
        scope.reuse_variables()
        
## Answer module
with tf.variable_scope('answer_module'):
    logits = tf.contrib.layers.fully_connected(inputs = m_i,
                                              num_outputs = vocab_size,
                                              activation_fn = None)
    
## Loss and metrics
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = answers)
loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)
prediction = tf.cast(tf.argmax(logits, 1), 'int32')
num_correct = tf.reduce_sum(tf.cast(tf.equal(prediction, answers), tf.int32))
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, answers), tf.float32))

ValueError: None values not supported.

# Training

In [None]:
data = read_data("../../datasets/facebook_babi/tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt")
max_input_len = max_len(data, 'I')
max_question_len = max_len(data, 'Q')
max_answer_len = max_len(data, 'A')
data_inputs, data_questions = embed_and_pad(data)
data_answers = get_answer_index(data)
input_sequence_lengths = get_input_sequence_lengths(data)
input_period_boolean = get_input_period_boolean(data)
max_facts = get_max_facts(input_period_boolean)

In [25]:
start_time = time.time()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for epoch in range(num_epochs):
    epoch_loss, _, epoch_num_correct, epoch_accuracy = sess.run((loss, optimizer, num_correct, accuracy), 
                                                                feed_dict={inputs: data_inputs, 
                                                                        questions: data_questions, 
                                                                        answers: data_answers, 
                                                                        periods: input_period_boolean})
    print("Epoch %d: %.2f%% complete, %d mins, Loss: %.9f, Num correct: %d, Accuracy: %.2f%%" % (epoch, 
                                                                       epoch/num_epochs*100,
                                                                        (time.time() - start_time)/60,
                                                                       epoch_loss, 
                                                                       epoch_num_correct,
                                                                        epoch_accuracy*100))
end_time = time.time()
print("Duration: %d mins" % int((end_time - start_time)/60))

Epoch 0: 0.00% complete, 4 mins, Loss: 12.906951904, Num correct: 0, Accuracy: 0.0000
Epoch 1: 33.33% complete, 6 mins, Loss: 12.835850716, Num correct: 155, Accuracy: 0.1550
Epoch 2: 66.67% complete, 8 mins, Loss: 12.753013611, Num correct: 155, Accuracy: 0.1550
Duration: 8 mins


# Testing

In [None]:
data = read_data("../../datasets/facebook_babi/tasks_1-20_v1-2/en/qa1_single-supporting-fact_test.txt")
max_input_len = max_len(data, 'I')
max_question_len = max_len(data, 'Q')
max_answer_len = max_len(data, 'A')
data_inputs, data_questions = embed_and_pad(data)
data_answers = get_answer_index(data)
input_sequence_lengths = get_input_sequence_lengths(data)
input_period_boolean = get_input_period_boolean(data)
max_facts = get_max_facts(input_period_boolean)

In [None]:
test_prediction = sess.run(prediction, feed_dict={inputs: data_inputs, 
                                                        questions: data_questions, 
                                                        answers: data_answers, 
                                                        periods: input_period_boolean})