In [1]:
import copy
import errno
import glob
import re
import time

import numpy as np
import pandas as pd
import tensorflow as tf

def _tokenize_babi(text):
    """Remove '?' and '.', lowercase the text and split it on whitespace."""
    return re.sub('[?.]', '', text).lower().split()


def _find_babi_file(data_dir, task, is_training):
    """Return the path of the train/test file for bAbI task number `task`.

    Files are matched with the pattern ``qa<task>_*_<split>.txt`` inside
    `data_dir`, so the descriptive middle part of the file name does not have
    to be spelled out. Raises IOError (errno ENOENT) when nothing matches.
    """
    split = "train" if is_training else "test"
    pattern = "%s/qa%s_*_%s.txt" % (data_dir, task, split)
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise IOError(errno.ENOENT,
                      "no bAbI file found for task %s (%s split)" % (task, split),
                      pattern)
    return matches[0]


def load_babi_data(is_training, task, embedding_dim, embedding, word_id_dict,
                   data_dir="../../../datasets/facebook_babi/tasks_1-20_v1-2/en"):
    """Load one bAbI task and turn it into zero-padded embedding arrays.

    Each question in the file becomes one "chapter": the statements seen since
    the last line numbered 1, the question words and the answer word.

    Args:
        is_training: truthy selects the ``*_train.txt`` file, falsy ``*_test.txt``.
        task: bAbI task number used to pick the ``qa<task>_*`` file.
        embedding_dim: size of the last axis of `embedding`.
        embedding: 2-D array indexed by word id.
        word_id_dict: mapping from lowercase word to row index in `embedding`.
        data_dir: directory that contains the bAbI ``qa*.txt`` files.

    Returns:
        Tuple of
        (data_inputs    [chapters, sentences, words, embedding_dim],
         data_questions [chapters, question_words, embedding_dim],
         data_answers   [chapters] holding the word id of each answer,
         num_sentences_in_each_chapter (list of int),
         num_words_in_longest_input_sentence,
         num_words_in_longest_question,
         num_sentences_in_longest_input,
         num_chapters).

    Raises:
        IOError: no file matches the requested task/split.
        ValueError: the file contains no questions, or a question has no answer.
        KeyError: a word (or answer token) is missing from `word_id_dict`; this
            includes multi-word answers such as comma-separated lists.
    """
    filepath = _find_babi_file(data_dir, task, is_training)

    num_words_in_longest_input_sentence = 0
    num_words_in_longest_question = 0
    num_sentences_in_each_chapter = []
    chapter_input = []
    data = []

    with open(filepath) as babi_file:
        for line in babi_file:
            items = _tokenize_babi(line)
            if not items:
                continue  # blank line

            if '\t' in line:
                # Standard bAbI layout: "<id> <question>\t<answer>\t<supporting ids>".
                # Splitting on tabs keeps this correct for any number of supporting ids.
                fields = line.split('\t')
                question = _tokenize_babi(fields[0])[1:]
                answer = _tokenize_babi(fields[1])[:1]
            elif len(items) >= 3 and items[-1].isdigit():
                # Fallback for files whose tabs were converted to spaces:
                # assume exactly one trailing supporting-fact id.
                question = items[1:-2]
                answer = [items[-2]]
            else:
                question = answer = None

            if question is not None:
                if not answer:
                    raise ValueError("question without an answer in %s: %r" % (filepath, line))
                # Snapshot the story so later statements do not leak into this chapter.
                data.append({'I': list(chapter_input), 'Q': question, 'A': answer})
                num_sentences_in_each_chapter.append(len(chapter_input))
                num_words_in_longest_question = max(num_words_in_longest_question, len(question))
            else:
                sentence = items[1:]
                if items[0] == '1':
                    # Line id 1 marks the start of a new story.
                    chapter_input = [sentence]
                else:
                    chapter_input.append(sentence)
                num_words_in_longest_input_sentence = max(num_words_in_longest_input_sentence, len(sentence))

    if not data:
        raise ValueError("no questions found in %s" % filepath)

    num_sentences_in_longest_input = max(num_sentences_in_each_chapter)
    num_chapters = len(data)

    data_inputs = np.zeros([num_chapters, num_sentences_in_longest_input, num_words_in_longest_input_sentence, embedding_dim])
    data_questions = np.zeros([num_chapters, num_words_in_longest_question, embedding_dim])
    data_answers = np.zeros([num_chapters])
    for chapter_index, chapter in enumerate(data):
        for sentence_index, sentence in enumerate(chapter['I']):
            data_inputs[chapter_index, sentence_index, 0:len(sentence), :] = embedding[[word_id_dict[word] for word in sentence]]
        data_questions[chapter_index, 0:len(chapter['Q']), :] = embedding[[word_id_dict[word] for word in chapter['Q']]]
        data_answers[chapter_index] = word_id_dict[chapter['A'][0]]

    return(data_inputs, data_questions, data_answers,
           num_sentences_in_each_chapter, num_words_in_longest_input_sentence,
           num_words_in_longest_question, num_sentences_in_longest_input,
           num_chapters)

def load_glove(filepath="../../../datasets/glove_6b/glove.6B.50d.txt",
               vocab_size=400000, embedding_dim=50):
    """Read GloVe vectors from a space-separated text file.

    Every line is expected to be ``<word> <v1> ... <v_embedding_dim>``. Words
    receive consecutive ids in file order.

    Args:
        filepath: path of the GloVe text file.
        vocab_size: number of rows of the returned matrix; at most this many
            lines are read, and rows for which the file has no line stay zero.
        embedding_dim: number of values per word.

    Returns:
        (embedding, word_id_dict, id_word_dict, vocab_size, embedding_dim) where
        `embedding` has shape [vocab_size, embedding_dim], `word_id_dict` maps
        word -> row id and `id_word_dict` maps row id -> word.

    Raises:
        ValueError: a line has a non-numeric value or the wrong number of values.
    """
    # Zero-initialise so unread rows are deterministic rather than
    # whatever happened to be in memory.
    embedding = np.zeros([vocab_size, embedding_dim])
    word_id_dict = {}
    id_word_dict = {}
    word_id = 0
    with open(filepath) as glove_file:
        for line in glove_file:
            if word_id >= vocab_size:
                break
            items = line.rstrip('\n').split(' ')
            if len(items) < 2:
                continue  # blank line
            word = items[0]
            word_id_dict[word] = word_id
            id_word_dict[word_id] = word
            embedding[word_id, :] = np.array([float(value) for value in items[1:]])
            word_id += 1
    return(embedding, word_id_dict, id_word_dict, vocab_size, embedding_dim)

def create_position_encoding(embedding_dim, num_words_in_longest_input_sentence):
    """Build the positional weights applied to each word embedding of a sentence.

    The formula is the one used in https://github.com/domluna/memn2n. The
    variant from section 3.1 of https://arxiv.org/pdf/1603.01417.pdf was tried
    by the original author and noted as not working, so it is not used.

    Returns:
        float32 array of shape [num_words_in_longest_input_sentence, embedding_dim].
    """
    le = embedding_dim + 1
    ls = num_words_in_longest_input_sentence + 1

    # Offsets of every embedding index / word position from the respective midpoint.
    dim_offsets = [dim - (le - 1) / 2 for dim in range(1, le)]
    word_offsets = [pos - (ls - 1) / 2 for pos in range(1, ls)]

    weights = np.ones([embedding_dim, num_words_in_longest_input_sentence], dtype=np.float32)
    # Outer product fills weights[d, w] = dim_offsets[d] * word_offsets[w].
    weights[:, :] = np.outer(dim_offsets, word_offsets)

    weights = 1 + 4 * weights / embedding_dim / num_words_in_longest_input_sentence
    return(weights.T)

In [2]:
class dmn_plus:
    """Question-answering network with question, input, episodic-memory and answer modules.

    Typical use: ``load_embeddings()``, ``load_data()``, ``create_tensorflow_graph()``,
    then ``train(sess)`` / ``test(sess)`` with an initialised TensorFlow session.
    The graph's placeholder shapes are taken from the data that is loaded when
    ``create_tensorflow_graph()`` runs.
    """

    def __init__(self):
        self.hidden_layer_size = 80
        self.num_steps = 3  # number of episodic-memory passes
        self.batch_size = 100
        # NOTE: despite the name, this value is passed to tf.nn.dropout as its
        # second positional argument, i.e. the probability of KEEPING a unit.
        self.dropout_probability = 0.9
        self.l2_regularization_lambda = 0.001
        self.learning_rate = 0.001
        self.num_epochs = 100

    def load_embeddings(self):
        """Load the GloVe matrix and the word<->id dictionaries."""
        (self.embedding,
         self.word_id_dict,
         self.id_word_dict,
         self.vocab_size,
         self.embedding_dim) = load_glove()

    def load_data(self, is_training = True, task = 1):
        """Load one bAbI split into padded arrays and rebuild the position encoding."""
        (self.data_inputs,
         self.data_questions,
         self.data_answers,
         self.num_sentences_in_each_chapter,
         self.num_words_in_longest_input_sentence,
         self.num_words_in_longest_question,
         self.num_sentences_in_longest_input,
         self.num_chapters) = load_babi_data(is_training, task, self.embedding_dim, self.embedding, self.word_id_dict)
        self.position_encoding = create_position_encoding(self.embedding_dim, self.num_words_in_longest_input_sentence)

    def get_batch(self, batch_number):
        """Return the feed dict for batch `batch_number` (0-based, `batch_size` chapters)."""
        start = batch_number * self.batch_size
        stop = start + self.batch_size
        return {self.inputs: self.data_inputs[start:stop],
                self.questions: self.data_questions[start:stop],
                self.answers: self.data_answers[start:stop],
                self.input_lengths: self.num_sentences_in_each_chapter[start:stop]
               }

    def _num_batches(self):
        """Number of full batches in the loaded data (a trailing partial batch is skipped)."""
        # Floor division: plain `/` yields a float on Python 3 and breaks range().
        return self.num_chapters // self.batch_size

    def train(self, sess):
        """Run `num_epochs` passes over the loaded data, printing loss and accuracy per epoch."""
        start_time = time.time()
        num_batches = self._num_batches()
        # Accuracy is reported over the chapters actually fed to the network.
        num_evaluated = max(num_batches * self.batch_size, 1)
        for epoch in range(self.num_epochs):
            epoch_loss = epoch_num_correct = 0
            for batch_idx in range(num_batches):
                feed_dict = self.get_batch(batch_idx)
                # Dropout is only enabled while training.
                feed_dict[self.keep_prob] = self.dropout_probability
                batch_loss, batch_num_correct, _ = sess.run((self.loss, self.num_correct, self.optimizer),
                                                            feed_dict = feed_dict)
                epoch_loss += batch_loss
                epoch_num_correct += batch_num_correct
            print("Epoch %d: %.2f%% complete, %d mins, Loss: %.2f, Num correct: %d, Accuracy: %.2f%%" % (epoch,
                                                                                   epoch*100.0/self.num_epochs,
                                                                                   (time.time() - start_time)/60,
                                                                                   epoch_loss,
                                                                                   epoch_num_correct,
                                                                                   epoch_num_correct*100.0/num_evaluated))
        end_time = time.time()
        print("Duration: %d mins" % int((end_time - start_time)/60))

    def test(self, sess):
        """Evaluate the currently loaded data without dropout and print the accuracy."""
        start_time = time.time()
        num_batches = self._num_batches()
        num_evaluated = max(num_batches * self.batch_size, 1)
        total_num_correct = 0
        for batch_idx in range(num_batches):
            # keep_prob is not fed here, so it takes its default of 1.0 (no dropout).
            batch_num_correct = sess.run(self.num_correct, feed_dict = self.get_batch(batch_idx))
            total_num_correct += batch_num_correct
        print("%d mins, Num correct: %d, Accuracy: %.2f%%" % ((time.time() - start_time)/60,
                                                              total_num_correct,
                                                              total_num_correct*100.0/num_evaluated))

    def create_tensorflow_graph(self):
        """Build placeholders, the four modules, the loss, the optimizer and the accuracy op.

        Must be called after load_embeddings() and load_data(): the placeholder
        shapes and the position encoding are read from the loaded data.
        """
        self.inputs = tf.placeholder(tf.float32, [self.batch_size, self.num_sentences_in_longest_input, self.num_words_in_longest_input_sentence, self.embedding_dim])
        self.questions = tf.placeholder(tf.float32, [self.batch_size, self.num_words_in_longest_question, self.embedding_dim])
        self.answers = tf.placeholder(tf.int32, [self.batch_size])
        self.input_lengths = tf.placeholder(tf.int32, [self.batch_size])
        # Keep probability for dropout. Defaults to 1.0 so that any run which does
        # not feed it (e.g. test()) is evaluated without dropout.
        self.keep_prob = tf.placeholder_with_default(tf.constant(1.0, dtype=tf.float32), shape=[])

        ## Question module: the final GRU state summarises the question.
        with tf.variable_scope('question_module'):
            question_gru_cell = tf.contrib.rnn.GRUCell(self.hidden_layer_size)
            _, question_vector = tf.nn.dynamic_rnn(question_gru_cell,
                                                  self.questions,
                                                  dtype=tf.float32)

        ## Input module
        with tf.variable_scope('input_module'):
            # Weight every word embedding by its position and sum over the word
            # axis, giving one vector per sentence.
            positionally_encoded_inputs = tf.reduce_sum(self.inputs*self.position_encoding, 2)

            input_forward_gru_cell = tf.contrib.rnn.GRUCell(self.hidden_layer_size)
            input_backward_gru_cell = tf.contrib.rnn.GRUCell(self.hidden_layer_size)
            input_module_output, _ = tf.nn.bidirectional_dynamic_rnn(input_forward_gru_cell,
                                                                    input_backward_gru_cell,
                                                                    positionally_encoded_inputs,
                                                                    sequence_length = self.input_lengths,
                                                                    dtype = tf.float32)
            # Sum of forward and backward outputs is the fact vector of each sentence.
            input_fact_vectors = tf.add(input_module_output[0], input_module_output[1])
            input_fact_vectors = tf.nn.dropout(input_fact_vectors, self.keep_prob)

        ## Episodic Memory module
        with tf.variable_scope('episodic_memory_module'):
            previous_memory = question_vector
            for step in range(self.num_steps):
                attentions = []
                for fact_index, fact_vector in enumerate(tf.unstack(input_fact_vectors, axis = 1)):
                    # The attention weights are created once (first step, first fact)
                    # and shared by every other fact and step.
                    reuse = bool(step) or bool(fact_index)
                    with tf.variable_scope("attention", reuse = reuse):
                        z = tf.concat([tf.multiply(fact_vector, question_vector),
                                       tf.multiply(fact_vector, previous_memory),
                                       tf.abs(tf.subtract(fact_vector, question_vector)),
                                       tf.abs(tf.subtract(fact_vector, previous_memory))], 1)
                        attention = tf.contrib.layers.fully_connected(z,
                                                                    self.embedding_dim,
                                                                    activation_fn=tf.nn.tanh,
                                                                    reuse=reuse, scope="fc1")
                        attention = tf.contrib.layers.fully_connected(attention,
                                                                    1,
                                                                    activation_fn=None,
                                                                    reuse=reuse, scope="fc2")
                        # Drop only the trailing size-1 axis so a batch of size 1
                        # keeps its batch dimension.
                        attentions.append(tf.squeeze(attention, axis=[1]))
                # [facts, batch] -> [batch, facts], softmax over facts, then a
                # trailing axis so it broadcasts against the fact vectors.
                attentions = tf.expand_dims(tf.nn.softmax(tf.transpose(tf.stack(attentions))), axis=-1)
                # soft attention
                context_vector = tf.reduce_sum(tf.multiply(input_fact_vectors, attentions), axis = 1)
                # The memory-update layer has its own weights for every step.
                with tf.variable_scope("step%d"%step):
                    previous_memory = tf.contrib.layers.fully_connected(tf.concat([previous_memory, context_vector, question_vector], axis = 1),
                                                                                self.hidden_layer_size,
                                                                                activation_fn=tf.nn.relu)
            previous_memory = tf.nn.dropout(previous_memory, self.keep_prob)

        ## Answer module
        with tf.variable_scope('answer_module') as scope:
            logits = tf.contrib.layers.fully_connected(inputs = tf.concat([previous_memory, question_vector], axis = 1),
                                                      num_outputs = self.vocab_size,
                                                      activation_fn = None)

            ## Loss and metrics
            # Cross-entropy summed (not averaged) over the batch.
            self.loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = self.answers))

            # add l2 regularization for all variables except biases
            for v in tf.trainable_variables():
                if not 'bias' in v.name.lower():
                    self.loss += self.l2_regularization_lambda * tf.nn.l2_loss(v)

            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

            prediction = tf.cast(tf.argmax(tf.nn.softmax(logits), 1), 'int32')
            self.num_correct = tf.reduce_sum(tf.cast(tf.equal(prediction, self.answers), tf.int32))

In [9]:
# Build the model, load the GloVe embeddings and the task-1 training split,
# then construct the graph (its placeholder shapes come from the loaded data).
model = dmn_plus()
model.load_embeddings()
model.load_data(is_training=True, task=1)
model.create_tensorflow_graph()
# The session is kept open because a later cell reuses it for evaluation.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
model.train(sess)

Epoch 0: 0.00% complete, 0 mins, Loss: 12146.82, Num correct: 148, Accuracy: 14.80%
Epoch 1: 1.00% complete, 0 mins, Loss: 5289.31, Num correct: 166, Accuracy: 16.60%
Epoch 2: 2.00% complete, 0 mins, Loss: 1973.65, Num correct: 158, Accuracy: 15.80%
Epoch 3: 3.00% complete, 0 mins, Loss: 2087.26, Num correct: 147, Accuracy: 14.70%
Epoch 4: 4.00% complete, 0 mins, Loss: 2108.23, Num correct: 141, Accuracy: 14.10%
Epoch 5: 5.00% complete, 0 mins, Loss: 1990.63, Num correct: 151, Accuracy: 15.10%
Epoch 6: 6.00% complete, 0 mins, Loss: 1901.46, Num correct: 150, Accuracy: 15.00%
Epoch 7: 7.00% complete, 0 mins, Loss: 1906.38, Num correct: 139, Accuracy: 13.90%
Epoch 8: 8.00% complete, 0 mins, Loss: 1893.03, Num correct: 142, Accuracy: 14.20%
Epoch 9: 9.00% complete, 0 mins, Loss: 1874.03, Num correct: 162, Accuracy: 16.20%
Epoch 10: 10.00% complete, 0 mins, Loss: 1870.01, Num correct: 140, Accuracy: 14.00%
Epoch 11: 11.00% complete, 0 mins, Loss: 1860.80, Num correct: 158, Accuracy: 15.80%

Epoch 98: 98.00% complete, 1 mins, Loss: 25.55, Num correct: 998, Accuracy: 99.80%
Epoch 99: 99.00% complete, 1 mins, Loss: 33.47, Num correct: 994, Accuracy: 99.40%
Duration: 1 mins


In [11]:
# Replace the model's data with the task-1 test split and evaluate it with the
# session trained above.
# NOTE(review): the graph was built from the training split's padded dimensions;
# this presumably relies on the test split having the same longest-sentence /
# longest-input sizes — verify before switching tasks.
model.load_data(is_training=False, task=1)
model.test(sess)

0 mins, Num correct: 973, Accuracy: 97.30%
