# Question Answering using GRU with bAbI Dataset

### We are going to implement a question answering algorithm to solve the first 20 tasks in the bAbI dataset. These tasks are a good start towards a more sophisticated automatic text understanding and reasoning.

For more comprehensive information about the dataset, visit https://research.fb.com/downloads/babi/

##### The principal components are an embedding matrix built from GloVe pre-trained word embeddings, a GRU layer that takes a story and its related question concatenated sequentially, and a softmax over all the words in the vocabulary

Let's import the libraries used throughout the notebook

In [None]:
import tensorflow as tf
import numpy as np
import regex as re
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from datetime import datetime
import zipfile
from urllib.request import urlretrieve
import os

In [None]:
def unzip_single_file(zip_file_name, output_file_name):
    '''
    Extract a single member of a zip archive to disk.

    The first archive member whose name contains output_file_name is written
    to the path output_file_name.

    Input:
    zip_file_name: path of the zip archive to read
    output_file_name: path of the file to create; it is also the substring used
                      to select the archive member

    Raises:
    FileNotFoundError: if no archive member name contains output_file_name
                       (no output file is created in that case)
    '''

    # Copy in 1 MiB pieces instead of loading the whole member in memory
    chunk_size = 1 << 20

    # The context manager closes the archive even if the extraction fails
    with zipfile.ZipFile(zip_file_name) as zipped:
        for info in zipped.infolist():
            if output_file_name in info.filename:
                # Open the output only once a match is known, so that a miss
                # does not leave an empty file behind
                with zipped.open(info) as member, open(output_file_name, 'wb') as out_file:
                    for chunk in iter(lambda: member.read(chunk_size), b''):
                        out_file.write(chunk)
                return

    raise FileNotFoundError(
        'No member matching %r found in %r' % (output_file_name, zip_file_name))

In [None]:
# Retrieve and unzip the GloVe word embeddings

glove_zip_file = "glove.6B.zip"
glove_vectors_file = "glove.6B.50d.txt"

# Skip the download / extraction steps whose output is already on disk, so that
# re-running the cell does not fetch the archive again.
# NOTE: an interrupted download leaves a partial zip behind; delete it to retry.
if not os.path.isfile(glove_vectors_file):
    if not os.path.isfile(glove_zip_file):
        urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", glove_zip_file)
    unzip_single_file(glove_zip_file, glove_vectors_file)

The following function creates an embedding matrix for the words found in the dataset

In [None]:
def get_embedding_matrix(emb_file, word_index, embedding_dim, max_words=None):
    '''
    Create an embedding matrix for the words of the dataset.

    Rows of words found in the embedding file hold their pre-trained vector.
    Every other row (including row 0) is drawn from a normal distribution with
    the mean and standard deviation of the loaded pre-trained vectors.

    Side effect: sets the module-level variable nb_words to the number of rows
    of the returned matrix.

    Input:
    emb_file: string containing the path to the embedding file
    word_index: dictionary with words as key and index for the word as value (e.g. {'word1': index1, 'word2':index2 ..})
    embedding_dim: embedding dimension
    max_words: optional upper bound on the number of rows; when None the
               module-level vocabulary_size is used

    Output: embedding matrix of shape (nb_words, embedding_dim). The n° row contains the embedding of the n° index word
    '''
    global nb_words

    # Load the embedding file
    print('Loading embedding file', emb_file, '..')
    embeddings_index = {}
    with open(emb_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # line whose coefficients are not all numeric
                continue
            if coefs.shape != (embedding_dim,):
                # a vector of another size could not be stored in the matrix
                continue
            embeddings_index[values[0]] = coefs

    print('Found %s word vectors.' % len(embeddings_index))

    # Create the embedding matrix needed for our dataset
    limit = vocabulary_size if max_words is None else max_words
    nb_words = min(limit, len(word_index) + 1)
    print('nb_words',nb_words)

    # Initialize the embedding matrix with random values following the
    # statistics of the pre-trained vectors (not of the word indices)
    if embeddings_index:
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean, emb_std = all_embs.mean(), all_embs.std()
    else:
        # nothing usable was loaded: fall back to a standard normal
        emb_mean, emb_std = 0.0, 1.0
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_dim))

    count = 0
    # For every word on the dataset, find the respective pre-trained word vectors.
    for word, i in word_index.items():
        if i >= limit:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            count += 1

    print('Found', count, 'words corrispondances')

    return embedding_matrix

The following methods preprocess the dataset and convert a string to a sequence of integers 

In [None]:
def get_dataset(name):
    '''
    Parse a bAbI task file into a list of question records.

    Each line starts with a line id followed by a space. A line whose id is '1'
    starts a new story, a line without a tab is added to the current story and
    a line with tabs is a question (question, answer and supporting facts
    separated by tabs).

    Input
    name: path of the bAbI task file

    Output: list with one entry per question, each of the form
    [copy of the story lines read so far, question text, question line id, answer, supporting facts]
    '''

    records = []
    facts = []

    with open(name) as task_file:
        for raw_line in task_file:
            line_id, rest = raw_line.split(' ', 1)

            # Line id 1 marks the beginning of a new story
            if line_id == '1':
                facts = [rest]
                continue

            # A plain statement extends the current story
            if '\t' not in rest:
                facts.append(rest)
                continue

            # Question line: snapshot the story so later facts do not alter it
            question, answer, supp_facts = raw_line.split('\t')
            id_and_text = question.split(' ', 1)
            records.append([list(facts), id_and_text[1], id_and_text[0], answer, supp_facts])

    return records
       
def preprocess_dataset(dataset):
    '''
    Turn the parsed bAbI records into lower-cased (story, question, answer) triples.

    The story lines are joined into one string with every newline replaced by a
    space, and commas are removed from the answer.

    Input
    dataset: records with five fields (story lines, question, question id, answer, supporting facts)

    Output: list of (story, question, answer) tuples
    '''

    def _clean(story_lines, question, answer):
        # One string per story, with spaces where the line breaks were
        story_text = ''.join(story_lines).replace('\n', ' ')
        return story_text.lower(), question.lower(), answer.lower().replace(',', '')

    return [_clean(story_lines, question, answer)
            for story_lines, question, _, answer, _ in dataset]

def tokenize(tr_dataset, vl_dataset):
    '''
    Convert the train and validation datasets into sequences of word indices.

    A single tokenizer is fitted on the stories, questions and answers of both
    datasets. Stories and questions are padded to story_max_len and
    question_max_len; answers are converted to indices and squeezed.

    Input:
    tr_dataset: Train dataset. Iterable of (story, question, answer) strings.
    vl_dataset: Validation dataset. Iterable of (story, question, answer) strings.

    Output:
    1) Tokenizer
    2) Sequences of indices corresponding to the train's stories
    3) Sequences of indices corresponding to the train's questions
    4) Indices corresponding to the train's answers
    5) Sequences of indices corresponding to the validation's stories
    6) Sequences of indices corresponding to the validation's questions
    7) Indices corresponding to the validation's answers
    '''

    # Fit the vocabulary on both train & validation set
    tokenizer = Tokenizer(num_words=vocabulary_size, lower=True, filters='.?')
    corpus = [story + question + answer for story, question, answer in tr_dataset]
    corpus += [story + question + answer for story, question, answer in vl_dataset]
    tokenizer.fit_on_texts(corpus)

    def _encode(dataset):
        # Split the triples in columns and convert each column to indices
        stories = [story for story, _, _ in dataset]
        questions = [question for _, question, _ in dataset]
        answers = [answer for _, _, answer in dataset]

        story_ids = pad_sequences(tokenizer.texts_to_sequences(stories), maxlen=story_max_len)
        question_ids = pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=question_max_len)
        answer_ids = np.squeeze(np.array(tokenizer.texts_to_sequences(answers)))
        return story_ids, question_ids, answer_ids

    train_parts = _encode(tr_dataset)
    valid_parts = _encode(vl_dataset)

    return (tokenizer,) + train_parts + valid_parts

Define some useful constants 

In [None]:
vocabulary_size = 2000    # upper bound on the vocabulary (Tokenizer num_words and embedding rows)
story_max_len = 500       # maxlen given to pad_sequences for the stories
question_max_len = 8      # maxlen given to pad_sequences for the questions
EMBEDDING_DIM = 50        # size of a word vector; the GloVe file used is glove.6B.50d.txt
batch_size = 64           # number of examples sampled for every training step

# Model hyperparameters
learning_rate = 0.002     # learning rate given to the Adam optimizer
iterations = 10000        # number of training steps
test_on_valid = 800       # validation loss/accuracy are printed every test_on_valid steps
gru_num_units = 100       # units of the GRU cell used by the model
gru_rev_num_units = 50    # units of the second GRU cell (gru_cell_rev)
# NOTE(review): despite the "drop" names, these two values are passed positionally to
# DropoutWrapper, whose 2nd and 3rd arguments are keep probabilities - confirm against
# the TensorFlow version in use
drop_input_prob = 0.9
drop_output_prob = 0.9
print_summary_step = 8    # training summaries are written every print_summary_step steps
summary_on_valid = 200    # validation summaries are written every summary_on_valid steps

In [None]:
# Acquire, preprocess and tokenize 1-20 bAbI tasks.
train_dataset = []
valid_dataset = []

root = 'tasks_1-20_v1-2/en/'
qa_tasks = ['qa1_single-supporting-fact_{}.txt', 'qa2_two-supporting-facts_{}.txt', 'qa3_three-supporting-facts_{}.txt', 'qa4_two-arg-relations_{}.txt', 'qa5_three-arg-relations_{}.txt', 
            'qa6_yes-no-questions_{}.txt', 'qa7_counting_{}.txt', 'qa8_lists-sets_{}.txt', 'qa9_simple-negation_{}.txt', 'qa10_indefinite-knowledge_{}.txt', 
            'qa11_basic-coreference_{}.txt','qa12_conjunction_{}.txt', 'qa13_compound-coreference_{}.txt', 'qa14_time-reasoning_{}.txt', 'qa15_basic-deduction_{}.txt',
            'qa16_basic-induction_{}.txt', 'qa17_positional-reasoning_{}.txt', 'qa18_size-reasoning_{}.txt', 'qa19_path-finding_{}.txt', 'qa20_agents-motivations_{}.txt']

# Keep the datasets as plain Python lists: every record holds a story list of
# variable length, which does not fit a regular numpy array, and the records
# are only iterated afterwards
for qa_task in qa_tasks:
    train_dataset.extend(get_dataset(root + qa_task.format('train')))
    valid_dataset.extend(get_dataset(root + qa_task.format('test')))

print('len train_dataset:', len(train_dataset))
print('len valid_dataset:', len(valid_dataset))

# Reduce every record to a lower-cased (story, question, answer) triple
train_dataset = preprocess_dataset(train_dataset)
valid_dataset = preprocess_dataset(valid_dataset)

# Tokenize and convert the dataset from a list of string to a list of index sequences
tokenizer, stories_seq, questions_seq, answers_seq, valid_stories_seq, valid_question_seq, valid_answer_seq = tokenize(train_dataset, valid_dataset)

# Create an embedding matrix using glove50 pretrained word embeddings
glove_embedding_matrix = get_embedding_matrix(glove_vectors_file, tokenizer.word_index, EMBEDDING_DIM)

## <center> NETWORK MODEL</center>
<center>![title](images/gru_model.png)</center>

In [None]:
# Tensorboard variables
now = datetime.utcnow().strftime("%Y%m%d-%H-%M-%S")
root_dir = "tf_logs"
# One log directory per run, named after the UTC start time
logdir = "{}/run-{}/".format(root_dir, now)

# Create the graph

tf.reset_default_graph()

# Initialize the cells
# NOTE(review): the keep probabilities are Python constants, so dropout is also
# applied when the validation set is evaluated on this graph
gru_cell = tf.nn.rnn_cell.GRUCell(gru_num_units)
gru_drop = tf.contrib.rnn.DropoutWrapper(gru_cell, drop_input_prob, drop_output_prob)

# Second cell; it is created here but not connected to the model below
gru_cell_rev = tf.nn.rnn_cell.GRUCell(gru_rev_num_units)
gru_drop_rev = tf.contrib.rnn.DropoutWrapper(gru_cell_rev, drop_input_prob, drop_output_prob)

# Create two placeholders for the input
X_story = tf.placeholder(tf.int32, [None, story_max_len], 'X_story')
X_question = tf.placeholder(tf.int32, [None, question_max_len], 'X_question')


with tf.variable_scope('embeddings'):
    # initialize the embedding variable like the matrix previously created
    glove_weights_initializer = tf.constant_initializer(np.array(glove_embedding_matrix))
    # trainable=False: the pre-trained embeddings are not updated by the optimizer
    embedding_weights = tf.get_variable('embedding_weights', shape=(nb_words, EMBEDDING_DIM), initializer=glove_weights_initializer, trainable=False)

    # embeddings lookup for the story
    ## tf.nn.embedding_lookup: lookup for every index in X_story, the respective word embedding in the embedding_weights variable
    embedding_story = tf.nn.embedding_lookup(embedding_weights, X_story)

    # embeddings lookup for the question
    embedding_question = tf.nn.embedding_lookup(embedding_weights, X_question)

    # story followed by the question along the time axis
    emb_qa = tf.concat([embedding_story,embedding_question], axis=1)

with tf.variable_scope('GRU'):
    # GRU layer
    outputs_conc, state_conc = tf.nn.dynamic_rnn(gru_drop, emb_qa, dtype=tf.float32)

    # Last layer: project the final GRU state on the vocabulary.
    # The first dimension of w must match the size of the GRU state.
    attend_init = tf.random_normal_initializer(stddev=0.1)
    w = tf.get_variable("w", [gru_num_units, nb_words], tf.float32, initializer=attend_init)
    b = tf.get_variable("b", [nb_words], tf.float32, initializer=attend_init)
    o = tf.matmul(state_conc, w) + b
    summ_out = tf.summary.histogram('output', o)

# Index of the answer word for every example
y = tf.placeholder(tf.int32, [None], 'y')
y_label = tf.one_hot(y, nb_words)

with tf.variable_scope('loss'):
    # Evaluate the Softmax Cross Entropy loss
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_label, logits=o)
    summ_losses = tf.summary.histogram('losses', loss)
    loss = tf.reduce_mean(loss)
    # The same tensor is logged under two names; which one is written depends
    # on the merged summary that is evaluated (train or validation)
    summ_train_loss = tf.summary.scalar('train_loss', loss)
    summ_valid_loss = tf.summary.scalar('valid_loss', loss)

with tf.variable_scope('train'):
    # Optimizer and training phase
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.variable_scope('correct'):
    # Compute the accuracy
    y_p_labels = tf.squeeze(tf.argmax(y_label, axis=-1))
    o_p_labels = tf.argmax(o, axis=-1)

    # 1 where the predicted word is the expected answer, 0 otherwise
    correct = tf.cast(tf.equal(y_p_labels, o_p_labels), tf.int32)

    summ_correct = tf.summary.histogram('correct', correct)
    summ_valid_correct = tf.summary.scalar('valid_acc', tf.reduce_mean(tf.cast(correct, tf.float32)))

# Created once every variable (including the optimizer ones) exists, so that
# the op really initializes the whole graph
init = tf.global_variables_initializer()

# tensorboard initialization
mrg_train_summary = tf.summary.merge([summ_losses, summ_train_loss, summ_out, summ_correct])
mrg_valid_summary = tf.summary.merge([summ_valid_loss, summ_valid_correct])
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Run the embedding initializer explicitly so the GloVe matrix is loaded
    embedding_weights.initializer.run()
      
    # Shuffle arrays (the same permutation keeps stories, questions and answers aligned)
    shuffled = np.arange(len(stories_seq))
    np.random.shuffle(shuffled)
    stories_seq = stories_seq[shuffled]
    questions_seq = questions_seq[shuffled]
    answers_seq = answers_seq[shuffled]
    
    # Train the model for `iterations` steps, evaluating it periodically
    for it in tqdm(range(0, iterations)):
        # Create the batch: batch_size random indices, drawn with replacement
        batch = np.random.randint(len(stories_seq), size=batch_size)
        stories_batch = stories_seq[batch]
        questions_batch = questions_seq[batch]
        answers_batch = answers_seq[batch]
        
        # Train the network and retrieve the loss
        batch_loss, _ = sess.run([loss, training_op], feed_dict={X_story:stories_batch, X_question:questions_batch, y:answers_batch})
        
        # Update the tensorboard summary
        if it % print_summary_step == 0:
            train_summary = mrg_train_summary.eval(feed_dict={X_story:stories_batch, X_question:questions_batch, y:answers_batch})
            file_writer.add_summary(train_summary, it)

        # Test on the validation set
        if it % test_on_valid == 0:
            valid_losses = []
            acc_values = []
            
            # Compute the accuracy and loss for every tasks (1-20th).
            # The validation set is read in slices of 1000 examples, each slice
            # being reported as one task
            for it_v in range(0,len(valid_stories_seq),1000):
                valid_loss, acc_value = sess.run([loss, correct], feed_dict={X_story:valid_stories_seq[it_v:it_v+1000], X_question:valid_question_seq[it_v:it_v+1000], y:valid_answer_seq[it_v:it_v+1000]})
                print('Task',int((it_v/1000)+1), 'Accuracy:', np.round(np.mean(acc_value)*100,2), '%')
                valid_losses.append(valid_loss)
                acc_values.append(np.mean(acc_value))

            # Print the mean accuracy and loss
            print('Train loss:', batch_loss, '\tValidation loss:', np.round(np.mean(valid_losses), 4), 'Accuracy:', np.round(np.mean(acc_values)*100,2), '%')
            
        # Validation tensorboard summary
        if it % summary_on_valid == 0:
            # Write one summary per slice of 1000 validation examples, all at step `it`
            for it_v in range(0,len(valid_stories_seq),1000):
                valid_summary = mrg_valid_summary.eval(feed_dict={X_story:valid_stories_seq[it_v:it_v+1000], X_question:valid_question_seq[it_v:it_v+1000], y:valid_answer_seq[it_v:it_v+1000]})
                file_writer.add_summary(valid_summary, it)

    # The writer buffers events: flush them so the last summaries reach the
    # disk (flush rather than close, so the writer stays usable afterwards)
    file_writer.flush()

| Task | bAbI LSTM Baseline | Our Accuracy   |
|------|------|------|
|   1  | 50 % | 50 %     |
|   2  | 20 % | 40 % |
|   3  | 20 % | 38 % |
|   4  | 61 % | 68 % |
|   5  | 70 %  | 66 % |
|   6  | 48 %  | 72 % |
|   7  | 49 % | 75 % |
|   8  | 45 % | 66 % |
|   9  | 64 % | 73 % |
|   10  | 44 % | 63 % |
|   11  | 72 % | 71 % |
|   12  | 74 % | 66 % |
|   13  | 94 % | 93 % |
|   14  | 27 % | 40 % |
|   15  | 21 % | 49 % |
|   16  | 23 % | 47 % |
|   17  | 51 % | 61 % |
|   18  | 52 % | 91 % |
|   19  | 8 % | 9 % |
|   20  | 91 % | 94 % |
|  Mean | 49 % | 62 % |

## VALIDATION ACCURACY
![title](images/valid_acc1.png)
## VALIDATION LOSS
![title](images/valid_loss1.png)
## TRAIN LOSS
![title](images/train_loss1.png)

In [None]:
#C:\Users\Andrea\Jupyter notebook\NLP\Babl
#tensorboard --logdir tf_logs