# Notebook Memory Networks - Question Answering

### The dataset can be downloaded - https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz

In [102]:
# import the necessary packages

from keras.layers import Input
from keras.layers.core import Dense, Activation, Dropout, Permute
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

import collections
import nltk
import numpy as np
import itertools
import matplotlib.pyplot as plt

%matplotlib inline

### Read the files to create train and test data

In [33]:
TRAIN_FILE = "/Users/tkmacl9/Desktop/FastAIDLCourse/nbs/datasets/FB_BABL_Ques_Answering/tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt"
TEST_FILE = "/Users/tkmacl9/Desktop/FastAIDLCourse/nbs/datasets/FB_BABL_Ques_Answering/tasks_1-20_v1-2/en/qa1_single-supporting-fact_test.txt"

stories = []
ques = []
answers = []
def parseFile(infile):
    story = []
    # This parses each line and creates the stories, questions and answers tuple
    for line in open(infile):
        content = line.split(" ", 1)
#         print(content[1])
        if "\t" in line:
            stor_ques_ans = content[1].split("\t")
            ques.append(stor_ques_ans[0])
            answers.append(stor_ques_ans[1])
            stories.append(story)
            story = []
        else:
            story.append(content[1])
#     print(ques)
    return stories, ques, answers
    
train_data = parseFile(TRAIN_FILE)
test_data = parseFile(TEST_FILE)

In [39]:
# swap the subscript for the train_data and test_data to 0 or 1 or 2 to get the Stories, Questions or Answers

print(train_data[1])
print(" ")
print(test_data[1])

['Where is Mary? ', 'Where is Daniel? ', 'Where is Daniel? ', 'Where is Daniel? ', 'Where is Sandra? ', 'Where is Sandra? ', 'Where is Sandra? ', 'Where is Sandra? ', 'Where is John? ', 'Where is Daniel? ', 'Where is John? ', 'Where is Mary? ', 'Where is John? ', 'Where is John? ', 'Where is Sandra? ', 'Where is Daniel? ', 'Where is Sandra? ', 'Where is Sandra? ', 'Where is Sandra? ', 'Where is Daniel? ', 'Where is John? ', 'Where is Mary? ', 'Where is John? ', 'Where is Sandra? ', 'Where is John? ', 'Where is Daniel? ', 'Where is John? ', 'Where is Daniel? ', 'Where is John? ', 'Where is Daniel? ', 'Where is Mary? ', 'Where is Sandra? ', 'Where is Daniel? ', 'Where is John? ', 'Where is Daniel? ', 'Where is Mary? ', 'Where is John? ', 'Where is Daniel? ', 'Where is John? ', 'Where is Sandra? ', 'Where is Daniel? ', 'Where is Daniel? ', 'Where is Daniel? ', 'Where is Sandra? ', 'Where is Daniel? ', 'Where is Sandra? ', 'Where is Sandra? ', 'Where is John? ', 'Where is Sandra? ', 'Where

### Build Vocab from train and test data

In [80]:
# this method returns the word2index and index2word mappings 
word_counts = collections.Counter()

def buildVocab(train_data, test_data):
    # this loop runs through all the sentences , questions and answers and creates a counter with the counts for 
    # unique words and their counts.
    for stories, questions, answers in [train_data, test_data]:
        for sent in stories:
            for sen in sent:
                for word in nltk.word_tokenize(sen):
                    word_counts[word] += 1
        for ques in questions:
            for word in nltk.word_tokenize(ques):
                word_counts[word] += 1
        for ans in answers:
            for word in nltk.word_tokenize(ans):
                word_counts[word] += 1
#     print(len(word_counts))
#     print(" ")
#     print(word_counts)
    
    # now create the word2index and index2word maps
    i=0
    word2index = {w:i+1 for i,w in enumerate(word_counts.keys())}
    word2index[0] = "UNK"
#     print(" ")
#     print(len(word2index))
#     print(word2index)
    
    index2word = {v:k for k,v in word2index.items()}
#     print(" ")
#     print(len(index2word))
#     print(index2word)
    return word2index, index2word

word2index, index2word = buildVocab(train_data, test_data)
print(" ")
print(word2index)

vocab_size = len(word2index)
        

 
{'to': 1, 0: 'UNK', 'moved': 18, 'kitchen': 8, 'is': 3, '.': 13, 'bedroom': 14, 'Sandra': 16, 'office': 12, 'bathroom': 11, 'back': 19, 'garden': 20, 'John': 4, 'hallway': 5, 'Daniel': 6, '?': 2, 'travelled': 10, 'went': 15, 'Where': 7, 'the': 17, 'journeyed': 9, 'Mary': 21}


### Get Maximum Lengths for the story and Question

In [79]:
# this function finds the maximum length of a sentence (number of words in a sentence or questions)
def getMaXLength(train_data, test_data):
    s_maxlength =0
    q_maxlength=0
    for stories, questions, answers in [train_data, test_data]:
        for sent in stories:
            for sen in sent:
                new_length = len(nltk.word_tokenize(sen))
                if new_length > s_maxlength:
                    s_maxlength = new_length
        for ques in questions:
            new_length = len(nltk.word_tokenize(sen))
            if new_length > q_maxlength:
                q_maxlength = new_length
    
    return s_maxlength, q_maxlength

story_maxlength, ques_maxlength = getMaXLength(train_data, test_data)
print(story_maxlength," ", ques_maxlength)

7   6


### Vectorize the Inputs and Pad the sequences

In [105]:
# Vectorize the inputs by padding the data to the same sizes.
def vectorize(data, word2index, story_maxlength, ques_maxlength):
    Xs, Xq, Y =[], [], []
    k=0
    stories, questions, answers = data
    
#     print(stories)
#     print(" ")
#     print(questions)
#     print(" ")
#     print(answers)
#     print(" ")
    
    for story, ques, ans in zip(stories, questions, answers):
        
        # loop through each of the sentences, questions and snwers and append them to the list
        # their indexes
        xs = [[word2index[w] for w in nltk.word_tokenize(s)] for s in story]
        xs = list(itertools.chain.from_iterable(Xs))
        xq = [word2index[w] for w in nltk.word_tokenize(ques)]
        Xs.append(xs)
        Xq.append(xq)
        Y.append(word2index[ans])
#         Y = np_utils.to_categorical(Y_indexes)
    # pad the sequences for sentences and questions and change answers to categorical values    
    return pad_sequences(Xs, story_maxlength), pad_sequences(Xq, ques_maxlength), np_utils.to_categorical(Y)
    
Xstrain, Xqtrain, Ytrain = vectorize(train_data, word2index, story_maxlength, ques_maxlength)
Xstest, Xqtest, Ytest = vectorize(test_data, word2index, story_maxlength, ques_maxlength)

In [106]:
print(Xstrain.shape)
print(Xqtrain.shape)
print(Ytrain.shape)

print(" ")

print(Xstest.shape)
print(Xqtest.shape)
print(Ytest.shape)

(2000, 7)
(2000, 6)
(2000, 21)
 
(2000, 7)
(2000, 6)
(2000, 21)


### Define the Model Architecture

In [None]:
# check the complex model architecture and define the architecture
# the detailed architecture can be found from the paper - memory networks


### Fit the Model and Plot the Graphs