In [1]:
import pickle
import numpy as np

In [2]:
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [3]:
with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [4]:
type(test_data)

list

In [5]:
type(train_data)

list

In [6]:
len(train_data)

10000

In [7]:
len(test_data)

1000

In [8]:
train_data[0] # train data sentences followed with a questin and an answer

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [9]:
# story
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [10]:
# question
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [11]:
# answer
train_data[0][2]

'no'

In [12]:
# create a vocabulary
all_data = test_data + train_data

In [13]:
len(all_data)

11000

In [14]:
set(train_data[0][0]) # turn list into a set

{'.',
 'Mary',
 'Sandra',
 'bathroom',
 'bedroom',
 'journeyed',
 'moved',
 'the',
 'to'}

In [15]:
vocab = set()

for story, question, answer in all_data:
    # unite all sets into one set (it organize values and remove repeated ones)
    # https://www.programiz.com/python-programming/methods/set/union
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [16]:
vocab.add('no')

In [17]:
vocab.add('yes')

In [18]:
# set of all the possible vocab words
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [19]:
vocab_len = len(vocab) + 1 # (+ 1 is the placeholder that we will use later on, by deffect is 0)

In [20]:
vocab_len

38

In [21]:
# LONGEST STORY
# the all stories length
all_story_lens = [len(data[0]) for data in all_data]

In [22]:
max_story_len = max(all_story_lens) # get the max length

In [23]:
# LONGEST QUESTION
# get max question length too
all_question_lens = [len(data[1]) for data in all_data]
max_question_len = max(all_question_lens)

In [24]:
max_question_len

6

In [25]:
# part 2: Vectorizing stories
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [26]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [27]:
tokenizer.word_index

{'daniel': 1,
 'got': 2,
 'journeyed': 3,
 'took': 4,
 'garden': 5,
 'yes': 6,
 'to': 7,
 'bedroom': 8,
 '.': 9,
 'there': 10,
 'the': 11,
 '?': 12,
 'milk': 13,
 'apple': 14,
 'mary': 15,
 'in': 16,
 'back': 17,
 'dropped': 18,
 'john': 19,
 'sandra': 20,
 'put': 21,
 'went': 22,
 'down': 23,
 'hallway': 24,
 'left': 25,
 'no': 26,
 'grabbed': 27,
 'football': 28,
 'office': 29,
 'is': 30,
 'travelled': 31,
 'up': 32,
 'picked': 33,
 'discarded': 34,
 'bathroom': 35,
 'kitchen': 36,
 'moved': 37}

In [28]:
# tokenization for stories, questions and answers
train_story_text = []
train_question_text = []
train_answers = []

In [29]:
# separate each of them
for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [30]:
# turning each text's word into its matching word sequence (check tokenizer.word_index)
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [31]:
len(train_story_seq)

10000

In [32]:
len(train_story_text)

10000

In [33]:
train_story_seq # story vector

[[15, 37, 7, 11, 35, 9, 20, 3, 7, 11, 8, 9],
 [15,
  37,
  7,
  11,
  35,
  9,
  20,
  3,
  7,
  11,
  8,
  9,
  15,
  22,
  17,
  7,
  11,
  8,
  9,
  1,
  22,
  17,
  7,
  11,
  24,
  9],
 [15,
  37,
  7,
  11,
  35,
  9,
  20,
  3,
  7,
  11,
  8,
  9,
  15,
  22,
  17,
  7,
  11,
  8,
  9,
  1,
  22,
  17,
  7,
  11,
  24,
  9,
  20,
  22,
  7,
  11,
  36,
  9,
  1,
  22,
  17,
  7,
  11,
  35,
  9],
 [15,
  37,
  7,
  11,
  35,
  9,
  20,
  3,
  7,
  11,
  8,
  9,
  15,
  22,
  17,
  7,
  11,
  8,
  9,
  1,
  22,
  17,
  7,
  11,
  24,
  9,
  20,
  22,
  7,
  11,
  36,
  9,
  1,
  22,
  17,
  7,
  11,
  35,
  9,
  1,
  33,
  32,
  11,
  28,
  10,
  9,
  1,
  22,
  7,
  11,
  8,
  9],
 [15,
  37,
  7,
  11,
  35,
  9,
  20,
  3,
  7,
  11,
  8,
  9,
  15,
  22,
  17,
  7,
  11,
  8,
  9,
  1,
  22,
  17,
  7,
  11,
  24,
  9,
  20,
  22,
  7,
  11,
  36,
  9,
  1,
  22,
  17,
  7,
  11,
  35,
  9,
  1,
  33,
  32,
  11,
  28,
  10,
  9,
  1,
  22,
  7,
  11,
  8,
  9,
  19,
  31,
 

In [34]:
# create method to vectorize stories, questions and answers (same steps as before)
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    
    # STORIES = X
    X = []
    # QUESTIONS Xq
    Xq = []
    # Y CORRECT ANSWER (yes/no)
    Y = []
    
    for story, query, answer in data:
        
        # for each story
        # [23, 14, ...]
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        
        y = np.zeros(len(word_index)+1)
        
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    
    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [35]:
# now we have our data formatted (Vectorized) and we can use it to create our models
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [36]:
inputs_test, queries_tests, answers_test = vectorize_stories(test_data)

In [37]:
inputs_test

array([[ 0,  0,  0, ..., 11,  8,  9],
       [ 0,  0,  0, ..., 11,  5,  9],
       [ 0,  0,  0, ..., 11,  5,  9],
       ...,
       [ 0,  0,  0, ..., 11, 14,  9],
       [ 0,  0,  0, ..., 11,  5,  9],
       [ 0,  0,  0, ..., 14, 10,  9]], dtype=int32)

In [38]:
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
tokenizer.word_index['yes'] # position of the answer yes

6

In [40]:
tokenizer.word_index['no'] # position of the answer no

26

In [41]:
sum(answers_test) # here you can see the two previous indexes

array([  0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [42]:
sum(answers_test)[7]

0.0

In [43]:
sum(answers_test)[13]

0.0

In [44]:
# part 3: Build the Neural Network
    # Input Encoder M
    # Input Encoder C
    # Question Encoder
# Complete the Network

In [45]:
 from keras.models import Sequential,Model

In [46]:
from keras.layers.embeddings import Embedding

In [47]:
from keras.layers import Input,Activation,Dense,Permute,Dropout,add,dot,concatenate,LSTM

In [48]:
# PLACEHOLDERS shape=(max_story_len, batch_size): They will receive input later based on stories and questinos
# We will pass this to our encoders
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [49]:
# Create input encoders
# 1. Define vocabulary size (voca_len)
vocab_size = len(vocab) + 1

In [50]:
# INPUT ENCODER M: Get embedded to a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
# when bigger is the value, longer is the training
input_encoder_m.add(Dropout(0.5)) # 50% of the neurons will be turned off, that helps with over feeding

# OUTPUT
# (samples, story_maxlen, embedding_dim)

In [51]:
# INPUT ENCODER C
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.5))

# OUTPUT
# (samples, story_maxlen, max_question_len)

In [52]:
# QUESTION ENCODER
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,output_dim=64,input_length=max_question_len))
question_encoder.add(Dropout(0.3))

# OUTPUT
# (samples, query_maxlen, embedding_dim)

In [53]:
#  ENCODED <--- ENCODER(INPUT)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [54]:
# Use a dot product to compute the match between the first input vector sequence and the query

In [55]:
# dot product computed match for the first input vector sequence and the query
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)

In [58]:
# add this match matrix with the second input vector sequence
response = add([match, input_encoded_c])
response = Permute((2,1))(response) # to have an output of examples by querie max len, by story max len

In [59]:
# concatenate match matrix with the question vector sequence (Concatenate response with question)
answer = concatenate([response, question_encoded])

In [60]:
answer # The None is the batch_size that we did not define yet

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [61]:
# BUILD OUR MODEL: Reduce our answer with our current Neural Network

In [62]:
answer = LSTM(32)(answer)

In [63]:
answer = Dropout(0.5)(answer)
# this outputs something in the form of samples by the vocab size 
# we should only see a marking for YES or NO, everything else is just a bunch of 0.
# Just as we did when we vectorized the answers
answer = Dense(vocab_size)(answer) # (samples, vocab_size) # YES/NO 0000

In [64]:
# output the probability distribution over the vocabulary
# turn YES and NO into 0 and 1
answer = Activation('softmax')(answer)

In [65]:
# build the final model: Passing the PLACEHOLDERS
model = Model([input_sequence, question], answer)

In [66]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [67]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 156)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, None, 64)     2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
______________________________________________________________________________________________