In [1]:
import pickle
import numpy as np

In [2]:
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [3]:
len(train_data)

10000

In [4]:
print(len(train_data[1])) # each training data consist of a Story Question Answer therefore length of every example is 3
print(train_data[1])

3
(['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Sandra', 'journeyed', 'to', 'the', 'bedroom', '.', 'Mary', 'went', 'back', 'to', 'the', 'bedroom', '.', 'Daniel', 'went', 'back', 'to', 'the', 'hallway', '.'], ['Is', 'Daniel', 'in', 'the', 'bathroom', '?'], 'no')


In [5]:
a=' '.join(train_data[2][0])
a

'Mary moved to the bathroom . Sandra journeyed to the bedroom . Mary went back to the bedroom . Daniel went back to the hallway . Sandra went to the kitchen . Daniel went back to the bathroom .'

In [6]:
' '.join(train_data[2][1])

'Is Daniel in the office ?'

In [7]:
train_data[2][2]

'no'

In [8]:
vocab = set()
for story, question, answer in train_data:
    vocab = vocab.union(set(story)) #Set returns unique words in the sentence
                                    #Union returns the unique common elements from a two sets
    vocab = vocab.union(set(question))

In [9]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went'}

In [10]:
vocab.add('yes')
vocab.add('no')

In [100]:
vocab_len=len(vocab)+1

In [101]:
vocab_len

38

In [12]:
li_story_len = []
li_ques_len = []
for i in range(len(train_data)):
    
    story_len = 0
    story_len = len(train_data[i][0])
    li_story_len.append(story_len)
    
    ques_len = 0
    ques_len = len(train_data[i][1])
    li_ques_len.append(ques_len)

In [13]:
max_story_len = max(li_story_len)
max_ques_len = max(li_ques_len)
print("max story len = ",max_story_len)
print("max ques len = ",max_ques_len)

max story len =  156
max ques len =  6


In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [102]:
tokenizer=Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)
word_index=tokenizer.word_index
print(word_index)

{'mary': 1, 'office': 2, 'the': 3, 'bathroom': 4, 'left': 5, 'picked': 6, 'took': 7, 'journeyed': 8, 'yes': 9, 'bedroom': 10, 'to': 11, 'hallway': 12, 'is': 13, 'there': 14, 'john': 15, 'put': 16, 'up': 17, 'dropped': 18, 'no': 19, 'down': 20, 'milk': 21, 'daniel': 22, 'travelled': 23, '?': 24, 'discarded': 25, 'sandra': 26, 'in': 27, '.': 28, 'football': 29, 'garden': 30, 'apple': 31, 'kitchen': 32, 'moved': 33, 'got': 34, 'grabbed': 35, 'back': 36, 'went': 37}


In [214]:
stories=[]
questions=[]
answers=[]
for i in range(len(train_data)):
    
    story_sequences = tokenizer.texts_to_sequences(train_data[i][0])
    story=[]
    for ele in story_sequences:
        story.extend(ele)
    stories.append(story)
    
    ques_sequences = tokenizer.texts_to_sequences(train_data[i][1])
    ques=[]
    for ele in ques_sequences:
        ques.extend(ele)
    questions.append(ques)
    
    ans = np.zeros(len(word_index)+1) #Index 0 Reserved when padding the sequences
    ans[word_index[train_data[i][2]]] = 1
    answers.append(ans)

X_story = pad_sequences(stories, maxlen=max_story_len)
X_ques = pad_sequences(questions, maxlen=max_ques_len)

In [215]:
X_ans=np.asarray(answers)
print(X_ans.shape)

(10000, 38)


In [212]:
print(X_story.shape)
print(X_ques.shape)
print(X_ans.shape)

(10000, 156)
(10000, 6)
(10000, 38)


In [21]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

Using TensorFlow backend.


In [230]:
def model(max_story_len,max_ques_len,vocab_len):
    
    story = Input((max_story_len,))
    ques = Input((max_ques_len,))
    
    input_encoder_m = Sequential()
    input_encoder_m.add(Embedding(input_dim=vocab_len, output_dim=64))
    input_encoder_m.add(Dropout(0.3))
    
    input_encoder_c = Sequential()
    input_encoder_c.add(Embedding(input_dim=vocab_len, output_dim=max_ques_len))
    input_encoder_c.add(Dropout(0.3))
    
    ques_encoder = Sequential()
    ques_encoder.add(Embedding(input_dim=vocab_len, output_dim=64, input_length=max_ques_len))
    ques_encoder.add(Dropout(0.3))
    
    input_encoded_m = input_encoder_m(story)
    input_encoded_c = input_encoder_c(story)
    ques_encoded = ques_encoder(ques)
    
    match=dot([input_encoded_m,ques_encoded],axes=(2,2))
    match=Activation('softmax')(match)
    
    response = add([match,input_encoded_c])
    response = Permute((2,1))(response)
    
    answer = concatenate([response,ques_encoded])
    answer = LSTM(32)(answer)
    answer = Dropout(0.5)(answer)
    answer = Dense(vocab_len)(answer)
    answer = Activation('softmax')(answer)
    
    model = Model([story,ques],answer)
    
    return model

In [231]:
model = model(max_story_len, max_ques_len,vocab_len)

In [232]:
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 156)          0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_26 (Sequential)      multiple             2432        input_19[0][0]                   
__________________________________________________________________________________________________
sequential_28 (Sequential)      (None, 6, 64)        2432        input_20[0][0]                   
____________________________________________________________________________________________

In [233]:
model.compile(optimizer='rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [234]:
model.fit([X_story,X_ques],X_ans, batch_size = 32, epochs = 100, validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1e8aa7fe688>

In [235]:
loss_train,acc_train = model.evaluate([X_story,X_ques],X_ans)



In [238]:
print(loss_train)
print(acc_train)

0.20182947977781296
0.9239000082015991


In [330]:
predictions = model.predict([X_story, X_ques])

In [414]:
random_story='sandra is in bathroom . sandra is in my bedroom . '
random_ques='is sandra in my bedroom ?'
data=[random_story.split(),random_ques.split()]

In [415]:
stories=[]
questions=[]
answers=[]
story_sequences = tokenizer.texts_to_sequences(data[0])
story=[]
for ele in story_sequences:
    story.extend(ele)
stories.append(story)

ques_sequences = tokenizer.texts_to_sequences(data[1])
ques=[]
for ele in ques_sequences:
    ques.extend(ele)
questions.append(ques)

X_random_story = pad_sequences(stories, maxlen=max_story_len)
X_random_ques = pad_sequences(questions, maxlen=max_ques_len)

In [416]:
X_random_story.shape

(1, 156)

In [417]:
predictions_random = model.predict([X_random_story, X_random_ques])

In [418]:
predictions_random.shape

(1, 38)

In [419]:
predictions_random

array([[1.0145969e-12, 1.1770227e-12, 1.0679633e-12, 1.1394003e-12,
        1.1369321e-12, 1.1042479e-12, 1.1630371e-12, 1.1195896e-12,
        1.2375035e-12, 9.5696414e-01, 1.0485562e-12, 1.1885166e-12,
        1.0977924e-12, 1.2219249e-12, 1.1198971e-12, 1.1298781e-12,
        1.0074755e-12, 1.1843144e-12, 1.0360477e-12, 4.3035887e-02,
        1.0541169e-12, 1.2497096e-12, 1.2232845e-12, 1.2250426e-12,
        1.1373984e-12, 1.0906401e-12, 1.1104786e-12, 1.1363511e-12,
        1.0392182e-12, 1.0612826e-12, 1.1941816e-12, 1.2439973e-12,
        1.0503338e-12, 1.1493294e-12, 1.1664249e-12, 1.2374304e-12,
        9.9944456e-13, 1.0474629e-12]], dtype=float32)

In [420]:
for key,value in word_index.items() :
    if word_index[key]==np.argmax(predictions_random):
        print(key)

yes
