In [None]:
import pickle
import numpy as np
from keras.models import Sequential,Model
from tensorflow import keras
from keras.layers import Embedding
from keras.layers import Input,Activation,Dense, Permute, Dropout, add, dot, concatenate, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [None]:
# Load the pickled train/test splits.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('train_qa.txt', mode='rb') as f:
    train_data = pickle.load(f)

with open('test_qa.txt', mode='rb') as f:
    test_data = pickle.load(f)

In [None]:
# Inspect the container type of the loaded test split.
type(test_data)

In [None]:
# Inspect the container type of the loaded training split.
type(train_data)

In [None]:
# Number of samples in the test split.
len(test_data)

In [None]:
# Number of samples in the training split.
len(train_data)

In [None]:
# Peek at the first training sample; samples are unpacked below as (story, question, answer).
train_data[0]

In [None]:
# Second field of the first sample (the question, per the unpacking order used below).
train_data[0][1]

In [None]:
# First field of the first sample (the story, per the unpacking order used below).
train_data[0][0]

In [None]:
# Third field of the first sample (the answer, per the unpacking order used below).
train_data[0][2]

In [None]:
# Concatenate both splits so the vocabulary and maximum lengths below cover every sample.
all_data = test_data + train_data

In [None]:
# Total number of samples across both splits.
len(all_data)

In [None]:
# Collect every distinct token that appears in any story or question.
# update() grows the set in place; the previous union() calls rebuilt a
# brand-new set twice per sample, copying the whole vocabulary each time.
vocab = set()
for story, question, answer in all_data:
    vocab.update(story, question)

In [None]:
# Make sure the answer label 'no' is in the vocabulary (answers are not scanned above).
vocab.add('no')

In [None]:
# Make sure the answer label 'yes' is in the vocabulary (answers are not scanned above).
vocab.add('yes')

In [None]:
# One slot more than the number of tokens — presumably to leave room for index 0,
# which Keras' Tokenizer reserves and pad_sequences uses as the padding value.
vocab_len = len(vocab) + 1

In [None]:
# Display the vocabulary size (including the extra slot).
vocab_len

In [None]:
# Token count of every story across both splits.
all_story_lens = [len(story) for story, _question, _answer in all_data]

In [None]:
# Longest story length; used later as the padded story length.
max_story_len = max(all_story_lens)

In [None]:
# Display the longest story length.
max_story_len

In [None]:
# Longest question length; used later as the padded question length.
max_question_len = max(len(question) for _story, question, _answer in all_data)

In [None]:
# Display the longest question length.
max_question_len

In [None]:
# Build the word -> integer index mapping.
# filters='' (the documented string type) disables character filtering so
# punctuation tokens such as '.' and '?' are kept.
tokenizer = Tokenizer(filters='')
# Fit on a sorted list rather than the raw set: set iteration order changes
# between interpreter runs, and every token has the same count, so fitting on
# the set would assign different indices on each kernel restart.
tokenizer.fit_on_texts(sorted(vocab))

In [None]:
# Display the fitted word -> index mapping.
tokenizer.word_index

In [None]:
# Per-field accumulators for the training split (filled by the loop in the next cell).
train_story_text = []
train_question_text = []
train_answers = []

In [None]:
# Split the training samples into parallel lists of stories, questions and answers.
# The lists are rebuilt from scratch here so that re-running this cell does not
# append a second copy of the data onto lists created in an earlier cell.
train_story_text = [story for story, _question, _answer in train_data]
train_question_text = [question for _story, question, _answer in train_data]
train_answers = [answer for _story, _question, answer in train_data]

In [None]:
# Convert each training story into its sequence of word indices.
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [None]:
# Number of encoded training stories.
len(train_story_seq)

In [None]:
def vectorize_stories(data,word_index=tokenizer.word_index,max_story_len=max_story_len,max_question_len=max_question_len):
    """Turn (story, query, answer) samples into padded index arrays and one-hot answers.

    Args:
        data: iterable of (story, query, answer) tuples, where story and query
            are sequences of word strings and answer is a single word string.
        word_index: mapping from lowercase word to integer index. The default
            is captured from the global tokenizer when this cell is run.
        max_story_len: length every story sequence is padded/truncated to.
        max_question_len: length every query sequence is padded/truncated to.

    Returns:
        A tuple (stories, queries, answers): the first two are the results of
        pad_sequences on the encoded stories and queries; the third is an array
        of one-hot vectors of size len(word_index) + 1, one per sample.

    Raises:
        KeyError: if a word or answer (after lowercasing) is not in word_index.
    """
    # One slot more than the number of words, so that index len(word_index) is addressable.
    answer_dim = len(word_index) + 1

    X = []   # encoded stories
    Xq = []  # encoded queries
    Y = []   # one-hot encoded answers

    for story, query, answer in data:
        X.append([word_index[word.lower()] for word in story])
        Xq.append([word_index[word.lower()] for word in query])

        y = np.zeros(answer_dim)
        # Lowercase the answer as well, matching how story/query words are looked up;
        # otherwise a capitalised answer would raise KeyError.
        y[word_index[answer.lower()]] = 1
        Y.append(y)

    return (pad_sequences(X, maxlen=max_story_len),
            pad_sequences(Xq, maxlen=max_question_len),
            np.array(Y))

In [None]:
# Encode the training split into padded stories, padded queries and one-hot answers.
inputs_train, queries_train, answer_train = vectorize_stories(train_data)

In [None]:
# Encode the test split the same way (note the singular name `input_test`).
input_test, queries_test, answer_test = vectorize_stories(test_data)

In [None]:
# Display the padded training stories.
inputs_train

In [None]:
# Display the padded test stories.
input_test

In [None]:
# Display the padded training queries.
queries_train

In [None]:
# Display the padded test queries.
queries_test

In [None]:
# Display the one-hot encoded training answers.
answer_train

In [None]:
# Index assigned to the answer word 'yes'.
tokenizer.word_index['yes']

In [None]:
# Index assigned to the answer word 'no'.
tokenizer.word_index['no']

In [None]:
# Symbolic model inputs: one padded story and one padded question per sample.
input_sequence = Input(shape=(max_story_len,))
question = Input(shape=(max_question_len,))

In [None]:
# Embedding/output dimension: number of tokens plus one extra slot
# (same value as vocab_len computed earlier).
vocab_size=len(vocab)+1

In [None]:
# Story encoder "m": embeds each story token into 64 dimensions, then applies dropout.
input_encoder_m = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Dropout(0.3),
])

In [None]:
# Story encoder "c": embeds each story token into max_question_len dimensions so its
# output can later be added element-wise to the story/question match matrix.
input_encoder_c = Sequential([
    Embedding(input_dim=vocab_size, output_dim=max_question_len),
    Dropout(0.3),
])

In [None]:
# Question encoder: embeds each question token into 64 dimensions, then applies dropout.
question_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len),
    Dropout(0.3),
])

In [None]:
# Apply the encoders: the story goes through both story encoders, the question through its own.
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [None]:
# Similarity between every story position and every question position (dot product over
# the embedding axis of both tensors), normalised with a softmax.
match_scores = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match_scores)

In [None]:
# Add the match weights to the second story encoding, then swap the last two axes so the
# result can be concatenated with the encoded question.
match_plus_story = add([match, input_encoded_c])
response = Permute((2, 1))(match_plus_story)

In [None]:
# Join the response with the encoded question (concatenate's default axis).
answer = concatenate([response,question_encoded])

In [None]:
# Reduce the combined sequence to a single 32-unit vector.
# NOTE: this cell rebinds `answer`; re-running it without re-running the previous cell stacks another LSTM.
answer = LSTM(32)(answer)

In [None]:
# Regularise, then project to one score per vocabulary index.
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)

In [None]:
# Convert the scores into a probability distribution over the vocabulary.
answer = Activation('softmax')(answer)

In [None]:
# Assemble the model: inputs are (story, question), output is the answer distribution.
model = Model([input_sequence,question], answer)

In [None]:
# Configure training: one-hot targets, so categorical cross-entropy; track accuracy.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer architecture.
model.summary()

In [None]:
# Train for 25 epochs, using the test split as validation data.
history = model.fit(
    x=[inputs_train, queries_train],
    y=answer_train,
    batch_size=32,
    epochs=25,
    validation_data=([input_test, queries_test], answer_test),
)

In [None]:
import matplotlib.pyplot as plt
print(history.history.keys())

# Plot training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')

# Save BEFORE plt.show(): once the figure has been shown it may be closed, so
# a later pyplot savefig() would write a blank image.
fig.savefig('screenshot.png', bbox_inches='tight')
plt.show()

In [None]:
# Predicted answer distributions for every test sample.
pred_results = model.predict(([input_test,queries_test]))

In [None]:
# Story of the first test sample.
test_data[0][0]

In [None]:
# Question of the first test sample.
test_data[0][1]

In [None]:
# Vocabulary index with the highest predicted probability for the first test sample.
val_max = np.argmax(pred_results[0])

In [None]:
# Map the predicted index back to its word.
# `k` is None when no word has that index (e.g. index 0, which the tokenizer never
# assigns); the previous loop left `k` undefined or stale in that case.
k = next(
    (word for word, index in tokenizer.word_index.items() if index == val_max),
    None,
)

In [None]:
# Display the predicted answer word for the first test sample.
k