<a href="https://colab.research.google.com/github/anirbanghoshsbi/others/blob/master/Question_Anwering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from keras.layers import Input # for function API
from keras.layers.core import Activation , Dense , Dropout , Permute
from keras.layers.embeddings import Embedding
from keras.layers.merge import add , concatenate,dot
from keras.layers.recurrent import LSTM
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
import collections
import itertools
import nltk
import numpy as np
import os
import matplotlib.pyplot as plt

In [84]:
!ls

qa1_single-supporting-fact_test.txt   sample_data
qa1_single-supporting-fact_train.txt


In [0]:
TRAIN_FILE = "qa1_single-supporting-fact_train.txt"
TEST_FILE = "qa1_single-supporting-fact_test.txt"

def get_data(infile):
    stories, questions, answers = [], [], []
    story_text = []
    fin = open(TRAIN_FILE, "rb")
    for line in fin:
        line = line.decode("utf-8").strip()
        
        lno, text = line.split(" ", 1)
        #print(text)
        #story_text.append(text)
        if "\t" in text:
            question, answer,_= text.split("\t")
            stories.append(story_text)
            questions.append(question)
            answers.append(answer)
            story_text = []
        else:
            story_text.append(text)
    fin.close()
    return stories, questions, answers

data_train = get_data(TRAIN_FILE)
data_test = get_data(TEST_FILE)

In [86]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories,_,_ in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
    for _,questions,_ in  [train_data, test_data]:                 
                for question in questions:
                    for word in nltk.word_tokenize(question):
                         counter[word.lower()] += 1
    for _,_,answers in  [train_data, test_data]:            
                for answer in answers:
                    for word in nltk.word_tokenize(answer):
                         counter[word.lower()] += 1
    word2idx = {w:(i+1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word

word2idx, idx2word = build_vocab(data_train, data_test)

vocab_size = len(word2idx)

In [88]:
vocab_size

22

In [0]:
def get_maxlens(train_data, test_data):
    story_maxlen, question_maxlen = 0, 0
    for stories,_, _ in [train_data, test_data]:
        for story in stories:
            story_len = 0
            for sent in story:
                swords = nltk.word_tokenize(sent)
                story_len += len(swords)
            if story_len > story_maxlen:
                story_maxlen = story_len
    for _,questions,_ in [train_data, test_data]:           
        for question in questions:
            question_len = len(nltk.word_tokenize(question))
            if question_len > question_maxlen:
                question_maxlen = question_len
    return story_maxlen, question_maxlen

story_maxlen, question_maxlen = get_maxlens(data_train, data_test)

In [90]:
data_train

([['Anirban moved to the bathroom.', 'John went to the hallway.'],
  ['Daniel went back to the hallway.', 'Sandra moved to the garden.'],
  ['John moved to the office.', 'Sandra journeyed to the bathroom.'],
  ['Anirban moved to the hallway.', 'Daniel travelled to the office.'],
  ['John went back to the garden.', 'John moved to the bedroom.'],
  ['Sandra travelled to the office.', 'Sandra went to the bathroom.'],
  ['Anirban went to the bedroom.', 'Daniel moved to the hallway.'],
  ['John went to the garden.', 'John travelled to the office.'],
  ['Daniel journeyed to the bedroom.', 'Daniel travelled to the hallway.'],
  ['John went to the bedroom.', 'John travelled to the office.'],
  ['Anirban went to the bedroom.', 'John journeyed to the bathroom.'],
  ['Sandra journeyed to the hallway.', 'John journeyed to the garden.'],
  ['John journeyed to the bathroom.', 'Sandra journeyed to the garden.'],
  ['Sandra went back to the bedroom.', 'Daniel travelled to the bathroom.'],
  ['John wen

In [0]:
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    Xs, Xq, Y = [], [], []
    stories,questions,answers= data
    for story, question, answer in zip(stories, questions, answers):
        xs = [[word2idx[w.lower()] for w in nltk.word_tokenize(s)] 
                   for s in story]
        xs = list(itertools.chain.from_iterable(xs))
        xq = [word2idx[w.lower()] for w in nltk.word_tokenize(question)]
        Xs.append(xs)
        Xq.append(xq)
        Y.append(word2idx[answer.lower()])
    return pad_sequences(Xs, maxlen=story_maxlen),pad_sequences(Xq, maxlen=question_maxlen),np_utils.to_categorical(Y, num_classes=len(word2idx))

Xstrain, Xqtrain, Ytrain = vectorize(data_train, word2idx, story_maxlen, question_maxlen)
Xstest, Xqtest, Ytest = vectorize(data_test, word2idx, story_maxlen, question_maxlen)

In [92]:
EMBEDDING_SIZE = 64
LATENT_SIZE = 32

# inputs
story_input = Input(shape=(story_maxlen,))
question_input = Input(shape=(question_maxlen,))

# story encoder memory
story_encoder = Embedding(input_dim=vocab_size,
output_dim=EMBEDDING_SIZE,
    input_length=story_maxlen)(story_input)
story_encoder = Dropout(0.3)(story_encoder)

# question encoder
question_encoder = Embedding(input_dim=vocab_size,
output_dim=EMBEDDING_SIZE,
    input_length=question_maxlen)(question_input)
question_encoder = Dropout(0.3)(question_encoder)

# match between story and question
match = dot([story_encoder, question_encoder], axes=[2, 2])

# encode story into vector space of question
story_encoder_c = Embedding(input_dim=vocab_size,
output_dim=question_maxlen,
    input_length=story_maxlen)(story_input)
story_encoder_c = Dropout(0.3)(story_encoder_c)

# combine match and story vectors
response = add([match, story_encoder_c])
response = Permute((2, 1))(response)

# combine response and question vectors
answer = concatenate([response, question_encoder], axis=-1)
answer = LSTM(LATENT_SIZE)(answer)
answer = Dropout(0.3)(answer)
answer = Dense(vocab_size)(answer)
output = Activation("softmax")(answer)

model = Model(inputs=[story_input, question_input], outputs=output)
model.summary()
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 14)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 4)            0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 14, 64)       1408        input_5[0][0]                    
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 4, 64)        1408        input_6[0][0]                    
__________________________________________________________________________________________________
dropout_9 

In [93]:
BATCH_SIZE = 32
NUM_EPOCHS = 50
history = model.fit([Xstrain, Xqtrain], [Ytrain], batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,validation_data=([Xstest, Xqtest], [Ytest]))

Train on 10000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [94]:
ytest = np.argmax(Ytest, axis=1)
Ytest_ = model.predict([Xstest, Xqtest])
ytest_ = np.argmax(Ytest_, axis=1)

for i in range(15):
    story = " ".join([idx2word[x] for x in Xstest[i].tolist() if x != 0])
    question = " ".join([idx2word[x] for x in Xqtest[i].tolist()])
    label = idx2word[ytest[i]]
    prediction = idx2word[ytest_[i]]
    print(story, question,'\nLABEL', label, '\nMODEL PREDICTION', prediction)

anirban moved to the bathroom . john went to the hallway . where is anirban ? 
LABEL bathroom 
MODEL PREDICTION bathroom
daniel went back to the hallway . sandra moved to the garden . where is daniel ? 
LABEL hallway 
MODEL PREDICTION hallway
john moved to the office . sandra journeyed to the bathroom . where is daniel ? 
LABEL hallway 
MODEL PREDICTION bedroom
anirban moved to the hallway . daniel travelled to the office . where is daniel ? 
LABEL office 
MODEL PREDICTION office
john went back to the garden . john moved to the bedroom . where is sandra ? 
LABEL bathroom 
MODEL PREDICTION office
sandra travelled to the office . sandra went to the bathroom . where is sandra ? 
LABEL bathroom 
MODEL PREDICTION bathroom
anirban went to the bedroom . daniel moved to the hallway . where is sandra ? 
LABEL bathroom 
MODEL PREDICTION office
john went to the garden . john travelled to the office . where is sandra ? 
LABEL bathroom 
MODEL PREDICTION garden
daniel journeyed to the bedroom . dani

In [82]:
#data1=[['Mary moved to the office'],['Where is Mary?'],['office']]
#Xstrain, Xqtrain, Ytrain = vectorize(data_train, word2idx, story_maxlen, question_maxlen)
Xstest, Xqtest, Ytest = vectorize(data1, word2idx, story_maxlen, question_maxlen)

KeyError: ignored