In [2]:
from __future__ import print_function
from functools import reduce
import re
import json
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
import nltk
from nltk.corpus import wordnet as wn
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

docs_corpus = []
with open("/Users/shimei/Documents/2018/Web_Search/homework/Final_Assignment/project_files/documents.json",'r') as docs:
    docs_corpus = json.load(docs)
            
train_corpus = []             
with open("/Users/shimei/Documents/2018/Web_Search/homework/Final_Assignment/project_files/training.json",'r') as training:
    train_corpus = json.load(training)
print("len(train_corpus)",len(train_corpus))

test_for_train = train_corpus[:10000]

dev_corpus = []             
with open("/Users/shimei/Documents/2018/Web_Search/homework/Final_Assignment/project_files/devel.json",'r') as devel:
    dev_corpus = json.load(devel)
    
test_for_del = dev_corpus[:100]

    
def qestion_and_answer(question_id):
    question = train_corpus[question_id]['question']
    answer = train_corpus[question_id]['text']
    processed_question = word_tokenize(question)
    para_id = train_corpus[question_id]['answer_paragraph']
    #print("processed_question",processed_question)
    #print("answer",answer)
    #print("para_id",para_id)
    return processed_question, answer, para_id

#qestion_and_answer(0)  
    
def doc_to_story(para_id):
    story = []
    i = -1
    doc = docs_corpus[para_id]
    for para in doc['text']:
        sents = sent_tokenize(para)
        #print("sents",sents)
        for sent in sents:
            i +=1
            tokens = word_tokenize(sent)
            #print("tokens",tokens)
            story.append(tokens)
    #print("story",story)
    return story
            
#doc_to_story(0)

#save [(story, question, answer)]
def prepare_data(train_corpus):
    final_data = []
    print(len(train_corpus))
    for i in range(len(train_corpus)):
        processed_question, answer, docid = qestion_and_answer(i)
        story = doc_to_story(docid)
        final_data.append((story, processed_question, answer))
        #print("final_data",final_data)
    return final_data
    
data = prepare_data(test_for_train)
print("len(data)", len(data))

del_data = prepare_data(test_for_del)
print("len(del_data)",len(del_data))

'''
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma
'''
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        story_list = []
        for sent in story:
            for w in sent:
                story_list.append(w)
        x = [word_idx[w]  for w in story_list]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        for token in word_tokenize(answer):
            y[word_idx[token]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
    return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqs, maxlen=query_maxlen), np.array(ys)


def get_vocab(data):
    vocab = set()
    for story, q, answer in data:
        list_words = []
        for sent in story:            
            list_words = list_words + sent 
        vocab_list = list_words + q + word_tokenize(answer)
        #print("vocab_list",vocab_list)
        vocab |= set(vocab_list)
    vocab = sorted(vocab)
    return vocab
vocab = get_vocab(data)
#print("vocab",vocab)
print("len(vocab)",len(vocab))

RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 40
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in data)))
query_maxlen = max(map(len, (x for _, x, _ in data)))

x, xq, y = vectorize_stories(data, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(del_data, word_idx, story_maxlen, query_maxlen)

#print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

print('Build model...')


sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)
encoded_sentence = layers.Dropout(0.3)(encoded_sentence)

question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = layers.Dropout(0.3)(encoded_question)
encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question)
encoded_question = layers.RepeatVector(story_maxlen)(encoded_question)

merged = layers.add([encoded_sentence, encoded_question])
merged = RNN(EMBED_HIDDEN_SIZE)(merged)
merged = layers.Dropout(0.3)(merged)
preds = layers.Dense(vocab_size, activation='softmax')(merged)

model = Model([sentence, question], preds)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print('Training')
model.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)
loss, acc = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))


len(train_corpus) 43379
10000
len(data) 10000
100
len(del_data) 100
len(vocab) 60415
RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100
x.shape = (10000, 581)
xq.shape = (10000, 60)
y.shape = (10000, 60416)
story_maxlen, query_maxlen = 581, 60
Build model...
Training
Train on 9500 samples, validate on 500 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss / test accuracy = 14.8538 / 0.0000
