In [4]:
from __future__ import print_function
from functools import reduce
import re
import json
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

In [2]:
import nltk
import string
from math import log
from nltk.corpus import wordnet as wn
from collections import defaultdict, Counter
from nltk.corpus import stopwords
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

docs_corpus = []
with open("/Users/shimei/Documents/2018/Web_Search/homework/Final_Assignment/project_files/documents.json",'r') as docs:
    docs_corpus = json.load(docs)
            
train_corpus = []             
with open("/Users/shimei/Documents/2018/Web_Search/homework/Final_Assignment/project_files/training.json",'r') as training:
    train_corpus = json.load(training)
print("len(train_corpus)",len(train_corpus))

test_for_train = train_corpus[:10000]

dev_corpus = []             
with open("/Users/shimei/Documents/2018/Web_Search/homework/Final_Assignment/project_files/devel.json",'r') as devel:
    dev_corpus = json.load(devel)
    
test_for_del = dev_corpus[:1000]

test_corpus = []             
with open("/Users/shimei/Documents/2018/Web_Search/homework/Final_Assignment/project_files/testing.json",'r') as test:
    test_corpus = json.load(test)
    
#test_data = test_corpus[:10]

punc = string.punctuation
stopwordsAll = set(stopwords.words('english'))
    
def qestion_and_answer(question_id):
    question = train_corpus[question_id]['question']
    answer = train_corpus[question_id]['text']
    processed_question = word_tokenize(question)
    para_id = train_corpus[question_id]['answer_paragraph']
    docid = train_corpus[question_id]['docid']
    #print("processed_question",processed_question)
    #print("answer",answer)
    #print("para_id",para_id)
    return processed_question, answer, para_id, docid

#qestion_and_answer(0)  
    
def doc_to_story(para_id, docid):
    story = []
    doc = docs_corpus[docid]
    #print("doc",doc)
    para = " "
    for index, para_data in enumerate(doc['text']):
        if index == para_id:
            para = para_data
    #print("para",para)
    sents = sent_tokenize(para)
    #print("sents",sents)
    for sent in sents:
        tokens = word_tokenize(sent)
        #print("tokens",tokens)
        story.append(tokens)
    #print("story",story)
    return story
            
#doc_to_story(0)

#save [(story, question, answer)]
def prepare_data(train_corpus):
    final_data = []
    print(len(train_corpus))
    for i in range(len(train_corpus)):
        processed_question, answer, para_id, docid = qestion_and_answer(i)
        story = doc_to_story(para_id, docid)
        final_data.append((story, processed_question, answer))
        #print("final_data",final_data)
    return final_data
  
data = prepare_data(test_for_train)
print("len(data)", len(data))
'''
del_data = prepare_data(test_for_del)
print("len(del_data)",len(del_data))
'''

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        story_list = []
        for sent in story:
            for w in sent:
                story_list.append(w)
        x = [word_idx[w]  for w in story_list]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        for token in word_tokenize(answer):
            y[word_idx[token]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
    return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqs, maxlen=query_maxlen), np.array(ys)


def get_paragraph(docid,document_data):
    #get the paragraph that contains the answer
    for item in document_data:
        if item['docid'] == docid:
            document = item['text']
            break
    return document

def term_freqs(document):
    tfs = defaultdict(dict)
    tfs_forward = defaultdict(dict)
    doc_id = 0
    for sentence in document:
        for token in word_tokenize(sentence):
            if token not in stopwordsAll and token not in punc:  
                term = lemmatize(token.lower())
                tfs[term][doc_id] = tfs[term].get(doc_id, 0) + 1 
                tfs_forward[doc_id][term] = tfs[doc_id].get(term, 0) + 1 
        doc_id += 1
    return tfs,doc_id+1,tfs_forward

def get_okapibm25(tf, total_docment, documents):
    '''Calculate and return term weights based on okapibm25'''
    k1, b, k3 = 1.5, 0.5, 0
    okapibm25 = defaultdict(dict)

    # calculate average doc length 
    total = 0
    for d in documents:
        total += len(d)
    avg_doc_length = total/len(documents)*1.0

    for term, doc_list in tf.items():
        df = len(doc_list)
        for doc_id, freq in doc_list.items():
            # term occurences in query
            # qtf = question.count(term) # SEPCIAL 
            qtf = 1.2
            idf = log((total_docment-df+0.5) / df+0.5)
            tf_Dt = ((k1+1)*tf[term][doc_id]) / (k1*((1-b)+b*(len(documents[doc_id])/avg_doc_length) + tf[term][doc_id]))
            if qtf == 0:
                third = 0
            else:
                third = ((k3+1)*qtf) / (k3+qtf)
                okapibm25[term][doc_id] = idf*tf_Dt*third

    return okapibm25

#find top_k paragraph that may contain the answer
def get_top_k_document(tfidf,query,k,document):
    top_document_id = Counter()
    for token in word_tokenize(query):
        if token not in stopwordsAll:  
             term = lemmatizer.lemmatize(token.lower())
             term_tfidf = tfidf[term]
             for docid, weight in term_tfidf.items():
                 top_document_id[docid] += weight
    top_document_id = top_document_id.most_common(k)
    top_document = []
    for document_id,weight in top_document_id:
        top_document.append(document_id)
    return top_document

def prepare_test_data(test_corpus):
    final_data = []
    for item in test_data:
        question = item['question']
        docid = item['docid']
        processed_question = word_tokenize(question)
        document = get_paragraph(docid,docs_corpus)
        tfs,total_docment,tfs_forward = term_freqs(document)
        tfidf = get_okapibm25(tfs, total_docment,document)
        top_1 = get_top_k_document(tfidf,question,1,document)
        for item in top_1:
            print("top_1", top_1)
            story = doc_to_story(top_1, docid)
            print("story",story)
        final_data.append((story, processed_question, answer))
        #print("final_data",final_data)
    return 

def prepare_test_del(train_corpus):
    final_data = []
    print(len(train_corpus))
    for i in range(len(train_corpus)):
        story = []
        question = train_corpus[i]['question']
        processed_question, answer, para_id, docid = qestion_and_answer(i)
        document = get_paragraph(docid,docs_corpus)
        tfs,total_docment,tfs_forward = term_freqs(document)
        tfidf = get_okapibm25(tfs, total_docment,document)
        top_1 = get_top_k_document(tfidf,question,1,document)
        for item in top_1:
            #print("item", item)
            story = doc_to_story(item, docid)
            #print("story",story)
        final_data.append((story, processed_question, answer))
        #print("final_data",final_data)
    return final_data

del_data = prepare_data(test_for_del)
print("len(del_data)",len(del_data))

def get_vocab(data):
    vocab = set()
    for story, q, answer in data:
        list_words = []
        for sent in story:            
            list_words = list_words + sent 
        vocab_list = list_words + q + word_tokenize(answer)
        #print("vocab_list",vocab_list)
        vocab |= set(vocab_list)
    vocab = sorted(vocab)
    return vocab
vocab = get_vocab(data + del_data)
#print("vocab",vocab)
print("len(vocab)",len(vocab))



RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 40
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in data + del_data)))
query_maxlen = max(map(len, (x for _, x, _ in data + del_data)))

x, xq, y = vectorize_stories(data, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(del_data, word_idx, story_maxlen, query_maxlen)

#print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

print('Build model...')


sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)
encoded_sentence = layers.Dropout(0.3)(encoded_sentence)

question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = layers.Dropout(0.3)(encoded_question)
encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question)
encoded_question = layers.RepeatVector(story_maxlen)(encoded_question)

merged = layers.add([encoded_sentence, encoded_question])
merged = RNN(EMBED_HIDDEN_SIZE)(merged)
merged = layers.Dropout(0.3)(merged)
preds = layers.Dense(vocab_size, activation='softmax')(merged)

model = Model([sentence, question], preds)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print('Training')
model.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)
loss, acc = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))


len(train_corpus) 43379
10000
len(data) 10000
1000
len(del_data) 1000
len(vocab) 41855
RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100
x.shape = (10000, 21)
xq.shape = (10000, 60)
y.shape = (10000, 41856)
story_maxlen, query_maxlen = 21, 60
Build model...
Training
Train on 9500 samples, validate on 500 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss / test accuracy = 15.9259 / 0.0020
