# Imports and Settings

In [70]:
%matplotlib inline

import csv, json, string, re, time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk import word_tokenize

from chainer import Chain, Variable, Parameter
from chainer import iterators, optimizers
import chainer.initializers as I
import chainer.functions as F
import chainer.links as L

WORD_VECTOR_SIZE = 50
H_SIZE = 200

# Read Word Vectors

In [2]:
glove = {}
f = open('glove/glove.6B.' + str(WORD_VECTOR_SIZE) + 'd.txt', 'rb')
reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
for row in reader:
    key = row[0]
    vector = map(float, row[1:])
    glove[key] = np.array(vector, dtype=np.float32).reshape(1,-1)
len(glove)

400000

# Read Dataset

In [3]:
def text2vec(text):
    tokens = word_tokenize(text)
    textVec = []
    for tok in tokens:
        textVec.append(glove.get(tok, np.zeros((1,WORD_VECTOR_SIZE), dtype=np.float32)))
    return np.vstack(textVec)

def answerpos(context, answer, answer_start):
    start = len(word_tokenize(context[:answer_start]))
    ans_len = len(word_tokenize(answer))
    
    return start, start + ans_len - 1

In [4]:
train = []
for jsonRow in json.loads(open('dataset/train.json', 'rb').read()):
    for paragraph in jsonRow['paragraphs']: 
        ctxLen = len(paragraph['context'])
        ctxVec = text2vec(paragraph['context'])
        
        for qnaJson in paragraph['qas']:
            qnVec = text2vec(qnaJson['question'].lower())
            
            ansStart, ansEnd = answerpos(paragraph['context'], 
                                           qnaJson['answer']['text'], 
                                           qnaJson['answer']['answer_start'])
            
            train.append((ctxVec, qnVec, ansStart, ansEnd))
print len(train)

61379


In [23]:
test = []
for jsonRow in json.loads(open('dataset/test.json', 'rb').read()):
    for paragraph in jsonRow['paragraphs']: 
        data = {}
        data['context'] = paragraph['context']
        data['contextVec'] = text2vec(paragraph['context'])
        data['qna'] = []
        for qnaJson in paragraph['qas']:
            qna = {}
            qna['id'] = qnaJson['id']
            qna['question'] = qnaJson['question']
            qna['questionVec'] = text2vec(qnaJson['question'].lower())
            data['qna'].append(qna)
        test.append(data)

print len(test)                  

7984


# Define Network
* RNN Tutorial: http://docs.chainer.org/en/stable/tutorial/recurrentnet.html
* Training Tutorial: http://docs.chainer.org/en/stable/tutorial/train_loop.html
* Attention: https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/
* Pointer: http://fastml.com/introduction-to-pointer-networks/

In [59]:
class CoattentionEncoder(Chain):
    def __init__(self, wordvec_size, h_size):
        super(CoattentionEncoder, self).__init__()
        
        self.h_size = h_size
        
        with self.init_scope():
            self.ctxRU = L.LSTM(wordvec_size, h_size)

            self.qnRU = L.LSTM(wordvec_size, h_size)
            self.qnLinear = L.Linear(h_size, h_size)
            
            self.outFwd = L.LSTM(3*h_size, h_size)
            self.outBwd = L.LSTM(3*h_size, h_size)
            self.outLinear = L.Linear(2*h_size, h_size)
            
    def reset_state(self):
        self.ctxRU.reset_state()
        self.qnRU.reset_state()
        self.outFwd.reset_state()
        self.outBwd.reset_state()
            
    def __call__(self, context, question):
        # context representation
        D = self.ctxRU(context).T
        
        #question representation
        Q = self.qnRU(question).T
        
        #attention
        affinity = F.matmul(D.T, Q)
        A_Q = F.softmax(affinity)
        A_D = F.softmax(affinity.T)
        
        C_Q = F.matmul(D, A_Q)
        C_D = F.matmul(F.concat((Q, C_Q), axis=0), A_D)
        
        #output
        out_in = F.concat((D, C_D), axis=0).T        
        h_fwd = self.outFwd(out_in)
        h_bwd = self.outBwd(out_in[::-1])
    
        U = self.outLinear(F.concat((h_fwd, h_bwd)))
        return U

In [60]:
ctx = train[0][0]
qn = train[0][1]
encoder = CoattentionEncoder(WORD_VECTOR_SIZE, H_SIZE)

U = encoder(ctx, qn)
print U.shape
print U

(125, 200)
variable([[-0.01496673 -0.01889673 -0.00773166 ..., -0.03226301 -0.01829917
            0.03382658]
          [-0.01490557 -0.0423511  -0.01553484 ..., -0.03195731 -0.01306379
            0.01433437]
          [-0.00751606 -0.02998942 -0.00652336 ..., -0.02165518 -0.03221701
            0.02800097]
          ..., 
          [-0.02748947 -0.01415092  0.00182293 ..., -0.01300309 -0.02445512
            0.02127449]
          [-0.0307134  -0.0468216  -0.00205125 ...,  0.01821324 -0.01728526
            0.02641234]
          [-0.02064704 -0.0097578   0.00653811 ...,  0.00406266  0.00351155
            0.01039272]])


In [61]:
class Highway(Chain):
    def __init__(self, h_size):
        super(Highway, self).__init__()
                
        with self.init_scope():
            self.MLP = L.Linear(3*h_size, h_size, nobias=True)
            self.M1 = L.Linear(2*h_size, h_size)
            self.M2 = L.Linear(h_size, h_size)
            self.M3 = L.Linear(2*h_size, 1)
            
    def __call__(self, U, h, us, ue):
        r = F.tanh(self.MLP(F.hstack([h, us, ue])))
        m1 = self.M1(F.concat((U, F.broadcast_to(r, U.shape))))
        m2 = self.M2(m1)
        m3 = self.M3(F.concat((m1,m2)))
        
        return m3

In [62]:
highway = Highway(H_SIZE)

h = np.zeros((1,H_SIZE), dtype=np.float32)
us = U[0].reshape(1,-1)
ue = U[-1].reshape(1,-1)

alpha = highway(U, h, us, ue)
print alpha.shape

(125, 1)


In [63]:
class DynamicPointingDecoder(Chain):
    def __init__(self, h_size):
        super(DynamicPointingDecoder, self).__init__()
                
        with self.init_scope():
            self.dec_state = L.LSTM(2*h_size, h_size)
            self.HwayStart = Highway(h_size)
            self.HwayEnd = Highway(h_size)
            
    def reset_state(self):
        self.dec_state.reset_state()
            
    def __call__(self, U, us, ue):
        h = self.dec_state(F.concat((us,ue)))
        alpha = highway(U, h, us, ue)
        s = F.argmax(alpha).data
        beta = highway(U, h, U[s].reshape(1,-1), ue)
        
        return alpha, beta

In [64]:
decoder = DynamicPointingDecoder(H_SIZE)

us = U[0].reshape(1,-1)
ue = U[-1].reshape(1,-1)
alpha, beta = decoder(U, us, ue)
print alpha.shape, F.argmax(alpha)
print beta.shape, F.argmax(beta)

(125, 1) variable(68)
(125, 1) variable(68)


In [65]:
class SquadNet(Chain):
    def __init__(self, wordvec_size, h_size):
        super(SquadNet, self).__init__()
                
        with self.init_scope():
            self.encoder = CoattentionEncoder(wordvec_size, h_size)
            self.decoder = DynamicPointingDecoder(h_size)
            
    def reset_state(self):
        self.encoder.reset_state()
        self.decoder.reset_state()
            
    def __call__(self, ctx, qn):
        U = self.encoder(ctx, qn)
        
        start = 0
        end = -1
        for i in range(3):            
            us = U[start].reshape(1, -1)
            ue = U[end].reshape(1, -1)
            alpha, beta = self.decoder(U, us, ue)
            
            start = F.argmax(alpha).data
            end = F.argmax(beta).data
        return alpha.reshape(1, -1), beta.reshape(1, -1)

In [66]:
ctx = train[0][0]
qn = train[0][1]
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE)
alpha. beta = model(ctx, qn)
print alpha.shape
print beta.shape

(125, 1)
(125, 1)


# Define Training Loop

In [75]:
opt = optimizers.Adam()
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE)

opt.setup(model)

In [79]:
N_EPOCH = 2
PRINT_INTERVAL = 1000

len_train = len(train)
for epoch in range(N_EPOCH):
    print "Epoch", epoch + 1, "/", N_EPOCH
    startTime = time.time()
    
    interval_loss = 0
    interval_start = time.time()
    for i in range(len_train):
        try:
            ctx, qn, ans_start, ans_end = train[i]

            model.reset_state()
            pred_start, pred_end = model(ctx, qn)

            loss_start = F.softmax_cross_entropy(pred_start, np.array([ans_start], dtype=np.int32))
            loss_end = F.softmax_cross_entropy(pred_end, np.array([ans_end], dtype=np.int32))
            loss = loss_start + loss_end
            
            interval_loss += loss.data
            if i % PRINT_INTERVAL == 0:
                print i, "/", len_train, ":", interval_loss, "(" + str(time.time() - interval_start) + "s)"
                interval_loss = 0
                interval_start = time.time()

            model.cleargrads()
            loss.backward()

            opt.update()
        except IndexError as e:
            print "Error on index " + str(i) + ":", e
    print "Epoch completed in", time.time() - startTime, "seconds"


Epoch 1 / 2
0 / 61379 : 1.79932117462 (0.0678100585938s)
1000 / 61379 : 6862.71748328 (128.100099087s)
2000 / 61379 : 7859.78378582 (133.302760839s)
3000 / 61379 : 7555.04627228 (138.290160179s)
4000 / 61379 : 7009.47757578 (135.738970995s)
5000 / 61379 : 7217.49965477 (136.134599924s)
6000 / 61379 : 7363.99676085 (134.371236086s)
7000 / 61379 : 7102.01323843 (132.691280842s)
8000 / 61379 : 8033.76777077 (145.369439125s)
9000 / 61379 : 7496.97219038 (136.565308094s)
10000 / 61379 : 7653.9298625 (140.999716997s)
11000 / 61379 : 7049.08306122 (129.703157902s)
12000 / 61379 : 8271.8430419 (149.124598026s)
13000 / 61379 : 7726.87153149 (126.37846303s)
14000 / 61379 : 8001.32679319 (142.517619133s)
15000 / 61379 : 7800.00272036 (140.180574894s)
16000 / 61379 : 7467.89545536 (134.119723082s)
17000 / 61379 : 7262.11009264 (142.394720078s)
18000 / 61379 : 7366.69743681 (144.496595144s)
19000 / 61379 : 7503.78264952 (135.446516991s)
20000 / 61379 : 7247.93611956 (139.518861055s)
21000 / 61379 :

# Output Answers

In [77]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()
    
    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [78]:
f = open('pred.csv', 'wb')
out = csv.writer(f)
out.writerow(["Id", "Answer"])

for t in test:
    ctx = word_tokenize(t['context'])
    ctxVec = t['contextVec']
    
    for qna in t['qna']:
        id = qna['id']
        qn = qna['question']
        qnVec = qna['questionVec']
        
        model.reset_state()
        start, end = model(ctxVec, qnVec)
        start = F.argmax(start).data
        end = F.argmax(end).data
        
        ans = ""
        for i in range(start, end + 1):
            ans += ctx[i] + " "
        
        out.writerow([id, normalize_answer(ans).encode('utf-8')])

f.close()