# Imports and Settings

In [28]:
%matplotlib inline

import csv, json, string, re, time

import numpy as np
import cupy
import pandas as pd
import matplotlib.pyplot as plt

from nltk import word_tokenize

import chainer
from chainer import Chain, Variable, Parameter
from chainer import iterators, optimizers
import chainer.initializers as I
import chainer.functions as F
import chainer.links as L

# Width of the word vectors: selects which glove.6B.<N>d.txt file is read, sizes the
# zero fallback vector in text2vec, and is passed to the networks as `wordvec_size`.
WORD_VECTOR_SIZE = 300
# Hidden size passed to the network classes as `h_size`.
H_SIZE = 200
# When True, the training model is constructed with use_gpu=True and moved to the GPU.
USE_GPU = True

# Read Word Vectors

In [2]:
# Load the pre-trained GloVe table into a dict mapping token -> float32 row vector
# of shape (1, N), where N is the number of values on that token's line.
glove = {}
glove_path = 'glove/glove.6B.' + str(WORD_VECTOR_SIZE) + 'd.txt'
# `with` releases the file handle even if a row fails to parse.
with open(glove_path, 'rb') as f:
    # QUOTE_NONE: quote characters are themselves tokens and must be read literally.
    reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
    for row in reader:
        key = row[0]
        # Explicit list so the conversion does not depend on map() returning a list.
        vector = [float(value) for value in row[1:]]
        glove[key] = np.array(vector, dtype=np.float32).reshape(1,-1)
len(glove)

400000

# Read Dataset

In [10]:
def text2vec(text):
    """Tokenize `text` and look up one word vector per token.

    Args:
        text: string to split with nltk's word_tokenize.

    Returns:
        A list with one entry per token, in token order. Each entry is the
        token's row from `glove`, or a fresh (1, WORD_VECTOR_SIZE) float32
        zero array when the token is not in `glove`.
    """
    textVec = []
    for tok in word_tokenize(text):
        vec = glove.get(tok)
        if vec is None:
            # Allocate the zero fallback only for out-of-vocabulary tokens instead of
            # building (and discarding) one for every known token as well.
            vec = np.zeros((1,WORD_VECTOR_SIZE), dtype=np.float32)
        textVec.append(vec)
    return textVec

def answerpos(context, answer, answer_start):
    """Convert a character-level answer location into token indices.

    Args:
        context: full context string.
        answer: answer text.
        answer_start: character offset of the answer inside `context`.

    Returns:
        (first, last): `first` is the number of tokens word_tokenize yields for
        the context text before `answer_start`; `last` is `first` plus the
        number of answer tokens, minus one (inclusive end index).
    """
    tokens_before = word_tokenize(context[:answer_start])
    answer_tokens = word_tokenize(answer)

    first = len(tokens_before)
    last = first + len(answer_tokens) - 1
    return first, last

In [11]:
train = []
for jsonRow in json.loads(open('dataset/train.json', 'rb').read()):
    for paragraph in jsonRow['paragraphs']: 
        ctxLen = len(paragraph['context'])
        ctxVec = text2vec(paragraph['context'])
        
        for qnaJson in paragraph['qas']:
            qnVec = text2vec(qnaJson['question'].lower())
            
            ansStart, ansEnd = answerpos(paragraph['context'], 
                                           qnaJson['answer']['text'], 
                                           qnaJson['answer']['answer_start'])
            
            train.append((ctxVec, qnVec, ansStart, ansEnd))
len_train = len(train)
print len(train)

61379


In [None]:
test = []
for jsonRow in json.loads(open('dataset/test.json', 'rb').read()):
    for paragraph in jsonRow['paragraphs']: 
        data = {}
        data['context'] = paragraph['context']
        data['contextVec'] = text2vec(paragraph['context'])
        data['qna'] = []
        for qnaJson in paragraph['qas']:
            qna = {}
            qna['id'] = qnaJson['id']
            qna['question'] = qnaJson['question']
            qna['questionVec'] = text2vec(qnaJson['question'].lower())
            data['qna'].append(qna)
        test.append(data)

print len(test)                  

# Define Network
* RNN Tutorial: http://docs.chainer.org/en/stable/tutorial/recurrentnet.html
* Training Tutorial: http://docs.chainer.org/en/stable/tutorial/train_loop.html
* Attention: https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/
* Pointer: http://fastml.com/introduction-to-pointer-networks/

In [15]:
class CoattentionEncoder(Chain):
    def __init__(self, wordvec_size, h_size, use_gpu=False):
        super(CoattentionEncoder, self).__init__()
        
        self.h_size = h_size
        self.use_gpu = use_gpu
        
        with self.init_scope():
            self.ctxRU = L.LSTM(wordvec_size, h_size)

            self.qnRU = L.LSTM(wordvec_size, h_size)
            self.qnLinear = L.Linear(h_size, h_size)
            
            self.outFwd = L.LSTM(3*h_size, h_size)
            self.outBwd = L.LSTM(3*h_size, h_size)
            self.outLinear = L.Linear(2*h_size, h_size)
            
            if use_gpu:
                print "CodynamicAttention uses GPU", self.use_gpu
                self.ctxRU.to_gpu()
                self.qnRU.to_gpu()
                self.qnLinear.to_gpu()
                self.outFwd.to_gpu()
                self.outBwd.to_gpu()
                self.outLinear.to_gpu()
            
    def reset_state(self):
        self.ctxRU.reset_state()
        self.qnRU.reset_state()
        self.outFwd.reset_state()
        self.outBwd.reset_state()
            
    def __call__(self, context, question):
        # context representation
        D = []
        for word in ctx:
            if self.use_gpu:
                word = Variable(word)
                word.to_gpu()
            D.append(self.ctxRU(word))
        D = F.vstack(D).T
        
        #question representation
        Q = []
        for word in qn:
            if self.use_gpu:
                word = Variable(word)
                word.to_gpu()
            Q.append(self.qnRU(word))
        Q = F.vstack(Q).T
        
        #attention
        affinity = F.matmul(D.T, Q)
        A_Q = F.softmax(affinity)
        A_D = F.softmax(affinity.T)
        
        C_Q = F.matmul(D, A_Q)
        C_D = F.matmul(F.concat((Q, C_Q), axis=0), A_D)
        
        #output
        out_in = F.concat((D, C_D), axis=0).T 
        h_fwd = []
        for fout in out_in:
            h_fwd.append(self.outFwd(fout.reshape(1,-1)))
        h_fwd = F.vstack(h_fwd)
            
        h_bwd = []
        for bout in out_in[::-1]:
            h_bwd.append(self.outBwd(bout.reshape(1,-1)))
        h_bwd = F.vstack(h_bwd)

        U = self.outLinear(F.concat((h_fwd, h_bwd)))
        return U

In [16]:
ctx = train[0][0]
qn = train[0][1]
encoder = CoattentionEncoder(WORD_VECTOR_SIZE, H_SIZE)

U = encoder(ctx, qn)
print U.shape
print U

(125, 200)
variable([[-0.01048665  0.00064355 -0.01285294 ...,  0.01555272 -0.03691786
           -0.02425362]
          [-0.0051848   0.0156891   0.00302843 ...,  0.03482651 -0.05149583
           -0.05101404]
          [ 0.00164901  0.02376289  0.02370892 ...,  0.03921724 -0.06495478
           -0.03598216]
          ..., 
          [ 0.23352769  0.12652838  0.07524005 ...,  0.05326129 -0.09351344
           -0.01281496]
          [ 0.23967445  0.11973716  0.05234898 ...,  0.03714702 -0.10802287
           -0.01751066]
          [ 0.24274108  0.11369845  0.04697984 ...,  0.01832231 -0.11674075
           -0.02041944]])


In [17]:
class Highway(Chain):
    def __init__(self, h_size, use_gpu=False):
        super(Highway, self).__init__()
        self.use_gpu = use_gpu
                
        with self.init_scope():
            self.MLP = L.Linear(3*h_size, h_size, nobias=True)
            self.M1 = L.Linear(2*h_size, h_size)
            self.M2 = L.Linear(h_size, h_size)
            self.M3 = L.Linear(2*h_size, 1)
            
            if use_gpu:
                print "Highway uses GPU", self.use_gpu
                self.MLP.to_gpu()
                self.M1.to_gpu()
                self.M2.to_gpu()
                self.M3.to_gpu()
            
    def __call__(self, U, h, us, ue):
        if self.use_gpu:
            U.to_gpu()
            h.to_gpu()
            us.to_gpu()
            ue.to_gpu()
        
        r = F.tanh(self.MLP(F.hstack([h, us, ue])))
        m1 = self.M1(F.concat((U, F.broadcast_to(r, U.shape))))
        m2 = self.M2(m1)
        m3 = self.M3(F.concat((m1,m2)))
        
        return m3

In [18]:
highway = Highway(H_SIZE)

h = Variable(np.zeros((1,H_SIZE), dtype=np.float32))
us = U[0].reshape(1,-1)
ue = U[-1].reshape(1,-1)

alpha = highway(U, h, us, ue)
print alpha.shape

(125, 1)


In [19]:
class DynamicPointingDecoder(Chain):
    def __init__(self, h_size, use_gpu=False):
        super(DynamicPointingDecoder, self).__init__()
        self.use_gpu = use_gpu
                
        with self.init_scope():
            self.dec_state = L.LSTM(2*h_size, h_size)
            self.HwayStart = Highway(h_size, use_gpu)
            self.HwayEnd = Highway(h_size, use_gpu)
            
            if self.use_gpu:
                print "DynamicPointincDecoded uses GPU", self.use_gpu
                self.dec_state.to_gpu()
                self.HwayStart.to_gpu()
                self.HwayEnd.to_gpu()
            
    def reset_state(self):
        self.dec_state.reset_state()
            
    def __call__(self, U, us, ue):
        if self.use_gpu:
            U.to_gpu()
            us.to_gpu()
            ue.to_gpu()
        
        h = self.dec_state(F.concat((us,ue)))
        alpha = self.HwayStart(U, h, us, ue)
        s = F.argmax(alpha).data
        beta = self.HwayEnd(U, h, U[s].reshape(1,-1), ue)
        
        return alpha, beta

In [20]:
decoder = DynamicPointingDecoder(H_SIZE)

us = U[0].reshape(1,-1)
ue = U[-1].reshape(1,-1)
alpha, beta = decoder(U, us, ue)
print alpha.shape, F.argmax(alpha)
print beta.shape, F.argmax(beta)

(125, 1) variable(0)
(125, 1) variable(94)


In [21]:
class SquadNet(Chain):
    def __init__(self, wordvec_size, h_size, use_gpu=False):
        super(SquadNet, self).__init__()
        self.use_gpu = use_gpu
                
        with self.init_scope():
            self.encoder = CoattentionEncoder(wordvec_size, h_size, use_gpu)
            self.decoder = DynamicPointingDecoder(h_size, use_gpu)
            
            if use_gpu:
                print "SquadNet uses GPU", self.use_gpu
                self.encoder.to_gpu()
                self.decoder.to_gpu()
            
    def reset_state(self):
        self.encoder.reset_state()
        self.decoder.reset_state()
            
    def __call__(self, ctx, qn):        
        U = self.encoder(ctx, qn)
        
        start = 0
        end = -1
        for i in range(3):            
            us = U[start].reshape(1, -1)
            ue = U[end].reshape(1, -1)
            alpha, beta = self.decoder(U, us, ue)
            
            start = F.argmax(alpha).data
            end = F.argmax(beta).data
        return alpha.reshape(1, -1), beta.reshape(1, -1)

In [22]:
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE)
alpha. beta = model(ctx, qn)
print alpha.shape
print beta.shape

(125, 1)
(125, 1)


# Define Training Loop

In [29]:
# Build the training model, move it to the GPU when requested, then attach an
# Adam optimizer (default settings) to its parameters.
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE, USE_GPU)
if USE_GPU:
    model.to_gpu()

opt = optimizers.Adam()
opt.setup(model)

CodynamicAttention uses GPU True
Highway uses GPU True
Highway uses GPU True
DynamicPointincDecoded uses GPU True
SquadNet uses GPU True


In [None]:
def get_batch(i, batch_size):
    """Assemble training examples i .. i+batch_size-1 (clipped to the dataset size).

    Args:
        i: index of the first example in `train`.
        batch_size: maximum number of examples to take.

    Returns:
        (ctx, qn, ans_start, ans_end): the stacked context vectors, the stacked
        question vectors, and two int32 Variables holding the answer start and
        end token indices.

    NOTE(review): np.vstack over contexts/questions of different lengths
    presumably only works for batch_size == 1 -- confirm before raising it.
    """
    j = min(i + batch_size, len_train)

    # Transpose the (context, question, start, end) rows into four columns.
    ctx, qn, ans_start, ans_end = zip(*train[i:j])

    # Labels are given an explicit int32 dtype rather than the platform default
    # integer type, which is what Chainer's classification losses expect.
    return (np.vstack(list(ctx)),
            np.vstack(list(qn)),
            Variable(np.array(ans_start, dtype=np.int32)),
            Variable(np.array(ans_end, dtype=np.int32)))

In [30]:
# Number of passes over the training set and number of examples per optimizer step.
N_EPOCH = 1
BATCH_SIZE = 1

# A progress line is printed whenever the example index is a multiple of this value.
PRINT_INTERVAL = 1000

# Training loop: for every batch, reset the recurrent state, run the model, sum the
# start and end cross-entropy losses, back-propagate and take one optimizer step.
for epoch in range(N_EPOCH):
    print "Epoch", epoch + 1, "/", N_EPOCH
    startTime = time.time()
    
    # Loss accumulated since the last progress line, and when that interval began.
    interval_loss = 0
    interval_start = time.time()
    for i in range(0, len_train, BATCH_SIZE):
        try:
            ctx, qn, ans_start, ans_end = get_batch(i, BATCH_SIZE)
                        
            # Each example starts from a fresh recurrent state.
            model.reset_state()
            pred_start, pred_end = model(ctx, qn)
            
            # get_batch builds the labels from NumPy arrays; move them to the GPU as well.
            if USE_GPU:
                ans_start.to_gpu()
                ans_end.to_gpu()
            
            # Total loss = start-position cross entropy + end-position cross entropy.
            loss_start = F.softmax_cross_entropy(pred_start, ans_start)
            loss_end = F.softmax_cross_entropy(pred_end, ans_end)
            loss = loss_start + loss_end
            
            # Report the summed loss (not an average) for the interval, then reset it.
            interval_loss += loss.data
            if i % PRINT_INTERVAL == 0:
                print i, "/", len_train, ":", interval_loss, "(" + str(time.time() - interval_start) + "s)"
                interval_loss = 0
                interval_start = time.time()

            # Clear old gradients before computing the new ones.
            model.cleargrads()
            loss.backward()

            opt.update()
        except IndexError as e:
            # Log the failing example index and carry on with the next batch.
            print "Error on index " + str(i) + ":", e
    print "Epoch completed in", time.time() - startTime, "seconds"


Epoch 1 / 1
0 / 61379 : 9.6827583313 (1.18510293961s)


KeyboardInterrupt: 

# Output Answers

In [None]:
def normalize_answer(s):
    """Normalize an answer string for comparison.

    Steps, in order: lower-case the text, drop every character found in
    string.punctuation, replace the standalone words a/an/the with a space,
    and collapse all whitespace runs to single spaces (trimming both ends).
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    unpunctuated = ''.join([ch for ch in lowered if ch not in punctuation])

    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated)

    return ' '.join(without_articles.split())

In [None]:
# Write one predicted answer per test question to pred.csv (columns: Id, Answer).
# The `with` block closes the file even if prediction fails part-way through.
with open('pred.csv', 'wb') as f:
    out = csv.writer(f)
    out.writerow(["Id", "Answer"])

    for t in test:
        # Tokens of the raw context, used to turn predicted indices back into text.
        ctx = word_tokenize(t['context'])
        ctxVec = t['contextVec']

        for qna in t['qna']:
            qnVec = qna['questionVec']

            model.reset_state()
            # Inference only: no gradients are needed, so skip building the graph.
            with chainer.no_backprop_mode():
                start, end = model(ctxVec, qnVec)
            # Plain Python ints so slicing works the same for NumPy and CuPy results.
            start = int(F.argmax(start).data)
            end = int(F.argmax(end).data)

            # Inclusive token span; empty when the predicted end precedes the start.
            ans = " ".join(ctx[start:end + 1])

            out.writerow([qna['id'], normalize_answer(ans).encode('utf-8')])