# Imports and Settings

In [1]:
%matplotlib inline

import csv, json, string, re, time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk import word_tokenize

import chainer
from chainer import Chain, Variable, Parameter
from chainer import iterators, optimizers, serializers
import chainer.initializers as I
import chainer.functions as F
import chainer.links as L

# Width of one GloVe word vector; also selects which glove.6B.<N>d.txt file is read.
WORD_VECTOR_SIZE = 50
# Hidden size handed to every network constructed in this notebook.
H_SIZE = 35
# When True, the model (and the answer labels during training) are moved to the GPU.
USE_GPU = False

# Read Word Vectors

In [2]:
# Load the pre-trained GloVe table into `glove`: token -> float32 row vector
# of shape (1, WORD_VECTOR_SIZE). The file is space-delimited with the token
# in the first column; QUOTE_NONE keeps quote characters as ordinary text.
glove = {}
# The `with` block guarantees the file handle is released even if a row
# fails to parse.
with open('glove/glove.6B.' + str(WORD_VECTOR_SIZE) + 'd.txt', 'rb') as f:
    reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
    for row in reader:
        key = row[0]
        vector = [float(x) for x in row[1:]]
        glove[key] = np.array(vector, dtype=np.float32).reshape(1,-1)
len(glove)

400000

# Read Dataset

In [23]:
def text2vec(text):
    """Embed ``text`` as one flat row of concatenated GloVe word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Every token
    is looked up in the module-level ``glove`` table; tokens missing from the
    table contribute an all-zero vector of length ``WORD_VECTOR_SIZE``.

    Args:
        text: the string to embed.

    Returns:
        A float64 array of shape ``(1, total_length)`` holding the token
        vectors back to back; ``(1, 0)`` when the text has no tokens.
    """
    tokens = word_tokenize(text.lower())
    if not tokens:
        return np.zeros((1, 0))

    # Single shared fallback for out-of-vocabulary tokens (only read here).
    unknown = np.zeros((1, WORD_VECTOR_SIZE), dtype=np.float32)

    # Gather first and concatenate once: np.append inside the loop copies the
    # whole accumulated buffer for every token, which is quadratic in length.
    pieces = [glove.get(tok, unknown).ravel() for tok in tokens]

    # float64 is kept on purpose: the earlier np.append-based version started
    # from a float64 empty array, so callers have always received float64.
    return np.concatenate(pieces).astype(np.float64).reshape(1, -1)

def answerpos(context, answer, answer_start):
    """Translate a character-offset answer span into token positions.

    Args:
        context: the full passage text.
        answer: the answer text.
        answer_start: character offset of the answer inside ``context``.

    Returns:
        ``(first, last)`` where ``first`` is the number of tokens
        ``word_tokenize`` finds in ``context[:answer_start]`` and ``last`` is
        ``first`` plus the answer's token count minus one.
    """
    first_tok = len(word_tokenize(context[:answer_start]))
    n_answer_toks = len(word_tokenize(answer))
    last_tok = first_tok + n_answer_toks - 1
    return first_tok, last_tok

In [6]:
# Build the training set: one (context vector, question vector, answer start
# token, answer end token) tuple per question in dataset/train.json.
train = []
# Read inside a `with` block so the file handle is always closed.
with open('dataset/train.json', 'rb') as train_file:
    train_json = json.loads(train_file.read())

for jsonRow in train_json:
    for paragraph in jsonRow['paragraphs']:
        # The context embedding is computed once and shared by all of the
        # paragraph's questions.
        ctxVec = text2vec(paragraph['context'])

        for qnaJson in paragraph['qas']:
            # The closing parenthesis was missing here, which made the whole
            # cell a syntax error.
            qnVec = text2vec(qnaJson['question'])

            ansStart, ansEnd = answerpos(paragraph['context'],
                                         qnaJson['answer']['text'],
                                         qnaJson['answer']['answer_start'])

            train.append((ctxVec, qnVec, ansStart, ansEnd))

train = np.array(train)
train.shape

61379

In [21]:
# Hold out 6000 randomly chosen examples as the validation set `val`; the
# remaining examples stay in `train`.
# NOTE: `train` is rebound here, so running this cell a second time carves
# another 6000 examples out of the already-reduced training set. No random
# seed is set, so the split differs from run to run.
shuf = np.random.permutation(len(train))
val = train[shuf[:6000]]
train = train[shuf[6000:]]

print val.shape, train.shape

(6000, 4) (55379, 4)


In [42]:
# Build the test set: one (context vector, question vector, question id,
# raw context string) tuple per question in dataset/test.json. The raw string
# is kept so predicted token positions can be mapped back to text.
test = []
# Read inside a `with` block so the file handle is always closed.
with open('dataset/test.json', 'rb') as test_file:
    test_json = json.loads(test_file.read())

for jsonRow in test_json:
    for paragraph in jsonRow['paragraphs']:
        ctx = paragraph['context']
        ctxVec = text2vec(ctx)

        for qnaJson in paragraph['qas']:
            qnId = qnaJson['id']
            qnVec = text2vec(qnaJson['question'])
            test.append((ctxVec, qnVec, qnId, ctx))

test = np.array(test)
test.shape

(36790, 4)

In [31]:
def get_batch(i, batch_size, data):
    """Assemble the zero-padded mini-batch ``data[i:i + batch_size]``.

    Args:
        i: index of the first example of the batch.
        batch_size: maximum number of examples in the batch; the final batch
            of a dataset may hold fewer.
        data: sequence of ``(ctx_vec, qn_vec, ans_start, ans_end)`` rows where
            the two vectors are arrays of shape ``(1, length)``.

    Returns:
        A 4-tuple of chainer Variables:
        contexts ``(n, longest_context)`` float32, questions
        ``(n, longest_question)`` float32, answer starts ``(n, 1)`` int32 and
        answer ends ``(n, 1)`` int32, where ``n`` is the number of examples
        actually available from index ``i``.
    """
    j = min(i + batch_size, len(data))
    # Size everything by the rows really present: sizing by batch_size made
    # the final, shorter batch fail with IndexError in the fill loop.
    n_rows = max(j - i, 0)

    ctx = []
    qn = []
    ans_start = []
    ans_end = []

    cmax = 0
    qmax = 0
    for k in range(i, j):
        c, q, s, e = data[k]
        ctx.append(c)
        qn.append(q)
        ans_start.append(s)
        ans_end.append(e)

        cmax = max(cmax, c.shape[1])
        # Track the longest *question*. This used to take max(cmax, ...),
        # which padded every question out to the context length.
        qmax = max(qmax, q.shape[1])

    cVec = np.zeros((n_rows, cmax), dtype=np.float32)
    qVec = np.zeros((n_rows, qmax), dtype=np.float32)
    # A separate loop variable keeps the `i` parameter intact.
    for row in range(n_rows):
        cVec[row, 0:ctx[row].shape[1]] = ctx[row]
        qVec[row, 0:qn[row].shape[1]] = qn[row]

    return Variable(cVec), \
           Variable(qVec), \
           Variable(np.array(ans_start, dtype=np.int32)).reshape(-1,1), \
           Variable(np.array(ans_end, dtype=np.int32)).reshape(-1,1)



# Define Network
* RNN Tutorial: http://docs.chainer.org/en/stable/tutorial/recurrentnet.html
* Training Tutorial: http://docs.chainer.org/en/stable/tutorial/train_loop.html
* Attention: https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/
* Pointer: http://fastml.com/introduction-to-pointer-networks/

In [32]:
class CoattentionEncoder(Chain):
    def __init__(self, wordvec_size, h_size, use_gpu=False):
        super(CoattentionEncoder, self).__init__()
        
        self.h_size = h_size
        self.wordvec_size = wordvec_size
        self.use_gpu = use_gpu
        
        with self.init_scope():
            self.ctxRU = L.LSTM(wordvec_size, h_size)

            self.qnRU = L.LSTM(wordvec_size, h_size)
            self.qnLinear = L.Linear(h_size, h_size)
            
            self.outFwd = L.LSTM(3*h_size, h_size)
            self.outBwd = L.LSTM(3*h_size, h_size)
            self.outLinear = L.Linear(2*h_size, h_size)
            
            if use_gpu:
                print "CodynamicAttention uses GPU", self.use_gpu
                self.ctxRU.to_gpu()
                self.qnRU.to_gpu()
                self.qnLinear.to_gpu()
                self.outFwd.to_gpu()
                self.outBwd.to_gpu()
                self.outLinear.to_gpu()
            
    def reset_state(self):
        self.ctxRU.reset_state()
        self.qnRU.reset_state()
        self.outFwd.reset_state()
        self.outBwd.reset_state()
        
    def get_para_rep(self, para, ru):
        P = []
        for i in range(0, para.shape[1], self.wordvec_size):
            word = para[:, i:i+self.wordvec_size]
            if self.use_gpu: 
                word.to_gpu()
            P.append(ru(word))
        return F.transpose(F.dstack(P), (0, 1, 2))
            
    def __call__(self, ctx, qn):
        # context representation
        Ds = self.get_para_rep(ctx, self.ctxRU)
        
        #question representation
        Qs = self.get_para_rep(qn, self.qnRU)
        
        out_ins = []
        for i in range(Ds.shape[0]):
            D = Ds[i]
            Q = Qs[i]
            
            #attention
            affinity = F.matmul(D.T, Q)
            A_Q = F.softmax(affinity)
            A_D = F.softmax(affinity.T)

            C_Q = F.matmul(D, A_Q)
            C_D = F.matmul(F.concat((Q, C_Q), axis=0), A_D)
            
            out_ins.append(F.concat((D, C_D), axis=0).T)
        out_ins = F.transpose(F.dstack(out_ins), (0,2,1))

        #output
        h_fwd = []
        for fout in out_ins:
            h_fwd.append(self.outFwd(fout))
        h_fwd = F.dstack(h_fwd)

        h_bwd = []
        for bout in out_ins[::-1]:
            h_bwd.append(self.outBwd(bout))
        h_bwd = F.dstack(h_bwd)
        
        u_in = F.transpose(F.concat((h_fwd, h_bwd)), (0,2,1))
        U = self.outLinear(u_in.reshape(-1, 2*self.h_size))
        return U.reshape(Ds.shape[0], -1, self.h_size)

In [33]:
# Smoke test: encode the first five training examples with a freshly
# initialised encoder and print the shape of the result. `ctx`, `qn` and `U`
# are reused by the cells that follow.
ctx, qn, ans_start, ans_end = get_batch(0, 5, train)

encoder = CoattentionEncoder(WORD_VECTOR_SIZE, H_SIZE)

U = encoder(ctx, qn)
print U.shape

(5, 207, 35)


In [34]:
class Highway(Chain):
    def __init__(self, h_size, use_gpu=False):
        super(Highway, self).__init__()
        
        self.h_size = h_size
        self.use_gpu = use_gpu
                
        with self.init_scope():
            self.MLP = L.Linear(3*h_size, h_size, nobias=True)
            self.M1 = L.Linear(2*h_size, h_size)
            self.M2 = L.Linear(h_size, h_size)
            self.M3 = L.Linear(2*h_size, 1)
            
            if use_gpu:
                print "Highway uses GPU", self.use_gpu
                self.MLP.to_gpu()
                self.M1.to_gpu()
                self.M2.to_gpu()
                self.M3.to_gpu()
            
    def __call__(self, U, h, us, ue):
        if self.use_gpu:
            U.to_gpu()
            h.to_gpu()
            us.to_gpu()
            ue.to_gpu()
        
        r = F.tanh(self.MLP(F.hstack([h, us, ue])))
        rs = []
        for i in range(U.shape[0]):
            rs.append(F.broadcast_to(r[i], U[i].shape))
        r = F.transpose(F.dstack(rs), (2,0,1))
        
        m_in = F.concat((U, r), axis=2).reshape(-1, 2*self.h_size)
        m1 = self.M1(m_in)
        m2 = self.M2(m1)
        m3 = self.M3(F.concat((m1,m2)))
        
        return m3.reshape(U.shape[0], -1, 1)

In [35]:
# Smoke test for Highway on the encoding `U` from the encoder cell: a zero
# hidden state, with the first and last positions of U standing in for the
# start/end vectors. The batch size 5 is hard-coded to match that cell.
highway = Highway(H_SIZE)

h = Variable(np.zeros((5, H_SIZE), dtype=np.float32))
us = U[:,0].reshape(5, -1)
ue = U[:,-1].reshape(5, -1)

alpha = highway(U, h, us, ue)
print alpha.shape

(5, 207, 1)


In [36]:
class DynamicPointingDecoder(Chain):
    def __init__(self, h_size, use_gpu=False):
        super(DynamicPointingDecoder, self).__init__()
        self.use_gpu = use_gpu
                
        with self.init_scope():
            self.dec_state = L.LSTM(2*h_size, h_size)
            self.HwayStart = Highway(h_size, use_gpu)
            self.HwayEnd = Highway(h_size, use_gpu)
            
            if self.use_gpu:
                print "DynamicPointincDecoded uses GPU", self.use_gpu
                self.dec_state.to_gpu()
                self.HwayStart.to_gpu()
                self.HwayEnd.to_gpu()
            
    def reset_state(self):
        self.dec_state.reset_state()
            
    def __call__(self, U, us, ue):
        if self.use_gpu:
            U.to_gpu()
            us.to_gpu()
            ue.to_gpu()
        
        h = self.dec_state(F.concat((us,ue)))
        alpha = self.HwayStart(U, h, us, ue)
        s = F.argmax(alpha, axis=1).data.reshape(-1)
        beta = self.HwayEnd(U, h, U[range(U.shape[0]), s], ue)
        
        return alpha, beta

In [38]:
# Smoke test for the decoder, reusing `U`, `us` and `ue` from the cells above:
# print the score shapes and the arg-max position of each example.
decoder = DynamicPointingDecoder(H_SIZE)

alpha, beta = decoder(U, us, ue)
print alpha.shape, F.argmax(alpha, axis=1).data.reshape(1,-1)
print beta.shape, F.argmax(beta, axis=1).data.reshape(1,-1)

(5, 207, 1) [[202 152 168   0 201]]
(5, 207, 1) [[ 1 79 65  0 54]]


In [39]:
class SquadNet(Chain):
    def __init__(self, wordvec_size, h_size, use_gpu=False):
        super(SquadNet, self).__init__()
        self.use_gpu = use_gpu
                
        with self.init_scope():
            self.encoder = CoattentionEncoder(wordvec_size, h_size, use_gpu)
            self.decoder = DynamicPointingDecoder(h_size, use_gpu)
            
            if use_gpu:
                print "SquadNet uses GPU", self.use_gpu
                self.encoder.to_gpu()
                self.decoder.to_gpu()
            
    def reset_state(self):
        self.encoder.reset_state()
        self.decoder.reset_state()
            
    def __call__(self, ctx, qn): 
        U = self.encoder(ctx, qn)
        
        start = np.zeros(U.shape[0], 'i')
        end = np.zeros(U.shape[0], 'i') - 1        
        for i in range(3):            
            us = U[range(U.shape[0]), start]
            ue = U[range(U.shape[0]), end]
            alpha, beta = self.decoder(U, us, ue)
            
            start = F.argmax(alpha, axis=1).data.reshape(-1)
            end = F.argmax(beta, axis=1).data.reshape(-1)
        return alpha, beta

In [40]:
# Smoke test of the full model on the `ctx`/`qn` batch built earlier: print
# the score shapes and the arg-max start/end position of each example.
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE)
alpha, beta = model(ctx, qn)
print alpha.shape, F.argmax(alpha, axis=1).data.reshape(1,-1)
print beta.shape, F.argmax(beta, axis=1).data.reshape(1,-1)

(5, 207, 1) [[205 109 182  29 206]]
(5, 207, 1) [[ 78  78 155  45 108]]


# Create Model

In [41]:
# Create the model that is actually trained (this rebinds `model`, replacing
# the smoke-test instance) and attach an Adam optimizer to it.
opt = optimizers.Adam(alpha=1e-3)
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE, USE_GPU)
if USE_GPU:
    model.to_gpu()
opt.setup(model)

# Define Training Loop

In [67]:
def train_model(model, opt, epoch_start, epoch_end, batch_size, print_interval):
    for epoch in range(epoch_start, epoch_end):
        print "Epoch", epoch + 1, "/", epoch_end
        startTime = time.time()
        epochScore = 0

        opt.new_epoch()
        
        interval_loss = 0
        interval_start = time.time()
        for i in range(0, len(train), batch_size):
            try:
                ctx, qn, ans_start, ans_end = get_batch(i, batch_size, train)
                if USE_GPU:
                    ans_start.to_gpu()
                    ans_end.to_gpu()

                model.reset_state()
                pred_start, pred_end = model(ctx, qn)

                loss_start = F.softmax_cross_entropy(pred_start, ans_start)
                loss_end = F.softmax_cross_entropy(pred_end, ans_end)
                loss = loss_start + loss_end

                interval_loss += loss.data
                if i % print_interval == 0:
                    print i, "/", len(train), ":", \
                          interval_loss, \
                          "(" + str(time.time() - interval_start) + "s)"
                    interval_loss = 0
                    interval_start = time.time()
                
                s = F.argmax(pred_start, axis=1).data
                e = F.argmax(pred_end, axis=1).data
                for j in range(s.shape[0]):
                    if s[j] == ans_start.data[j] and e[j] == ans_end.data[j]:
                        epochScore += 1

                model.cleargrads()
                loss.backward()

                opt.update()
            except IndexError as e:
                print "Error on train index " + str(i) + ":", e
        
        valLoss = 0
        valScore = 0
        for i in range(0, len(val), batch_size):
            try:
                ctx, qn, ans_start, ans_end = get_batch(i, batch_size, val)
                if USE_GPU:
                    ans_start.to_gpu()
                    ans_end.to_gpu()

                model.reset_state()
                pred_start, pred_end = model(ctx, qn)

                loss_start = F.softmax_cross_entropy(pred_start, ans_start)
                loss_end = F.softmax_cross_entropy(pred_end, ans_end)
                valLoss += loss_start + loss_end
                
                s = F.argmax(pred_start, axis=1).data
                e = F.argmax(pred_end, axis=1).data
                for j in range(s.shape[0]):
                    if s[j] == ans_start.data[j] and e[j] == ans_end.data[j]:
                        valScore += 1
            except IndexError as e:
                print "Error on val index " + str(i) + ":", e
        
        epochAcc = float(epochScore) / len(train)
        valAcc = float(valScore) / len(val)
        
        serializers.save_npz('gpu-epoch' + str(epoch+1) + '.model', model)
        print "Epoch completed in", time.time() - startTime, "seconds"
        print "Train Acc:", epochAcc, "Val Acc:", valAcc, "Val Loss:", valLoss      

# Train Model

In [70]:
# Positional arguments: epoch_start=0, epoch_end=1, batch_size=50,
# print_interval=100.
train_model(model, opt, 0, 1, 50, 100)

Epoch 1 / 1
0 / 200 : 9.1655960083 (1.29957699776s)
100 / 200 : 18.3419818878 (11.838588953s)
Epoch completed in 34.8875339031 seconds
Train Acc: 0.02 Val Acc: 0.01 Val Loss: variable(9.223145484924316)
Epoch 2 / 1
0 / 200 : 9.01286506653 (1.32164788246s)
100 / 200 : 17.8375654221 (11.5460650921s)
Epoch completed in 33.1986849308 seconds
Train Acc: 0.01 Val Acc: 0.01 Val Loss: variable(9.216375350952148)


In [101]:
# Persist the current weights of `model` to one_epoch.model.
serializers.save_npz('one_epoch.model', model)

# Output Answers

In [73]:
def get_test_batch(i, batch_size, data):
    """Assemble the zero-padded test mini-batch ``data[i:i + batch_size]``.

    Args:
        i: index of the first example of the batch.
        batch_size: maximum number of examples in the batch; the final batch
            of a dataset may hold fewer.
        data: sequence of ``(ctx_vec, qn_vec, question_id, context_string)``
            rows where the two vectors are arrays of shape ``(1, length)``.

    Returns:
        ``(contexts, questions, ids, ctxStrs)``: two float32 chainer Variables
        of shape ``(n, longest_context)`` and ``(n, longest_question)``, then
        the list of question ids and the list of raw context strings, where
        ``n`` is the number of examples actually available from index ``i``.
    """
    j = min(i + batch_size, len(data))
    # Size everything by the rows really present: sizing by batch_size made
    # the final, shorter batch fail with IndexError in the fill loop.
    n_rows = max(j - i, 0)

    ctx = []
    qn = []
    ids = []
    ctxStrs = []

    cmax = 0
    qmax = 0
    for k in range(i, j):
        c, q, qn_id, ctx_str = data[k]
        ctx.append(c)
        qn.append(q)
        ids.append(qn_id)
        ctxStrs.append(ctx_str)

        cmax = max(cmax, c.shape[1])
        # Track the longest *question*. This used to take max(cmax, ...),
        # which padded every question out to the context length.
        qmax = max(qmax, q.shape[1])

    cVec = np.zeros((n_rows, cmax), dtype=np.float32)
    qVec = np.zeros((n_rows, qmax), dtype=np.float32)
    # A separate loop variable keeps the `i` parameter intact.
    for row in range(n_rows):
        cVec[row, 0:ctx[row].shape[1]] = ctx[row]
        qVec[row, 0:qn[row].shape[1]] = qn[row]

    return Variable(cVec), \
           Variable(qVec), \
           ids, \
           ctxStrs


In [71]:
def normalize_answer(s):
    """Canonicalise an answer string for comparison.

    Steps, in order: lower-case, drop every ``string.punctuation`` character,
    replace the standalone words "a", "an" and "the" with a space, then
    collapse all whitespace runs to single spaces and trim the ends.
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punc = ''.join(ch for ch in lowered if ch not in punctuation)

    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)

    return ' '.join(without_articles.split())

In [None]:
# Restore the weights saved in one_epoch.model into the existing `model`.
serializers.load_npz('one_epoch.model', model)

In [82]:
test_batch_size = 100
test_print_interval = 1000

f = open('pred.csv', 'wb')
out = csv.writer(f)
out.writerow(["Id", "Answer"])

startTime = time.time()

for i in range(0, len(test), test_batch_size):
    ctx, qn, qnId, ctxStr = get_test_batch(i, test_batch_size, test)
    model.reset_state()
    start, end = model(ctx, qn)

    for j in range(len(qnId)):
        contextTokens = word_tokenize(ctxStr[j])

        s = F.argmax(start[j]).data
        e = F.argmax(end[j]).data
        
        s = min(s, len(contextTokens)-1)
        e = max(e, s)
        e = min(e, len(contextTokens)-1)        
        
        ans = ""
        for k in range(s, e + 1):
            ans += contextTokens[k] + " "
        
        out.writerow([qnId[j], normalize_answer(ans).encode('utf-8')])
    
    if i % test_print_interval == 0:
        print i, "/", len(test), "(" + str(time.time() - startTime) + "s)"
        startTime = time.time()
    
f.close()

0 / 200 (3.02272391319s)
