# Imports and Settings

In [19]:
%matplotlib inline

import csv, json, string, re, time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk import word_tokenize

import chainer
from chainer import Chain, Variable, Parameter
from chainer import iterators, optimizers, serializers
import chainer.initializers as I
import chainer.functions as F
import chainer.links as L

# Dimensionality of the GloVe vectors; it selects which glove.6B.<N>d.txt file
# is read and is the width of one word inside every flattened text vector.
WORD_VECTOR_SIZE = 300
# Hidden-state size handed to every network component defined below.
H_SIZE = 200
# When True, the model built in the "Create Model" section is moved to the GPU.
USE_GPU = True

# Read Word Vectors

In [3]:
# Load the pre-trained GloVe table into a dict mapping
# token -> float32 array of shape (1, WORD_VECTOR_SIZE).
glove = {}
glove_path = 'glove/glove.6B.' + str(WORD_VECTOR_SIZE) + 'd.txt'
# The context manager closes the handle even if a malformed row raises;
# the original left the file open for the life of the kernel.
with open(glove_path, 'rb') as f:
    # QUOTE_NONE: quote characters appear as ordinary tokens in the GloVe file
    # and must not be interpreted as CSV quoting.
    reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
    for row in reader:
        # First field is the token, the remaining fields are its components.
        # A concrete list (not a lazy map object) is handed to np.array.
        components = [float(value) for value in row[1:]]
        glove[row[0]] = np.array(components, dtype=np.float32).reshape(1, -1)
len(glove)

400000

# Read Dataset

In [4]:
def text2vec(text):
    """Turn a piece of text into one flat row of concatenated word vectors.

    The text is tokenized with ``word_tokenize``; every token is looked up in
    the module-level ``glove`` dict and tokens that are missing are replaced by
    a zero vector of width ``WORD_VECTOR_SIZE``.

    Args:
        text: The string to embed.

    Returns:
        A float64 array of shape ``(1, n_tokens * WORD_VECTOR_SIZE)``; shape
        ``(1, 0)`` when the text has no tokens.
    """
    # One shared zero vector for every out-of-vocabulary token.
    unknown = np.zeros((1, WORD_VECTOR_SIZE), dtype=np.float32)
    pieces = [np.ravel(glove.get(tok, unknown)) for tok in word_tokenize(text)]
    if not pieces:
        # np.concatenate rejects an empty list; keep the old (1, 0) result.
        return np.array([]).reshape(1, -1)
    # A single concatenate replaces the per-token np.append, which re-copied the
    # whole accumulated array on every iteration (quadratic in text length).
    # The float64 cast keeps the dtype the np.append version produced.
    return np.concatenate(pieces).astype(np.float64).reshape(1, -1)

def answerpos(context, answer, answer_start):
    """Convert a character-offset answer span into inclusive token indices.

    Args:
        context: Full paragraph text.
        answer: Answer text.
        answer_start: Character offset in ``context`` where the answer begins.

    Returns:
        ``(first, last)``: ``first`` is the number of tokens in the text that
        precedes the answer, and ``last`` is ``first`` plus the number of
        answer tokens minus one.
    """
    tokens_before = word_tokenize(context[:answer_start])
    answer_tokens = word_tokenize(answer)

    first = len(tokens_before)
    last = first + len(answer_tokens) - 1
    return first, last

In [5]:
# Build the training set: one (context vector, question vector, answer start
# token, answer end token) tuple per question.
train = []
# Read the whole file inside a context manager so the handle is closed
# (the original open(...).read() never closed it).
with open('dataset/train.json', 'rb') as train_file:
    train_json = json.load(train_file)

for jsonRow in train_json:
    for paragraph in jsonRow['paragraphs']:
        # Embedded once per paragraph and shared by all of its questions.
        # NOTE(review): questions are lower-cased before embedding but the
        # context is not -- confirm this asymmetry is intended.
        ctxVec = text2vec(paragraph['context'])

        for qnaJson in paragraph['qas']:
            qnVec = text2vec(qnaJson['question'].lower())

            ansStart, ansEnd = answerpos(paragraph['context'],
                                         qnaJson['answer']['text'],
                                         qnaJson['answer']['answer_start'])

            train.append((ctxVec, qnVec, ansStart, ansEnd))

len_train = len(train)
print(len_train)

61379


In [6]:
# Build the test set: one dict per paragraph holding the raw context, its
# embedding, and a list of {id, question, questionVec} dicts.
test = []
# Context manager guarantees the file handle is released
# (the original open(...).read() never closed it).
with open('dataset/test.json', 'rb') as test_file:
    test_json = json.load(test_file)

for jsonRow in test_json:
    for paragraph in jsonRow['paragraphs']:
        data = {
            'context': paragraph['context'],
            'contextVec': text2vec(paragraph['context']),
            'qna': [],
        }
        for qnaJson in paragraph['qas']:
            data['qna'].append({
                'id': qnaJson['id'],
                'question': qnaJson['question'],
                # Lower-cased before embedding, as in the training questions.
                'questionVec': text2vec(qnaJson['question'].lower()),
            })
        test.append(data)

print(len(test))

7984


In [7]:
def get_batch(i, batch_size):
    """Assemble a zero-padded training batch starting at example ``i``.

    Args:
        i: Index into the module-level ``train`` list of the first example.
        batch_size: Maximum number of examples; the final batch of an epoch
            may contain fewer.

    Returns:
        A 4-tuple of chainer Variables:
        contexts ``(n, cmax)`` float32, questions ``(n, qmax)`` float32,
        answer start tokens ``(n, 1)`` int32 and answer end tokens ``(n, 1)``
        int32, where ``n`` is the number of examples actually taken and
        ``cmax`` / ``qmax`` are the widest context / question in the batch.
    """
    j = min(i + batch_size, len_train)

    ctx = []
    qn = []
    ans_start = []
    ans_end = []

    cmax = 0
    qmax = 0
    for c, q, s, e in train[i:j]:
        ctx.append(c)
        qn.append(q)
        ans_start.append(s)
        ans_end.append(e)

        cmax = max(cmax, c.shape[1])
        # Fixed: this compared against cmax, which padded every question out
        # to the width of the longest context.
        qmax = max(qmax, q.shape[1])

    # Size the arrays by the rows actually collected. Using the requested
    # batch_size raised IndexError on the last, shorter batch of an epoch.
    n_rows = len(ctx)
    cVec = np.zeros((n_rows, cmax), dtype=np.float32)
    qVec = np.zeros((n_rows, qmax), dtype=np.float32)
    # A separate index name keeps the ``i`` parameter from being shadowed.
    for row in range(n_rows):
        cVec[row, 0:ctx[row].shape[1]] = ctx[row]
        qVec[row, 0:qn[row].shape[1]] = qn[row]

    return Variable(cVec), \
           Variable(qVec), \
           Variable(np.array(ans_start, dtype=np.int32)).reshape(-1,1), \
           Variable(np.array(ans_end, dtype=np.int32)).reshape(-1,1)

# Define Network
* RNN Tutorial: http://docs.chainer.org/en/stable/tutorial/recurrentnet.html
* Training Tutorial: http://docs.chainer.org/en/stable/tutorial/train_loop.html
* Attention: https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/
* Pointer: http://fastml.com/introduction-to-pointer-networks/

In [8]:
class CoattentionEncoder(Chain):
    """Encode a batch of (context, question) pairs into one vector per context word.

    Both inputs are 2-D batches whose rows are word vectors laid end to end
    (``wordvec_size`` values per word). Each text is run through its own LSTM,
    the two are combined with an affinity-based attention, and the result is
    passed through a forward and a backward LSTM plus a linear projection.
    """

    def __init__(self, wordvec_size, h_size, use_gpu=False):
        """Create the sub-links.

        Args:
            wordvec_size: Number of values per word in the input rows.
            h_size: Hidden size of every LSTM and of the output vectors.
            use_gpu: When True, move every sub-link to the GPU and move each
                input word slice to the GPU before it is fed to an LSTM.
        """
        super(CoattentionEncoder, self).__init__()

        self.h_size = h_size
        self.wordvec_size = wordvec_size
        self.use_gpu = use_gpu

        with self.init_scope():
            self.ctxRU = L.LSTM(wordvec_size, h_size)

            self.qnRU = L.LSTM(wordvec_size, h_size)
            # NOTE(review): qnLinear is registered but never applied in
            # __call__; kept so the set of saved parameters is unchanged.
            self.qnLinear = L.Linear(h_size, h_size)

            self.outFwd = L.LSTM(3*h_size, h_size)
            self.outBwd = L.LSTM(3*h_size, h_size)
            self.outLinear = L.Linear(2*h_size, h_size)

            if use_gpu:
                print("CodynamicAttention uses GPU %s" % self.use_gpu)
                self.ctxRU.to_gpu()
                self.qnRU.to_gpu()
                self.qnLinear.to_gpu()
                self.outFwd.to_gpu()
                self.outBwd.to_gpu()
                self.outLinear.to_gpu()

    def reset_state(self):
        """Clear the recurrent state of all four LSTMs (call before each batch)."""
        self.ctxRU.reset_state()
        self.qnRU.reset_state()
        self.outFwd.reset_state()
        self.outBwd.reset_state()

    def get_para_rep(self, para, ru):
        """Run ``ru`` over ``para`` one word at a time.

        Args:
            para: Variable of shape ``(batch, n_words * wordvec_size)``.
            ru: The stateful LSTM link to step through the words.

        Returns:
            Variable of shape ``(batch, h_size, n_words)`` holding the LSTM
            output after each word.
        """
        P = []
        for i in range(0, para.shape[1], self.wordvec_size):
            word = para[:, i:i+self.wordvec_size]
            if self.use_gpu:
                word.to_gpu()
            P.append(ru(word))
        # The original wrapped this in F.transpose(..., (0, 1, 2)), which is
        # the identity permutation; dropped as a no-op.
        return F.dstack(P)

    def __call__(self, ctx, qn):
        """Encode a batch.

        Args:
            ctx: Context batch, ``(batch, n_ctx_words * wordvec_size)``.
            qn: Question batch, ``(batch, n_qn_words * wordvec_size)``.

        Returns:
            Variable of shape ``(batch, n_ctx_words, h_size)``.
        """
        # context representation: (batch, h_size, n_ctx_words)
        Ds = self.get_para_rep(ctx, self.ctxRU)

        # question representation: (batch, h_size, n_qn_words)
        Qs = self.get_para_rep(qn, self.qnRU)

        out_ins = []
        for i in range(Ds.shape[0]):
            D = Ds[i]
            Q = Qs[i]

            # attention: affinity is (n_ctx_words, n_qn_words)
            affinity = F.matmul(D.T, Q)
            A_Q = F.softmax(affinity)
            A_D = F.softmax(affinity.T)

            C_Q = F.matmul(D, A_Q)
            C_D = F.matmul(F.concat((Q, C_Q), axis=0), A_D)

            # (n_ctx_words, 3*h_size) for this example
            out_ins.append(F.concat((D, C_D), axis=0).T)
        # time-major: (n_ctx_words, batch, 3*h_size)
        out_ins = F.transpose(F.dstack(out_ins), (0,2,1))
        n_steps = out_ins.shape[0]

        # output: forward pass over the context positions
        h_fwd = []
        for t in range(n_steps):
            h_fwd.append(self.outFwd(out_ins[t]))
        h_fwd = F.dstack(h_fwd)

        # backward pass: feed positions last-to-first ...
        h_bwd = []
        for t in reversed(range(n_steps)):
            h_bwd.append(self.outBwd(out_ins[t]))
        # ... then restore chronological order before stacking. Without this
        # the forward state of position t was paired with the backward state
        # of position n_steps-1-t.
        # NOTE(review): checkpoints trained with the old, misaligned pairing
        # still load (parameter shapes are unchanged) but should be retrained.
        h_bwd = F.dstack(h_bwd[::-1])

        u_in = F.transpose(F.concat((h_fwd, h_bwd)), (0,2,1))
        U = self.outLinear(u_in.reshape(-1, 2*self.h_size))
        return U.reshape(Ds.shape[0], -1, self.h_size)

In [9]:
# Smoke test: encode the first five training examples.
ctx, qn, ans_start, ans_end = get_batch(0, 5)

# use_gpu is left at its default (False), so no to_gpu() calls are made.
encoder = CoattentionEncoder(WORD_VECTOR_SIZE, H_SIZE)

# U is reshaped by the encoder to (batch, -1, h_size); the printed shape below
# confirms the batch of 5 and the hidden size of 200.
U = encoder(ctx, qn)
print U.shape

(5, 125, 200)


In [10]:
class Highway(Chain):
    """Score every position of an encoded context.

    The decoder state and the current start/end vectors are squashed into one
    vector, which is paired with every position of ``U`` and passed through
    three linear layers to produce a single score per position.
    """

    def __init__(self, h_size, use_gpu=False):
        """Create the linear layers.

        Args:
            h_size: Hidden size of ``U`` and of the state vectors.
            use_gpu: When True, move the layers to the GPU and move the call
                arguments to the GPU on every invocation.
        """
        super(Highway, self).__init__()

        self.use_gpu = use_gpu
        self.h_size = h_size

        with self.init_scope():
            self.MLP = L.Linear(3 * h_size, h_size, nobias=True)
            self.M1 = L.Linear(2 * h_size, h_size)
            self.M2 = L.Linear(h_size, h_size)
            self.M3 = L.Linear(2 * h_size, 1)

            if use_gpu:
                print("Highway uses GPU %s" % self.use_gpu)
                for layer in (self.MLP, self.M1, self.M2, self.M3):
                    layer.to_gpu()

    def __call__(self, U, h, us, ue):
        """Return one score per context position.

        Args:
            U: Encoded context, ``(batch, n_positions, h_size)``.
            h: Decoder state, ``(batch, h_size)``.
            us: Vector at the current start estimate, ``(batch, h_size)``.
            ue: Vector at the current end estimate, ``(batch, h_size)``.

        Returns:
            Variable of shape ``(batch, n_positions, 1)``.
        """
        if self.use_gpu:
            for var in (U, h, us, ue):
                var.to_gpu()

        n_batch = U.shape[0]

        # One summary vector per example from the state and both estimates.
        summary = F.tanh(self.MLP(F.hstack([h, us, ue])))

        # Repeat each example's summary across all of its positions.
        tiled = [F.broadcast_to(summary[b], U[b].shape) for b in range(n_batch)]
        summary_seq = F.transpose(F.dstack(tiled), (2, 0, 1))

        # Flatten (batch, position) so the linear layers see one row per position.
        paired = F.concat((U, summary_seq), axis=2)
        flat = paired.reshape(-1, 2 * self.h_size)

        hidden1 = self.M1(flat)
        hidden2 = self.M2(hidden1)
        scores = self.M3(F.concat((hidden1, hidden2)))

        return scores.reshape(n_batch, -1, 1)

In [11]:
# Smoke test for Highway on the encoder output U from the cell above.
highway = Highway(H_SIZE)

# Zero vector standing in for the decoder state; 5 matches the batch size used above.
h = Variable(np.zeros((5, H_SIZE), dtype=np.float32))
# Start/end estimates: the encodings at the first and last context positions.
us = U[:,0].reshape(5, -1)
ue = U[:,-1].reshape(5, -1)

# One score per context position for each of the 5 examples.
alpha = highway(U, h, us, ue)
print alpha.shape

(5, 125, 1)


In [12]:
class DynamicPointingDecoder(Chain):
    """One decoding step that scores every position as answer start and end.

    An LSTM keeps the decoder state; two separate ``Highway`` networks score
    the start and the end positions.
    """

    def __init__(self, h_size, use_gpu=False):
        """Create the state LSTM and the two scoring networks.

        Args:
            h_size: Hidden size of the encoded context.
            use_gpu: When True, move the sub-links to the GPU and move the
                call arguments to the GPU on every invocation.
        """
        super(DynamicPointingDecoder, self).__init__()
        self.use_gpu = use_gpu

        with self.init_scope():
            self.dec_state = L.LSTM(2 * h_size, h_size)
            self.HwayStart = Highway(h_size, use_gpu)
            self.HwayEnd = Highway(h_size, use_gpu)

            if self.use_gpu:
                print("DynamicPointincDecoded uses GPU %s" % self.use_gpu)
                for link in (self.dec_state, self.HwayStart, self.HwayEnd):
                    link.to_gpu()

    def reset_state(self):
        """Clear the recurrent state of the decoder LSTM."""
        self.dec_state.reset_state()

    def __call__(self, U, us, ue):
        """Advance the decoder one step.

        Args:
            U: Encoded context, ``(batch, n_positions, h_size)``.
            us: Vectors at the current start estimates, ``(batch, h_size)``.
            ue: Vectors at the current end estimates, ``(batch, h_size)``.

        Returns:
            ``(alpha, beta)``: start and end scores as returned by the two
            ``Highway`` networks.
        """
        if self.use_gpu:
            for var in (U, us, ue):
                var.to_gpu()

        state = self.dec_state(F.concat((us, ue)))

        alpha = self.HwayStart(U, state, us, ue)

        # The end scorer is conditioned on the start position just predicted,
        # taken as raw indices so no gradient flows through the argmax.
        best_start = F.argmax(alpha, axis=1).data.reshape(-1)
        new_us = U[range(U.shape[0]), best_start]
        beta = self.HwayEnd(U, state, new_us, ue)

        return alpha, beta

In [13]:
# Smoke test for one decoder step, reusing U, us and ue from the cells above.
decoder = DynamicPointingDecoder(H_SIZE)

alpha, beta = decoder(U, us, ue)
# Print each score tensor's shape and the highest-scoring position per example.
print alpha.shape, F.argmax(alpha, axis=1).data.reshape(1,-1)
print beta.shape, F.argmax(beta, axis=1).data.reshape(1,-1)

(5, 125, 1) [[0 0 0 0 0]]
(5, 125, 1) [[123 123 123 123 123]]


In [14]:
class SquadNet(Chain):
    """Full model: a ``CoattentionEncoder`` followed by an iterated ``DynamicPointingDecoder``."""

    def __init__(self, wordvec_size, h_size, use_gpu=False):
        """Create the encoder and decoder.

        Args:
            wordvec_size: Number of values per word in the input rows.
            h_size: Hidden size shared by encoder and decoder.
            use_gpu: Forwarded to both sub-networks; when True they are also
                moved to the GPU here.
        """
        super(SquadNet, self).__init__()
        self.use_gpu = use_gpu

        with self.init_scope():
            self.encoder = CoattentionEncoder(wordvec_size, h_size, use_gpu)
            self.decoder = DynamicPointingDecoder(h_size, use_gpu)

            if use_gpu:
                print("SquadNet uses GPU %s" % self.use_gpu)
                for part in (self.encoder, self.decoder):
                    part.to_gpu()

    def reset_state(self):
        """Clear the recurrent state of the encoder and the decoder."""
        self.encoder.reset_state()
        self.decoder.reset_state()

    def __call__(self, ctx, qn):
        """Score every context position as answer start and answer end.

        Args:
            ctx: Context batch as accepted by the encoder.
            qn: Question batch as accepted by the encoder.

        Returns:
            ``(alpha, beta)`` from the last of three decoder steps.
        """
        U = self.encoder(ctx, qn)
        n_batch = U.shape[0]

        # Initial guess: start at the first position, end at the last (index -1).
        start = np.zeros(n_batch, 'i')
        end = np.zeros(n_batch, 'i') - 1

        # Three refinement steps; each step feeds its argmax picks to the next.
        for _ in range(3):
            us = U[range(n_batch), start]
            ue = U[range(n_batch), end]
            alpha, beta = self.decoder(U, us, ue)

            start = F.argmax(alpha, axis=1).data.reshape(-1)
            end = F.argmax(beta, axis=1).data.reshape(-1)

        return alpha, beta

In [15]:
# Smoke test for the full model on the 5-example batch (ctx, qn) built earlier;
# use_gpu is left at its default of False.
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE)
alpha, beta = model(ctx, qn)
# Print each score tensor's shape and the highest-scoring position per example.
print alpha.shape, F.argmax(alpha, axis=1).data.reshape(1,-1)
print beta.shape, F.argmax(beta, axis=1).data.reshape(1,-1)

(5, 125, 1) [[108 108 108 108 108]]
(5, 125, 1) [[51 51 51 51 51]]


# Create Model

In [16]:
# Adam optimizer with step size 1e-3.
opt = optimizers.Adam(alpha=1e-3)
# Replaces the CPU smoke-test model with the instance that will be trained.
model = SquadNet(WORD_VECTOR_SIZE, H_SIZE, USE_GPU)
if USE_GPU:
    model.to_gpu()
# Attach the optimizer to the model so opt.update() acts on its parameters.
opt.setup(model)

CodynamicAttention uses GPU True
Highway uses GPU True
Highway uses GPU True
DynamicPointincDecoded uses GPU True
SquadNet uses GPU True


# Define Training Loop

In [17]:
def train_model(model, opt, epoch_start, epoch_end, batch_size, print_interval):
    """Train ``model`` on the module-level ``train`` set, saving after every epoch.

    Args:
        model: Network with ``reset_state()``, ``cleargrads()`` and a call
            signature ``model(ctx, qn) -> (start_scores, end_scores)``.
        opt: Optimizer already set up on ``model``.
        epoch_start: Number of the first epoch to run.
        epoch_end: Number of the last epoch to run (inclusive).
        batch_size: Examples per batch.
        print_interval: The accumulated loss is printed whenever the batch's
            starting example index is a multiple of this value.

    Side effects:
        Writes ``gpu-epoch<N>.model`` (NPZ) at the end of every epoch.
    """
    for epoch in range(epoch_start, epoch_end+1):
        # Fixed: the label used ``epoch + 1`` and an undefined global
        # ``n_epoch``, which raised NameError on a fresh kernel.
        print("Epoch %s / %s" % (epoch, epoch_end))
        startTime = time.time()

        opt.new_epoch()

        interval_loss = 0
        interval_start = time.time()
        for i in range(0, len_train, batch_size):
            try:
                ctx, qn, ans_start, ans_end = get_batch(i, batch_size)
                if USE_GPU:
                    ans_start.to_gpu()
                    ans_end.to_gpu()

                # The LSTMs are stateful; start every batch from a clean state.
                model.reset_state()
                pred_start, pred_end = model(ctx, qn)

                loss_start = F.softmax_cross_entropy(pred_start, ans_start)
                loss_end = F.softmax_cross_entropy(pred_end, ans_end)
                loss = loss_start + loss_end

                interval_loss += loss.data
                if i % print_interval == 0:
                    print("%s / %s : %s (%ss)" % (i, len_train, interval_loss,
                                                   time.time() - interval_start))
                    interval_loss = 0
                    interval_start = time.time()

                model.cleargrads()
                loss.backward()

                opt.update()
            except IndexError as e:
                # Best effort: report the failing batch and carry on with the
                # next one instead of aborting the epoch.
                print("Error on index " + str(i) + ": " + str(e))
        print("Epoch completed in %s seconds" % (time.time() - startTime))
        serializers.save_npz('gpu-epoch' + str(epoch) + '.model', model)


# Train Model

In [18]:
# Run epochs 1 through 5 (inclusive) with 100 examples per batch; the
# accumulated loss is printed whenever the example offset is a multiple of 1000.
train_model(model, opt, 1, 5, 100, 1000)

Epoch 1 / 5
0 / 61379 : 10.5406913757 (2.26723814011s)
1000 / 61379 : 98.0884552002 (46.7571570873s)
2000 / 61379 : 96.8072509766 (44.9427349567s)
3000 / 61379 : 95.5199508667 (51.7055549622s)
4000 / 61379 : 87.2269592285 (50.9664750099s)
5000 / 61379 : 86.1722946167 (48.1180589199s)
6000 / 61379 : 86.2194137573 (44.4488229752s)
7000 / 61379 : 83.4306411743 (52.9158499241s)
8000 / 61379 : 90.6543731689 (55.6347739697s)
9000 / 61379 : 85.1919784546 (44.203099966s)
10000 / 61379 : 87.9598083496 (50.2284829617s)
11000 / 61379 : 80.7679290771 (44.71124506s)
12000 / 61379 : 88.1275863647 (55.7020790577s)
13000 / 61379 : 80.8079452515 (45.524091959s)
14000 / 61379 : 85.689125061 (73.5832419395s)
15000 / 61379 : 82.4831924438 (48.8700990677s)
16000 / 61379 : 79.7526016235 (47.0941290855s)
17000 / 61379 : 82.8972396851 (54.9411220551s)
18000 / 61379 : 80.2893676758 (61.2505581379s)
19000 / 61379 : 80.5836029053 (50.5075311661s)
20000 / 61379 : 79.1858825684 (62.0834269524s)
21000 / 61379 : 73.

NameError: global name 'serializers' is not defined

In [20]:
# Save the current model parameters to 'gpu.model' via Chainer's NPZ serializer.
serializers.save_npz('gpu.model', model)

# Output Answers

In [21]:
def get_test_batch(i):
    """Assemble one zero-padded batch holding every question of test paragraph ``i``.

    Args:
        i: Index into the module-level ``test`` list.

    Returns:
        ``(contexts, questions)`` as chainer Variables of shape
        ``(n_questions, cmax)`` and ``(n_questions, qmax)``, both float32.
        The paragraph's context vector is repeated on every row; ``qmax`` is
        the widest question of the paragraph.
    """
    ctx = []
    qn = []

    cmax = 0
    qmax = 0

    testData = test[i]
    for qna in testData['qna']:
        ctx.append(testData['contextVec'])
        qn.append(qna['questionVec'])
        cmax = max(cmax, testData['contextVec'].shape[1])
        # Fixed: this compared against cmax, which padded every question out
        # to the width of the context.
        qmax = max(qmax, qna['questionVec'].shape[1])

    batch_size = len(qn)
    cVec = np.zeros((batch_size, cmax), dtype=np.float32)
    qVec = np.zeros((batch_size, qmax), dtype=np.float32)
    # A separate index name keeps the ``i`` parameter from being shadowed.
    for row in range(batch_size):
        cVec[row, 0:ctx[row].shape[1]] = ctx[row]
        qVec[row, 0:qn[row].shape[1]] = qn[row]

    return Variable(cVec), Variable(qVec)

In [22]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    The steps run in a fixed order: lower-case, strip every character in
    ``string.punctuation``, blank out the stand-alone words a/an/the, then
    collapse all whitespace runs to single spaces (also trimming both ends).
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punc = ''.join(ch for ch in lowered if ch not in punctuation)

    # Articles become a space so the neighbouring words do not fuse together.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)

    return ' '.join(without_articles.split())

In [None]:
# Write one predicted answer per test question to pred.csv.
startTime = time.time()
# The context manager closes (and flushes) the file even if a batch fails;
# the original only closed it when the whole loop finished.
with open('pred.csv', 'wb') as f:
    out = csv.writer(f)
    out.writerow(["Id", "Answer"])

    for idx, t in enumerate(test):
        contextTokens = word_tokenize(t['context'])
        last_token = len(contextTokens) - 1

        ctx, qn = get_test_batch(idx)
        model.reset_state()
        # Inference only: skip building the backprop graph.
        with chainer.no_backprop_mode():
            start, end = model(ctx, qn)

        for idx_qna, qna in enumerate(t['qna']):
            # int() turns the 0-d argmax arrays into plain Python indices.
            s = int(F.argmax(start[idx_qna]).data)
            e = int(F.argmax(end[idx_qna]).data)

            # Clamp into the real token range (scores also cover padding) and
            # force the end to be at or after the start.
            s = min(s, last_token)
            e = min(max(e, s), last_token)

            # normalize_answer collapses whitespace, so joining with single
            # spaces gives the same text as the old trailing-space concatenation.
            ans = " ".join(contextTokens[s:e + 1])

            out.writerow([qna['id'], normalize_answer(ans).encode('utf-8')])

        if idx % 100 == 0:
            print("%s / %s (%ss)" % (idx, len(test), time.time() - startTime))
            startTime = time.time()

0 / 7984 (1.47932910919s)
100 / 7984 (73.9849860668s)
200 / 7984 (49.1458120346s)
300 / 7984 (57.2862179279s)
400 / 7984 (50.1581408978s)
500 / 7984 (61.8074829578s)
