# Imports and Settings

In [197]:
%matplotlib inline

import csv, json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk import word_tokenize

from chainer import Chain, Variable, Parameter
import chainer.optimizers
import chainer.initializers as I
import chainer.functions as F
import chainer.links as L

WORD_VECTOR_SIZE = 50
H_SIZE = 35

# Read Word Vectors

In [392]:
glove = {}
f = open('glove/glove.6B.' + str(WORD_VECTOR_SIZE) + 'd.txt', 'rb')
reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
for row in reader:
    key = row[0]
    vector = map(float, row[1:])
    glove[key] = np.array(vector, dtype=np.float32).reshape(1,-1)
len(glove)

400000

# Read Dataset

In [179]:
def text2vec(text):
    tokens = word_tokenize(text)
    textVec = []
    for tok in tokens:
        textVec.append(glove.get(tok, np.zeros((1,WORD_VECTOR_SIZE), dtype=np.float32)))
    return textVec

def answerpos(context, answer, answer_start):
    start = len(word_tokenize(context[:answer_start]))
    ans_len = len(word_tokenize(answer))
    
    return start, start + ans_len - 1

In [185]:
train = []
for jsonRow in json.loads(open('dataset/train.json', 'rb').read()):
    for paragraph in jsonRow['paragraphs']: 
        data = {}
        data['context'] = paragraph['context']
        data['contextVec'] = text2vec(paragraph['context'])
        
        data['qna'] = []
        for qnaJson in paragraph['qas']:
            qna = {}
            qna['question'] = qnaJson['question']
            qna['questionVec'] = text2vec(qnaJson['question'].lower())
            
            qna['answer'] = qnaJson['answer']['text']
            ans_start, ans_end = answerpos(paragraph['context'], 
                                           qnaJson['answer']['text'], 
                                           qnaJson['answer']['answer_start'])
            qna['answer_start'] = ans_start
            qna['answer_end'] = ans_end
            
            data['qna'].append(qna)
        train.append(data)
print len(train)

idx = 4
print train[0]['context']
print train[0]['qna'][idx]['answer']
print train[0]['qna'][idx]['answer_start'], word_tokenize(train[0]['context'])[train[0]['qna'][idx]['answer_start']]
print train[0]['qna'][idx]['answer_end'], word_tokenize(train[0]['context'])[train[0]['qna'][idx]['answer_end']]

12979
In 1790, the first federal population census was taken in the United States. Enumerators were instructed to classify free residents as white or "other." Only the heads of households were identified by name in the federal census until 1850. Native Americans were included among "Other;" in later censuses, they were included as "Free people of color" if they were not living on Indian reservations. Slaves were counted separately from free persons in all the censuses until the Civil War and end of slavery. In later censuses, people of African descent were classified by appearance as mulatto (which recognized visible European ancestry in addition to African) or black.
In 1790, the first federal population census was taken in the United States
0 In
13 States


In [6]:
test = []
for jsonRow in json.loads(open('dataset/test.json', 'rb').read()):
    for paragraph in jsonRow['paragraphs']: 
        data = {}
        data['context'] = paragraph['context']
        data['contextVec'] = text2vec(paragraph['context'])
        data['qna'] = []
        for qnaJson in paragraph['qas']:
            qna = {}
            qna['question'] = qnaJson['question']
            qna['questionVec'] = text2vec(qnaJson['question'].lower())
            data['qna'].append(qna)
        test.append(data)

print len(test)                  

7984

# Define Network
* RNN Tutorial: http://docs.chainer.org/en/stable/tutorial/recurrentnet.html
* Training Tutorial: http://docs.chainer.org/en/stable/tutorial/train_loop.html
* Attention: https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/
* Pointer: http://fastml.com/introduction-to-pointer-networks/

In [379]:
class CoattentionEncoder(Chain):
    def __init__(self, wordvec_size, h_size):
        super(CoattentionEncoder, self).__init__()
        
        self.h_size = h_size
        
        with self.init_scope():
            self.ctxRU = L.LSTM(wordvec_size, h_size)

            self.qnRU = L.LSTM(wordvec_size, h_size)
            self.qnLinear = L.Linear(h_size, h_size)
            
            self.outFwd = L.LSTM(3*h_size, h_size)
            self.outBwd = L.LSTM(3*h_size, h_size)
            self.outLinear = L.Linear(2*h_size, h_size)            
            
    def __call__(self, context, question):
        # context representation
        D = []
        for word in context:
            D.append(self.ctxRU(word))
        D = F.vstack(D).T
        
        #question representation
        Q = []
        for word in question:
            Q.append(self.qnRU(word))
        Q = self.qnLinear(F.vstack(Q)).T
        
        #attention
        affinity = F.matmul(D.T, Q)
        A_Q = F.softmax(affinity)
        A_D = F.softmax(affinity.T)
        
        C_Q = F.matmul(D, A_Q)
        C_D = F.matmul(F.concat((Q, C_Q), axis=0), A_D)
        
        #output
        out_in = F.concat((D, C_D), axis=0).T
        
        h_fwd = []
        for fout in out_in:
            h_fwd.append(self.outFwd(fout.reshape(1,-1)))
        h_fwd = F.vstack(h_fwd)
        
        h_bwd = []
        for bout in out_in[::-1]:
            h_bwd.append(self.outBwd(bout.reshape(1,-1)))
        h_bwd = F.vstack(h_bwd)
    
        U = self.outLinear(F.concat((h_fwd, h_bwd)))
        
        return U

In [393]:
ctx = train[0]['contextVec']
qn = train[0]['qna'][0]['questionVec']
encoder = CoattentionEncoder(WORD_VECTOR_SIZE, H_SIZE)

U = encoder(ctx, qn)
print U.shape
print U

(125, 35)
variable([[-0.03590989  0.03966875 -0.00195666 ..., -0.03458286  0.03548717
            0.04075096]
          [-0.04598446  0.09996913  0.00479678 ..., -0.05634297  0.05540708
            0.06247892]
          [-0.03036514  0.10478616  0.00552048 ..., -0.02236877  0.07590895
            0.09246501]
          ..., 
          [-0.11714082  0.11934105 -0.08918121 ...,  0.05227913 -0.00929801
            0.06507393]
          [-0.11857323  0.14307797 -0.127801   ...,  0.04839102 -0.00928849
            0.06607775]
          [-0.11303589  0.15104374 -0.15047826 ...,  0.05811252 -0.01807822
            0.0032901 ]])


In [357]:
class Highway(Chain):
    def __init__(self, h_size):
        super(Highway, self).__init__()
                
        with self.init_scope():
            self.MLP = L.Linear(3*h_size, h_size, nobias=True)
            self.M1 = L.Linear(2*h_size, h_size)
            self.M2 = L.Linear(h_size, h_size)
            self.M3 = L.Linear(2*h_size, 1)
            
    def __call__(self, U, h, us, ue):
        r = F.tanh(self.MLP(F.hstack([h, us, ue])))
        m1 = self.M1(F.concat((U, F.broadcast_to(r, U.shape))))
        m2 = self.M2(m1)
        m3 = self.M3(F.concat((m1,m2)))
        
        return m3

In [383]:
highway = Highway(H_SIZE)

h = np.zeros((1,H_SIZE), dtype=np.float32)
us = U[0].reshape(1,-1)
ue = U[-1].reshape(1,-1)

alpha = highway(U, h, us, ue)
print alpha.shape

(125, 1)


In [390]:
class DynamicPointingDecoder(Chain):
    def __init__(self, h_size):
        super(DynamicPointingDecoder, self).__init__()
                
        with self.init_scope():
            self.LSTM_dec = L.LSTM(2*h_size, h_size)
            self.HwayStart = Highway(h_size)
            self.HwayEnd = Highway(h_size)
            
    def __call__(self, U, us, ue):
        h = self.LSTM_dec(F.concat((us,ue)))
        alpha = highway(U, h, us, ue)
        s = F.argmax(alpha).data
        beta = highway(U, h, U[s].reshape(1,-1), ue)
        
        return alpha, beta

In [394]:
decoder = DynamicPointingDecoder(H_SIZE)

us = U[0].reshape(1,-1)
ue = U[-1].reshape(1,-1)
alpha, beta = decoder(U, us, ue)
print alpha.shape, F.argmax(alpha)
print beta.shape, F.argmax(beta)

(125, 1) variable(118)
(125, 1) variable(118)
