In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gensim
import pickle
USE_CUDA = torch.cuda.is_available()

from data import load_squad_data,preprop,getBatch,pad_to_batch
from model import CoattentionEncoder, DynamicDecoder

In [2]:
MAX_LEN=400

In [3]:
dataset = load_squad_data('dataset/train-v1.1.json',MAX_LEN)
word2index,train_data = preprop(dataset)

Skipped 761, 86655 question/answer
Successfully Build 114855 vocabs
Preprop Complete!


In [4]:
pickle.dump(word2index,open('vocab.squad','wb'))
pickle.dump(train_data,open('train.squad','wb'))

In [5]:
%%time
#python3 -m gensim.scripts.glove2word2vec --input  glove.840B.300d.txt --output glove.840B.300d.w2vformat.txt
model = gensim.models.KeyedVectors.load_word2vec_format('dataset/glove.840B.300d.w2vformat.txt')

CPU times: user 5min 16s, sys: 2.34 s, total: 5min 18s
Wall time: 5min 29s


In [5]:
# oov=[]
# for k in word2index.keys():
#     if k not in ['<pad>','<unk>','<s>','</s>'] and model.vocab.get(k) is None:
#         oov.append(k)
# for o in oov:
#     word2index.pop(o)
# print(len(oov),len(word2index))

22527 92328


In [6]:
pretrained = []

for i in range(len(word2index)):
    try:
        pretrained.append(model[word2index[i]])
    except:
        pretrained.append(np.random.randn(300))
        
pretrained_vectors = np.vstack(pretrained)

In [7]:
# del oov
# del pretrained
# del model

# word2index,train_data = preprop(dataset,word2index)

In [8]:
RESTORE=True
EMBED_SIZE=300
HIDDEN_SIZE=200
MAXOUT_POOL=4
MAX_ITER=4
BATCH_SIZE=32
STEP=50
LR=0.0001
encoder = CoattentionEncoder(len(word2index),EMBED_SIZE,HIDDEN_SIZE)
decoder = DynamicDecoder(HIDDEN_SIZE,MAXOUT_POOL,max_iter=MAX_ITER)
encoder.init_embed(pretrained_vectors)

if RESTORE:
    encoder.load_state_dict(torch.load('models/enc_params.pkl'))
    decoder.load_state_dict(torch.load('models/dec_params.pkl'))

if USE_CUDA:
    encoder.use_cuda=True
    decoder.use_cuda=True
    encoder = encoder.cuda()
    decoder = decoder.cuda()
loss_function = nn.CrossEntropyLoss()
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [14]:
LR=0.0001
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [15]:
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        documents,questions,starts,ends = pad_to_batch(batch,word2index)

        encoder.zero_grad()
        decoder.zero_grad()
        U = encoder(documents,questions,True)
        _,_,entropies = decoder(U,True)

        s_ents, e_ents = list(zip(*entropies)) # x MAX_ITER
        loss_start,loss_end=0,0
        for m in range(MAX_ITER):
            loss_start+=loss_function(s_ents[m],starts.view(-1))
            loss_end+=loss_function(s_ents[m],ends.view(-1))

        loss = loss_start+loss_end
        losses.append(loss.data[0])
        loss.backward()
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 10) # gradient clipping
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 10) 
        enc_optim.step()
        dec_optim.step()
        
        if i % 100 == 0:
            print("[%d/%d] [%d/%d] loss : %.3f" % (step,STEP,i,len(train_data)//BATCH_SIZE,np.mean(losses)))
            losses=[]

[0/50] [0/2707] loss : 6.112
[0/50] [100/2707] loss : 5.941
[0/50] [200/2707] loss : 5.953
[0/50] [300/2707] loss : 6.015
[0/50] [400/2707] loss : 5.837
[0/50] [500/2707] loss : 5.988
[0/50] [600/2707] loss : 5.972
[0/50] [700/2707] loss : 5.855
[0/50] [800/2707] loss : 6.005
[0/50] [900/2707] loss : 5.942
[0/50] [1000/2707] loss : 5.947
[0/50] [1100/2707] loss : 5.976
[0/50] [1200/2707] loss : 5.837
[0/50] [1300/2707] loss : 5.666
[0/50] [1400/2707] loss : 5.825
[0/50] [1500/2707] loss : 5.771
[0/50] [1600/2707] loss : 5.832
[0/50] [1700/2707] loss : 5.621
[0/50] [1800/2707] loss : 5.848
[0/50] [1900/2707] loss : 5.834
[0/50] [2000/2707] loss : 5.761
[0/50] [2100/2707] loss : 5.813
[0/50] [2200/2707] loss : 5.728
[0/50] [2300/2707] loss : 5.652
[0/50] [2400/2707] loss : 5.823
[0/50] [2500/2707] loss : 5.764
[0/50] [2600/2707] loss : 5.785
[0/50] [2700/2707] loss : 5.880
[1/50] [0/2707] loss : 6.744
[1/50] [100/2707] loss : 5.777
[1/50] [200/2707] loss : 5.873
[1/50] [300/2707] loss : 

KeyboardInterrupt: 

In [18]:
import random

In [20]:
encoder = encoder.cuda()
decoder = decoder.cuda()

In [30]:
i = random.choice(train_data)

U = encoder(i[0],i[1])
s,e,entropies = decoder(U)

#         s_ents, e_ents = list(zip(*entropies)) 
print(s,e)
print(i[2],i[3])

Variable containing:
 50
[torch.cuda.LongTensor of size 1 (GPU 0)]
 Variable containing:
 73
[torch.cuda.LongTensor of size 1 (GPU 0)]

Variable containing:
 50
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
 Variable containing:
 51
[torch.cuda.LongTensor of size 1x1 (GPU 0)]



In [None]:
if USE_CUDA:
    encoder = encoder.cpu()
    decoder = decoder.cpu()
torch.save(encoder.state_dict(), 'models/enc_params.pkl')
torch.save(decoder.state_dict(), 'models/dec_params.pkl')