In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gensim
import pickle
USE_CUDA = torch.cuda.is_available()

from data import load_squad_data,preprop,getBatch,pad_to_batch
from model import CoattentionEncoder, DynamicDecoder

In [2]:
MAX_LEN=400

In [3]:
dataset = load_squad_data('dataset/train-v1.1.json',MAX_LEN)
word2index,train_data = preprop(dataset)

Skipped 761, 86655 question/answer
Successfully Build 114855 vocabs
Preprop Complete!


In [4]:
pickle.dump(word2index,open('dataset/vocab.squad','wb'))
pickle.dump(train_data,open('dataset/train.squad','wb'))

In [5]:
%%time
#python3 -m gensim.scripts.glove2word2vec --input  glove.840B.300d.txt --output glove.840B.300d.w2vformat.txt
model = gensim.models.KeyedVectors.load_word2vec_format('dataset/glove.840B.300d.w2vformat.txt')

CPU times: user 5min 12s, sys: 2.31 s, total: 5min 14s
Wall time: 5min 13s


In [5]:
# oov=[]
# for k in word2index.keys():
#     if k not in ['<pad>','<unk>','<s>','</s>'] and model.vocab.get(k) is None:
#         oov.append(k)
# for o in oov:
#     word2index.pop(o)
# print(len(oov),len(word2index))

22527 92328


In [6]:
pretrained = []

for i in range(len(word2index)):
    try:
        pretrained.append(model[word2index[i]])
    except:
        pretrained.append(np.zeros(300))
        
pretrained_vectors = np.vstack(pretrained)

In [5]:
# del oov
# del pretrained
# del model

# word2index,train_data = preprop(dataset,word2index)

In [9]:
RESTORE=False
EMBED_SIZE=300
HIDDEN_SIZE=200
MAXOUT_POOL=4
MAX_ITER=4
BATCH_SIZE=64
STEP=50
LR=0.001
encoder = CoattentionEncoder(len(word2index),EMBED_SIZE,HIDDEN_SIZE)
decoder = DynamicDecoder(HIDDEN_SIZE,MAXOUT_POOL,max_iter=MAX_ITER)
encoder.init_embed(pretrained_vectors,is_static=False)

if RESTORE:
    encoder.load_state_dict(torch.load('models/enc_params.pkl'))
    decoder.load_state_dict(torch.load('models/dec_params.pkl'))

if USE_CUDA:
    encoder.use_cuda=True
    decoder.use_cuda=True
    encoder = encoder.cuda()
    decoder = decoder.cuda()
loss_function = nn.CrossEntropyLoss()
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [9]:
LR=0.0001
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [10]:
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        documents,questions,starts,ends = pad_to_batch(batch,word2index)

        encoder.zero_grad()
        decoder.zero_grad()
        U = encoder(documents,questions,True)
        _,_,entropies = decoder(U,True)

        s_ents, e_ents = list(zip(*entropies)) # x MAX_ITER
        loss_start,loss_end=0,0
        for m in range(MAX_ITER+1):
            loss_start+=loss_function(s_ents[m],starts.view(-1))
            loss_end+=loss_function(s_ents[m],ends.view(-1))

        loss = loss_start+loss_end
        losses.append(loss.data[0])
        loss.backward()
        #torch.nn.utils.clip_grad_norm(encoder.parameters(), 50) # gradient clipping
        #torch.nn.utils.clip_grad_norm(decoder.parameters(), 50) 
        enc_optim.step()
        dec_optim.step()
        
        if i % 100 == 0:
            print("[%d/%d] [%d/%d] loss : %.3f" % (step,STEP,i,len(train_data)//BATCH_SIZE,np.mean(losses)))
            losses=[]

[0/50] [0/1353] loss : 57.169
[0/50] [100/1353] loss : 49.803
[0/50] [200/1353] loss : 44.247
[0/50] [300/1353] loss : 42.723
[0/50] [400/1353] loss : 42.139
[0/50] [500/1353] loss : 41.574
[0/50] [600/1353] loss : 41.339
[0/50] [700/1353] loss : 40.882
[0/50] [800/1353] loss : 40.487
[0/50] [900/1353] loss : 39.597
[0/50] [1000/1353] loss : 39.417
[0/50] [1100/1353] loss : 38.186
[0/50] [1200/1353] loss : 37.954
[0/50] [1300/1353] loss : 37.443
[1/50] [0/1353] loss : 37.994
[1/50] [100/1353] loss : 34.126
[1/50] [200/1353] loss : 33.856
[1/50] [300/1353] loss : 33.781
[1/50] [400/1353] loss : 33.393
[1/50] [500/1353] loss : 33.480
[1/50] [600/1353] loss : 33.275
[1/50] [700/1353] loss : 33.195
[1/50] [800/1353] loss : 33.079
[1/50] [900/1353] loss : 32.664
[1/50] [1000/1353] loss : 31.922
[1/50] [1100/1353] loss : 31.370
[1/50] [1200/1353] loss : 31.049
[1/50] [1300/1353] loss : 30.029
[2/50] [0/1353] loss : 27.418
[2/50] [100/1353] loss : 23.787
[2/50] [200/1353] loss : 23.214
[2/50]

KeyboardInterrupt: 

In [11]:
import random

In [26]:
encoder = encoder.cuda()
decoder = decoder.cuda()

In [39]:
i = random.choice(train_data)

U = encoder(i[0],i[1])
s,e,entropies = decoder(U)

#         s_ents, e_ents = list(zip(*entropies)) 
print(s,e)
print(i[2],i[3])

Variable containing:
 5
[torch.cuda.LongTensor of size 1 (GPU 0)]
 Variable containing:
 6
[torch.cuda.LongTensor of size 1 (GPU 0)]

Variable containing:
 5
[torch.cuda.LongTensor of size 1x1 (GPU 0)]
 Variable containing:
 6
[torch.cuda.LongTensor of size 1x1 (GPU 0)]



In [25]:
if USE_CUDA:
    encoder = encoder.cpu()
    decoder = decoder.cpu()
torch.save(encoder.state_dict(), 'models/enc_params.pkl')
torch.save(decoder.state_dict(), 'models/dec_params.pkl')