In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gensim
import pickle
USE_CUDA = torch.cuda.is_available()

from data import load_squad_data,preprop,getBatch,pad_to_batch
from model import CoattentionEncoder, DynamicDecoder

In [2]:
MAX_LEN=400

In [3]:
dataset = load_squad_data('dataset/train-v1.1.json',MAX_LEN)
word2index,train_data = preprop(dataset)

Skipped 761, 86655 question/answer
Successfully Build 114853 vocabs
Preprop Complete!


In [4]:
pickle.dump(word2index,open('dataset/vocab.squad','wb'))
pickle.dump(train_data,open('dataset/train.squad','wb'))

In [4]:
%%time
#python3 -m gensim.scripts.glove2word2vec --input  glove.840B.300d.txt --output glove.840B.300d.w2vformat.txt
model = gensim.models.KeyedVectors.load_word2vec_format('dataset/glove.840B.300d.w2vformat.txt')

CPU times: user 5min 40s, sys: 7.74 s, total: 5min 47s
Wall time: 9min 35s


In [5]:
pretrained = []
oov_idx=[]

for i,key in enumerate(word2index.keys()):
    try:
        pretrained.append(model[key])
    except:
        pretrained.append(np.zeros(300))
        oov_idx.append(i)
        
pretrained_vectors = np.vstack(pretrained)
print(len(oov_idx),"/",len(word2index))

22529 / 114853


In [4]:
RESTORE=True
EMBED_SIZE=300
HIDDEN_SIZE=200
MAXOUT_POOL=4
MAX_ITER=4
BATCH_SIZE=32
STEP=50
LR=0.001
encoder = CoattentionEncoder(len(word2index),EMBED_SIZE,HIDDEN_SIZE)
decoder = DynamicDecoder(HIDDEN_SIZE,MAXOUT_POOL,max_iter=MAX_ITER)
if RESTORE is False:
    encoder.init_embed(pretrained_vectors,is_static=False)

if RESTORE:
    encoder.load_state_dict(torch.load('models/enc_params_01_24.pkl'))
    decoder.load_state_dict(torch.load('models/dec_params_01_24.pkl'))

if USE_CUDA:
    encoder.use_cuda=True
    decoder.use_cuda=True
    encoder = encoder.cuda()
    decoder = decoder.cuda()
loss_function = nn.CrossEntropyLoss()
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [5]:
LR=0.0001
STEP=10
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR*5)

In [7]:
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        documents,questions,doc_lens,question_lens,starts,ends = pad_to_batch(batch,word2index)

        encoder.zero_grad()
        decoder.zero_grad()
        U = encoder(documents,questions,doc_lens,question_lens,True)
        _,_,entropies = decoder(U,True)

        s_ents, e_ents = list(zip(*entropies)) # x MAX_ITER
        loss_start,loss_end=0,0
        for m in range(len(entropies)):
            loss_start+=loss_function(s_ents[m],starts.view(-1))
            loss_end+=loss_function(s_ents[m],ends.view(-1))

        loss = loss_start+loss_end
        losses.append(loss.data[0])
        loss.backward()
        #torch.nn.utils.clip_grad_norm(encoder.parameters(), 50) # gradient clipping
        #torch.nn.utils.clip_grad_norm(decoder.parameters(), 50) 
        enc_optim.step()
        dec_optim.step()
        
        if i % 100 == 0:
            print("[%d/%d] [%d/%d] loss : %.3f" % (step,STEP,i,len(train_data)//BATCH_SIZE,np.mean(losses)))
            losses=[]

[0/10] [0/2707] loss : 4.710
[0/10] [100/2707] loss : 4.169
[0/10] [200/2707] loss : 4.159
[0/10] [300/2707] loss : 4.214
[0/10] [400/2707] loss : 4.162
[0/10] [500/2707] loss : 4.126
[0/10] [600/2707] loss : 4.079
[0/10] [700/2707] loss : 4.134
[0/10] [800/2707] loss : 4.136
[0/10] [900/2707] loss : 4.089
[0/10] [1000/2707] loss : 4.127
[0/10] [1100/2707] loss : 4.194
[0/10] [1200/2707] loss : 4.063
[0/10] [1300/2707] loss : 4.131
[0/10] [1400/2707] loss : 4.211
[0/10] [1500/2707] loss : 4.145
[0/10] [1600/2707] loss : 4.173
[0/10] [1700/2707] loss : 4.165
[0/10] [1800/2707] loss : 4.186
[0/10] [1900/2707] loss : 4.070
[0/10] [2000/2707] loss : 4.235
[0/10] [2100/2707] loss : 4.133
[0/10] [2200/2707] loss : 4.208
[0/10] [2300/2707] loss : 4.142
[0/10] [2400/2707] loss : 4.115
[0/10] [2500/2707] loss : 4.163
[0/10] [2600/2707] loss : 4.168
[0/10] [2700/2707] loss : 4.119
[1/10] [0/2707] loss : 4.484
[1/10] [100/2707] loss : 4.087
[1/10] [200/2707] loss : 4.069
[1/10] [300/2707] loss : 

KeyboardInterrupt: 

### Test 

In [8]:
import random
index2word={v:k for k,v in word2index.items()}

In [9]:
encoder = encoder.cuda()
decoder = decoder.cuda()

In [10]:
test_data = load_squad_data('./dataset/dev-v1.1.json')
word2index, test_data = preprop(test_data,word2index)

Skipped 177, 10384 question/answer
Successfully Build 114853 vocabs
Preprop Complete!


In [102]:
test = random.choice(train_data)

U = encoder(test[0],test[1],torch.LongTensor([test[0].size(1)]).cuda(),torch.LongTensor([test[1].size(1)]).cuda())
s,e,entropies = decoder(U)

#         s_ents, e_ents = list(zip(*entropies)) 
test_paragraph=[index2word[p] for p in test[0].data.cpu().tolist()[0]]
print(" ".join(test_paragraph))
print(" ")
print(" ".join([index2word[p] for p in test[1].data.cpu().tolist()[0]]))
print(" ")
if s.data[0]<=e.data[0]:
    print("Prediction : "," ".join(test_paragraph[s.data[0]:e.data[0]+1]))
else:
    print("Prediction : "," ".join(test_paragraph[e.data[0]:s.data[0]+1]))
print("Groud Truth : "," ".join(test_paragraph[test[2].data.tolist()[0][0]:test[3].data.tolist()[0][0]+1]))

In June 1989 , Nintendo of America 's vice president of marketing Peter Main , said that the Famicom was present in 37 % of Japan 's households . By 1990 , 30 % of American households owned the NES , compared to 23 % for all personal computers . By 1990 , the NES had outsold all previously released consoles worldwide . [ better source needed ] The slogan for this brand was It ca n't be beaten . In Europe and South America , however , the NES was outsold by Sega 's Master System , while the Nintendo Entertainment System was not available in the Soviet Union .
 
Where was Nintendo not available ?
 
Prediction :  Soviet Union
Groud Truth :  Soviet Union


In [None]:
overlap=0
predicted=0
truth=0

for test in test_data:
    U = encoder(test[0],test[1])
    s,e,entropies = decoder(U)
    
    pred_span = list(range(s.data[0],e.data[0]+1))
    truth_span = list(range(test[2].squeeze(0).data[0],test[3].squeeze(0).data[0]+1))
    overlap+=len(set(truth_span) & set(pred_span))
    predicted+=len(pred_span)
    truth+=len(truth_span)
#     break
precision = overlap/predicted
recall = overlap/truth

f1_score = 2*precision*recall/(precision+recall)
print(f1_score)

In [103]:
import datetime
cdate = datetime.datetime.strftime(datetime.datetime.now(),"%m_%d")

In [104]:
if USE_CUDA:
    encoder = encoder.cpu()
    decoder = decoder.cpu()
torch.save(encoder.state_dict(), 'models/enc_params_'+cdate+'.pkl')
torch.save(decoder.state_dict(), 'models/dec_params_'+cdate+'.pkl')