In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gensim
import pickle
USE_CUDA = torch.cuda.is_available()

from data import load_squad_data,preprop,getBatch,pad_to_batch
from model import CoattentionEncoder, DynamicDecoder

In [2]:
MAX_LEN=600

In [3]:
dataset = load_squad_data('dataset/train-v1.1.json',MAX_LEN)
word2index,train_data = preprop(dataset)

Skipped 763, 86817 question/answer
Successfully Build 115266 vocabs
Preprop Complete!


In [4]:
pickle.dump(word2index,open('dataset/vocab.squad','wb'))
pickle.dump(train_data,open('dataset/train.squad','wb'))

In [4]:
%%time
#python3 -m gensim.scripts.glove2word2vec --input  glove.840B.300d.txt --output glove.840B.300d.w2vformat.txt
model = gensim.models.KeyedVectors.load_word2vec_format('dataset/glove.840B.300d.w2vformat.txt')

CPU times: user 5min 12s, sys: 3 s, total: 5min 15s
Wall time: 5min 14s


In [5]:
pretrained = []
oov_idx=[]

for i,key in enumerate(word2index.keys()):
    try:
        pretrained.append(model[key])
    except:
        pretrained.append(np.zeros(300))
        oov_idx.append(i)
        
pretrained_vectors = np.vstack(pretrained)
print(len(oov_idx),"/",len(word2index))

22637 / 115266


In [4]:
RESTORE=True
EMBED_SIZE=300
HIDDEN_SIZE=200
MAXOUT_POOL=4
MAX_ITER=4
BATCH_SIZE=64
STEP=50
LR=0.001
encoder = CoattentionEncoder(len(word2index),EMBED_SIZE,HIDDEN_SIZE)
decoder = DynamicDecoder(HIDDEN_SIZE,MAXOUT_POOL,max_iter=MAX_ITER)
if RESTORE is False:
    encoder.init_embed(pretrained_vectors,is_static=True)

if RESTORE:
    encoder.load_state_dict(torch.load('models/enc_params_01_29.pkl'))
    decoder.load_state_dict(torch.load('models/dec_params_01_29.pkl'))

if USE_CUDA:
    encoder.use_cuda=True
    decoder.use_cuda=True
    encoder = encoder.cuda()
    decoder = decoder.cuda()
loss_function = nn.CrossEntropyLoss()
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [5]:
LR=0.0001
STEP=50
enc_optim = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR*5)

In [6]:
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        documents,questions,doc_lens,question_lens,starts,ends = pad_to_batch(batch,word2index)

        encoder.zero_grad()
        decoder.zero_grad()
        U = encoder(documents,questions,doc_lens,question_lens,True)
        _,_,entropies = decoder(U,True)

        s_ents, e_ents = list(zip(*entropies)) # x MAX_ITER
        loss_start,loss_end=0,0
        for m in range(len(entropies)):
            loss_start+=loss_function(s_ents[m],starts.view(-1))
            loss_end+=loss_function(s_ents[m],ends.view(-1))

        loss = loss_start+loss_end
        losses.append(loss.data[0])
        loss.backward()
        #torch.nn.utils.clip_grad_norm(encoder.parameters(), 50) # gradient clipping
        #torch.nn.utils.clip_grad_norm(decoder.parameters(), 50) 
        enc_optim.step()
        dec_optim.step()
        
        if i % 100 == 0:
            print("[%d/%d] [%d/%d] loss : %.3f" % (step,STEP,i,len(train_data)//BATCH_SIZE,np.mean(losses)))
            losses=[]

[0/50] [0/1356] loss : 9.992
[0/50] [100/1356] loss : 9.593
[0/50] [200/1356] loss : 9.511
[0/50] [300/1356] loss : 9.391
[0/50] [400/1356] loss : 9.471
[0/50] [500/1356] loss : 9.653
[0/50] [600/1356] loss : 9.550
[0/50] [700/1356] loss : 9.160
[0/50] [800/1356] loss : 9.557
[0/50] [900/1356] loss : 9.449
[0/50] [1000/1356] loss : 9.430
[0/50] [1100/1356] loss : 9.514
[0/50] [1200/1356] loss : 9.663
[0/50] [1300/1356] loss : 9.519
[1/50] [0/1356] loss : 7.834
[1/50] [100/1356] loss : 9.376
[1/50] [200/1356] loss : 9.232
[1/50] [300/1356] loss : 9.363
[1/50] [400/1356] loss : 9.301
[1/50] [500/1356] loss : 9.229
[1/50] [600/1356] loss : 9.068
[1/50] [700/1356] loss : 9.246
[1/50] [800/1356] loss : 9.282
[1/50] [900/1356] loss : 9.295
[1/50] [1000/1356] loss : 9.308
[1/50] [1100/1356] loss : 9.236
[1/50] [1200/1356] loss : 9.249
[1/50] [1300/1356] loss : 9.335
[2/50] [0/1356] loss : 8.410
[2/50] [100/1356] loss : 9.053
[2/50] [200/1356] loss : 8.985
[2/50] [300/1356] loss : 8.993
[2/50]

KeyboardInterrupt: 

### Test 

In [7]:
import random
index2word={v:k for k,v in word2index.items()}

In [9]:
encoder = encoder.cuda()
decoder = decoder.cuda()

In [10]:
test_data = load_squad_data('./dataset/dev-v1.1.json')
word2index, test_data = preprop(test_data,word2index)

Skipped 177, 10384 question/answer
Successfully Build 114853 vocabs
Preprop Complete!


In [25]:
test = random.choice(train_data)

U = encoder(test[0],test[1],torch.LongTensor([test[0].size(1)]).cuda(),torch.LongTensor([test[1].size(1)]).cuda())
s,e,entropies = decoder(U)

#         s_ents, e_ents = list(zip(*entropies)) 
test_paragraph=[index2word[p] for p in test[0].data.cpu().tolist()[0]]
print(" ".join(test_paragraph))
print(" ")
print(" ".join([index2word[p] for p in test[1].data.cpu().tolist()[0]]))
print(" ")
if s.data[0]<=e.data[0]:
    print("Prediction : "," ".join(test_paragraph[s.data[0]:e.data[0]+1]))
else:
    print("Prediction : "," ".join(test_paragraph[e.data[0]:s.data[0]+1]))
print("Groud Truth : "," ".join(test_paragraph[test[2].data.tolist()[0][0]:test[3].data.tolist()[0][0]+1]))

The success of American Idol has been described as " unparalleled in broadcasting history " . The series was also said by a rival TV executive to be " the most impactful show in the history of television " . It has become a recognized springboard for launching the career of many artists as bona fide stars . According to Billboard magazine , in its first ten years , " Idol has spawned 345 Billboard chart-toppers and a platoon of pop idols , including Kelly Clarkson , Carrie Underwood , Chris Daughtry , Fantasia , Ruben Studdard , Jennifer Hudson , Clay Aiken , Adam Lambert and Jordin Sparks while remaining a TV ratings juggernaut . "
 
Who called American Idol " the most impactful show in the history of television " ?
 
Prediction :  a rival TV executive
Groud Truth :  a rival TV executive


In [None]:
overlap=0
predicted=0
truth=0

for test in test_data:
    U = encoder(test[0],test[1])
    s,e,entropies = decoder(U)
    
    pred_span = list(range(s.data[0],e.data[0]+1))
    truth_span = list(range(test[2].squeeze(0).data[0],test[3].squeeze(0).data[0]+1))
    overlap+=len(set(truth_span) & set(pred_span))
    predicted+=len(pred_span)
    truth+=len(truth_span)
#     break
precision = overlap/predicted
recall = overlap/truth

f1_score = 2*precision*recall/(precision+recall)
print(f1_score)

In [55]:
import datetime
cdate = datetime.datetime.strftime(datetime.datetime.now(),"%m_%d")

In [56]:
if USE_CUDA:
    encoder = encoder.cpu()
    decoder = decoder.cpu()
torch.save(encoder.state_dict(), 'models/enc_params_'+cdate+'.pkl')
torch.save(decoder.state_dict(), 'models/dec_params_'+cdate+'.pkl')