In [1]:
import argparse
import copy, json, os

import torch
from torch import nn, optim
#from tensorboardX import SummaryWriter
from time import gmtime, strftime

from model.transformer import BiDAF
from model.data import SQuAD
from model.ema import EMA
import evaluate
from tqdm import tqdm_notebook

In [2]:
train_file = 'train-v1.1.json'
dev_file = 'dev-v1.1.json'
prediction_file = 'prediction.json'
path_to_data = './data'
MAX_VOCAB = 10000
NUM_SAMPLES_TRAIN = 30
NUM_SAMPLES_DEV = 10
TRAIN_BATCH_SIZE = 3
DEV_BATCH_SIZE = 10


data = SQuAD(path_to_data, train_file, dev_file, 
             MAX_VOCAB, NUM_SAMPLES_TRAIN, NUM_SAMPLES_DEV,
            TRAIN_BATCH_SIZE, DEV_BATCH_SIZE, glove_tokens = '6B')

preprocessing data files...
loading splits...
building vocab...
building iterators...


In [3]:
import math
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BiDAF(len(data.CHAR.vocab),len(data.WORD.vocab), data.WORD.vocab.vectors, char_channel_size = 100, word_dim = 100, hidden_size = 96, dropout_rate = 0.05).to(device)
parameters = filter(lambda p: p.requires_grad, model.parameters())

lr = 0.001
lr_warm_up_num = 1000

optimizer = optim.Adam(parameters, lr = lr)

cr = lr / math.log2(lr_warm_up_num)
scheduler = optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr)


criterion = nn.CrossEntropyLoss()
model.train()
loss, last_epoch = 0, -1
max_dev_exact, max_dev_f1 = -1, -1
NUM_EPOCHS = 12
validation_frequency = 500
iterator = data.train_iter

In [4]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
count_parameters(model)

1394644

for x in data.train_iter:
    break
model(x)

In [5]:
def test(model, prediction_file, data):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    loss = 0
    answers = dict()
    model.eval()
    print('Validating.....')
    with torch.set_grad_enabled(False):
        for batch in tqdm_notebook(iter(data.dev_iter), total = len(data.dev_iter)):
            p1, p2 = model(batch)
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()

            # (batch, c_len, c_len)
            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
            mask = (torch.ones(c_len, c_len) * float('-inf')).to(device).tril(-1).unsqueeze(0).expand(batch_size, -1, -1)
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()

            for i in range(batch_size):
                id = batch.id[i]
                answer = batch.c_word[0][i][s_idx[i]:e_idx[i]+1]
                answer = ' '.join([data.WORD.vocab.itos[idx] for idx in answer])
                answers[id] = answer

    with open(prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)

    results = evaluate.main(path_to_data + '/' + dev_file, prediction_file)
    return loss/len(data.dev_iter), results['exact_match'], results['f1']

In [6]:
accumulation_steps = 2

for epoch in range(NUM_EPOCHS):
    print('Epoch:', epoch + 1)
    optimizer.zero_grad()
    for i, batch in tqdm_notebook(enumerate(iterator), total = len(iterator)):

        p1, p2 = model(batch)

        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss = batch_loss / accumulation_steps
        batch_loss.backward()
        
        if (i+1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()



        if (i + 1) % validation_frequency == 0:
            loss /= validation_frequency
            dev_loss, dev_exact, dev_f1 = test(model, prediction_file, data)

            print('train loss:  {:.3f} | dev loss:  {:.3f} | dev EM: {:.3f} | dev F1: {:.3f}'.format(loss, dev_loss, dev_exact, dev_f1))
            loss = 0
            
            if dev_f1 > max_dev_f1:
                print('Saving model....')
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                if not os.path.exists('saved_models'):
                    os.makedirs('saved_models')
                torch.save(model.state_dict(), 'saved_models/BiDAF_best_model.pt')

                
            model.train()


Epoch: 1


HBox(children=(IntProgress(value=0, max=3339), HTML(value='')))

Validating.....


HBox(children=(IntProgress(value=0, max=911), HTML(value='')))


train loss:  13.539 | dev loss:  11.528 | dev EM: 0.000 | dev F1: 0.928
Saving model....
Validating.....


HBox(children=(IntProgress(value=0, max=911), HTML(value='')))


train loss:  13.196 | dev loss:  11.232 | dev EM: 0.038 | dev F1: 1.420
Saving model....
Validating.....


HBox(children=(IntProgress(value=0, max=911), HTML(value='')))


train loss:  13.031 | dev loss:  10.996 | dev EM: 0.038 | dev F1: 1.762
Saving model....


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 2.95 GiB total capacity; 377.75 MiB already allocated; 128.00 KiB free; 1.37 GiB cached)

In [None]:
batch

In [None]:
for i, batch in tqdm_notebook(enumerate(iterator), total = len(iterator)):
    break

In [None]:
batch.c_word[0]