In [1]:
import args
import util
import layers
import models

In [2]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as sched
import torch.utils.data as data
import util

from args import get_train_args
from collections import OrderedDict
from json import dumps
from models import Seq2Seq
from tqdm import tqdm
from ujson import load as json_load
from util import collate_fn, SQuAD

In [3]:
# Resolve the directory for this run's artifacts from the base folder and run name.
name = "testrun"
save_dir = util.get_save_dir('./save/', name, training=True)
print(save_dir)

./save/train\testrun-15


In [4]:
# Set up logging: create the logger for this run from the save directory and run name.
# (Device selection happens in the next cell, not here.)
log = util.get_logger(save_dir, name)

In [5]:
# Ask the project helper which device to use and which GPU ids it found, then show both.
device, gpu_ids = util.get_available_devices()
print(device, gpu_ids, sep='\n')

cpu
[]


In [6]:
# Base batch size of 64, multiplied by the number of GPU ids (at least 1,
# so an empty gpu_ids list leaves it at 64).
batch_size = 64 * max(1, len(gpu_ids))
print(batch_size)

64


In [7]:
# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
# with the same value so runs are repeatable.
seed = 224
log.info(f'Using random seed {seed}...')
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

[03.01.20 14:10:01] Using random seed 224...


In [8]:
# Load the word embeddings from their JSON file via the project helper.
# NOTE(review): presumably returns a 2-D tensor (vocab, dim); confirm in util.torch_from_json.
word_emb_file = './data/word_emb.json'
log.info('Loading embeddings...')
word_vectors = util.torch_from_json(word_emb_file)

[03.01.20 14:10:01] Loading embeddings...


In [9]:
# Build the Seq2Seq model and wrap it in DataParallel over the detected GPU ids.
# The output size is taken from the first dimension of the embedding tensor.
log.info('Building model...')
hidden_size = 100
drop_prob = 0.2
model = nn.DataParallel(
    Seq2Seq(word_vectors=word_vectors,
            hidden_size=hidden_size,
            output_size=word_vectors.size(0),
            drop_prob=drop_prob),
    device_ids=gpu_ids,
)

[03.01.20 14:10:02] Building model...


In [10]:
# Training-state setup: step counter, model moved to the target device and put in
# training mode, and an exponential-moving-average tracker over the model.
step = 0
ema_decay = 0.999
model = model.to(device).train()  # both calls return the module itself
ema = util.EMA(model, ema_decay)

In [11]:
 # Get saver
# Checkpoint saver settings. The metric is perplexity ('PPL'), which should be
# minimized, hence maximize_metric=False.
max_checkpoints = 5
metric_name = 'PPL'
maximize_metric = False
saver = util.CheckpointSaver(save_dir,
                             # Pass the variable rather than a repeated literal so that
                             # editing max_checkpoints above actually takes effect.
                             max_checkpoints=max_checkpoints,
                             metric_name=metric_name,
                             maximize_metric=maximize_metric,
                             log=log)


[03.01.20 14:10:02] Saver will minimize PPL...


In [12]:
# Optimizer and learning-rate scheduler. The scheduler's multiplier is always 1,
# so the learning rate stays constant.
# NOTE(review): lr = 0.5 is far above Adam's documented default (1e-3); confirm this
# value is intended for Adam and was not carried over from a different optimizer.
lr = 0.5
l2_wd = 0
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2_wd)
scheduler = sched.LambdaLR(optimizer, lr_lambda=lambda s: 1.)

In [13]:
# Record files and loader settings
train_record_file = './data/sample/train.npz'
dev_record_file = './data/sample/dev.npz'
use_squad_v2 = False
num_workers = 4

# Build both datasets first, then wrap each in a DataLoader.
log.info('Building dataset...')
train_dataset = SQuAD(train_record_file, use_squad_v2)
dev_dataset = SQuAD(dev_record_file, use_squad_v2)

# Only the training loader shuffles; both share batch size, workers and collate_fn.
train_loader = data.DataLoader(dataset=train_dataset,
                               batch_size=batch_size,
                               shuffle=True,
                               num_workers=num_workers,
                               collate_fn=collate_fn)
dev_loader = data.DataLoader(dataset=dev_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers,
                             collate_fn=collate_fn)

[03.01.20 14:10:02] Building dataset...


In [14]:
# Fetch ONE training batch to walk through the forward pass and loss by hand.
# Previously this iterated the whole loader just to keep the final batch, and it
# overwrote the configured `batch_size` with that batch's size; both are avoided here.
cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = next(iter(train_loader))

# Setup for forward: only the word-index tensors are moved to the device,
# since they are the only inputs passed to the model below.
cw_idxs = cw_idxs.to(device)
qw_idxs = qw_idxs.to(device)

print(cw_idxs.shape)
print(cw_idxs)

print(qw_idxs.shape)
print(qw_idxs)

torch.Size([24, 361])
tensor([[ 1067,  5169,  1603,  ...,     0,     0,     0],
        [ 4189,    14,   133,  ...,     0,     0,     0],
        [   93,     6,  1961,  ...,    33,    94,     5],
        ...,
        [  762,  6029,  7159,  ...,     0,     0,     0],
        [21169,    33, 15254,  ...,     0,     0,     0],
        [ 7563,   333,    59,  ...,     0,     0,     0]])
torch.Size([24, 26])
tensor([[    2,   192,  2180,    34,   905,     8,  1457,     6,  5169,  1528,
            28,  1007,    41,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [    2,   192,    14,     6,   276,    54,  6502,   432,  5623,     9,
           133,   443,    41,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [    2,   977,    34,     6,  1716,     9,     6,  2756,     9,   911,
            19, 19268,  1518,   214,    11, 17631,    41,     3,     0,     0,
             0,  

In [15]:
# Forward pass: run the model on the context and question word indices.
# NOTE(review): log_p is presumably per-token log-probabilities over the vocabulary
# (it is fed to F.nll_loss later); confirm in models.Seq2Seq.
log_p = model(cw_idxs, qw_idxs)

c_mask shape:
torch.Size([24, 361])
q_mask shape:
torch.Size([24, 25])
o_t shape:
torch.Size([24, 1, 100])
decoder_outputs shape:
torch.Size([24, 25, 100])
logits shape:
torch.Size([24, 25, 23716])
q_mask shape:
torch.Size([24, 25, 23716])


In [16]:
# Collapse all leading dimensions into one, keeping the last dimension, so each row
# is one prediction for F.nll_loss. Using reshape(-1, last_dim) makes this cell safe
# to re-run: on an already-flattened 2-D tensor it is a no-op, whereas the previous
# view(size(0) * size(1), size(2)) raised on the second execution.
log_p = log_p.reshape(-1, log_p.size(-1))
print(log_p.shape)

torch.Size([600, 23716])


In [21]:
# Targets: drop the first token of every question and flatten to 1-D so the
# targets line up row-for-row with the flattened log_p.
qw_idxs_target = qw_idxs[:, 1:].reshape(-1)
print("qw_idxs_target shape:")
print(qw_idxs_target.shape)
# Summed negative log-likelihood; targets equal to 0 are excluded from the loss
# (presumably 0 is the padding index; confirm against the vocabulary).
loss = F.nll_loss(input=log_p,
                  target=qw_idxs_target,
                  ignore_index=0,
                  reduction='sum')

qw_idxs_target shape:
torch.Size([600])


In [22]:
# Show the summed NLL loss for this batch as a plain Python float.
print(loss.item())

3132.04931640625


In [25]:
# Bookkeeping for perplexity: total loss and the number of target tokens that
# contributed to it (entries different from 0, matching ignore_index=0 in the loss).
cum_loss = loss.item()
q_mask = qw_idxs_target.ne(0)
q_len = q_mask.sum(dim=-1)
tgt_word_num_to_predict = q_len.sum().item()
cum_tgt_words = tgt_word_num_to_predict

In [26]:
# Number of non-zero target tokens counted above.
print(cum_tgt_words)

311


In [27]:
# Accumulated (summed) loss; same value as loss.item() for this single batch.
print(cum_loss)

3132.04931640625


In [28]:
# Perplexity: exp of the average loss per counted target token.
# cum_tgt_words must be non-zero here, otherwise this divides by zero.
np.exp(cum_loss / cum_tgt_words)

23644.79117808982