In [1]:
import torch
import torch.nn as nn
import math
import numpy as np
import copy
from torch.autograd import Variable
import torch.nn.functional as F
import time
from attention import clones, future_mask, MultiHeadedAttention, FeedForwardNetwork, Embeddings, PositionalEncoding, Generator, LayerNorm, SublayerConnection, Encoder, Decoder, EncoderDecoder, make_model, Batch

In [2]:
vocab_size = 10
batch_size = 16
nbatches = 20
d_model = 512

In [3]:
class zrange:
    def __init__(self,batch_size, n_batches, vocab_size, seq_len):
        self.data = torch.from_numpy(np.random.randint(1, vocab_size, size=(batch_size*nbatches, seq_len)))
        self.batch_size = batch_size
        self.n_batches = n_batches
        self.seq_len = seq_len

    def __iter__(self):
        return zrange_iter(self.n_batches, self.batch_size, self.data, self.seq_len)

class zrange_iter:
    def __init__(self, n_batches, batch_size, data, seq_len):
        self.i = 0
        self.n_batches = n_batches
        self.batch_size = batch_size
        self.data = data
        self.seq_len = seq_len

    def __iter__(self):
        # Iterators are iterables too.
        # Adding this functions to make them so.
        return self

    def __next__(self):
        if self.i < self.n_batches:
            idx = self.i*self.batch_size
            src = Variable(self.data[idx:idx+batch_size,:], requires_grad=False)
            tgt = Variable(self.data[idx:idx+batch_size,:], requires_grad=False)
            self.i += 1
            return Batch(src, tgt, 0)
        else:
            raise StopIteration()

In [4]:
z = zrange(batch_size, nbatches, vocab_size, 10)

In [5]:
list(z)

[<attention.Batch at 0x141275a00>,
 <attention.Batch at 0x1411f8da0>,
 <attention.Batch at 0x110d10f50>,
 <attention.Batch at 0x1411f8b60>,
 <attention.Batch at 0x110d10dd0>,
 <attention.Batch at 0x140a3f200>,
 <attention.Batch at 0x140c247d0>,
 <attention.Batch at 0x1412752b0>,
 <attention.Batch at 0x1412755b0>,
 <attention.Batch at 0x141275a30>,
 <attention.Batch at 0x141275a60>,
 <attention.Batch at 0x141275610>,
 <attention.Batch at 0x141275af0>,
 <attention.Batch at 0x141275370>,
 <attention.Batch at 0x141275bb0>,
 <attention.Batch at 0x141275b80>,
 <attention.Batch at 0x141275c40>,
 <attention.Batch at 0x141275c10>,
 <attention.Batch at 0x141275a90>,
 <attention.Batch at 0x141275c70>]

In [6]:
data = torch.from_numpy(np.random.randint(1, vocab_size, size=(1,batch_size*nbatches, d_model)))
data[:, 0] = 1

In [7]:
## testing make_model
model = make_model(vocab_size, vocab_size)


mha h, d_k, d_model:  8 64 512


In [8]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [23]:
def run_epoch(data_iter, model, loss_compute):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg, 
                            batch.src_mask, batch.trg_mask)
        #print("out shape: ", out.size())
        #print("tgt_y shape: ", batch.trg_y.size())
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens.item()
        tokens += batch.ntokens.item()
        if i % 1 == 0:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                    (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    #print("total_loss: ", total_loss)
    #print("total_tokens: ", total_tokens)
    #return total_loss / total_tokens

In [10]:
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm):
        # x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1)) / norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.item() * norm


In [11]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1).long(), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

In [22]:
vocab_size = 10
criterion = LabelSmoothing(size=vocab_size, padding_idx=0, smoothing=0.0)
#criterion = nn.CrossEntropyLoss()
model = make_model(vocab_size, vocab_size, N=2)
#data_generator = data_gen(data,vocab_size,batch_size,nbatches,d_model)
data_generator = zrange(batch_size, nbatches, vocab_size,20)
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))


for epoch in range(1):
    print("epoch: ", epoch)
    model.train()
    run_epoch(data_generator, model, 
              SimpleLossCompute(model.generator, criterion, model_opt))
    model.eval()
    print(run_epoch(data_generator, model, 
                    SimpleLossCompute(model.generator, criterion, None)))




mha h, d_k, d_model:  8 64 512
epoch:  0
target shape:  torch.Size([16, 20])
target :  [[3. 4. 2. 7. 1. 6. 3. 5. 6. 7. 4. 2. 7. 9. 1. 2. 9. 3. 5. 0.]
 [9. 6. 3. 4. 1. 2. 7. 2. 2. 9. 2. 3. 6. 4. 1. 7. 1. 3. 3. 0.]
 [6. 2. 7. 9. 2. 4. 5. 2. 3. 8. 9. 5. 8. 9. 5. 3. 9. 7. 2. 0.]
 [6. 5. 6. 8. 2. 9. 2. 5. 8. 5. 8. 2. 9. 6. 6. 8. 1. 2. 5. 0.]
 [2. 7. 5. 2. 2. 8. 1. 5. 8. 2. 2. 3. 4. 2. 5. 1. 6. 4. 7. 0.]
 [9. 3. 8. 6. 8. 5. 5. 9. 6. 6. 3. 4. 3. 7. 3. 4. 3. 6. 1. 0.]
 [8. 9. 3. 9. 7. 1. 5. 9. 8. 1. 6. 3. 9. 5. 1. 9. 7. 2. 5. 0.]
 [2. 9. 3. 2. 4. 8. 6. 5. 4. 3. 9. 5. 3. 2. 6. 5. 8. 8. 2. 0.]
 [5. 2. 7. 2. 1. 6. 7. 2. 1. 7. 5. 1. 9. 4. 7. 9. 4. 4. 9. 0.]
 [8. 6. 2. 4. 6. 7. 6. 5. 7. 5. 4. 7. 7. 1. 9. 2. 1. 2. 8. 0.]
 [8. 3. 7. 9. 9. 1. 7. 3. 8. 3. 1. 3. 1. 3. 8. 2. 6. 1. 1. 0.]
 [1. 1. 6. 4. 6. 9. 2. 5. 4. 6. 4. 8. 3. 1. 1. 8. 5. 9. 3. 0.]
 [8. 8. 3. 9. 9. 6. 6. 5. 2. 6. 1. 3. 2. 5. 8. 9. 7. 4. 3. 0.]
 [1. 7. 4. 4. 4. 2. 7. 2. 2. 1. 6. 3. 7. 5. 9. 3. 7. 7. 2. 0.]
 [4. 3. 5. 9. 3. 6. 5. 4. 8. 7.

KeyboardInterrupt: 

In [13]:
def greedy_decode(model, source, source_mask, max_decode_len, start_symbol):
    encoder_outputs = model.encode(source, source_mask) # todo(annhe): check source_masking
    print("encoder outputs shape: ", encoder_outputs.size())
    ys = torch.ones(1,1).fill_(start_symbol).type_as(source.data) # create a 1x1 array with element '<sos>'
    for i in range(max_decode_len-1):
        print("i: ", i)
        print("source mask shape: ", source_mask.size())
        print("ys shape: ", ys.size())
        print("source shape: ", source.size())
        output_mask = torch.Tensor(future_mask(ys.size(1)).type_as(src.data))
        print("output mask shape: ", output_mask.size())
        output = model.decode(encoder_outputs, source_mask, torch.Tensor(ys),
                              output_mask)
        print(output.size())
        # output is size N x L x D
        # we need to pass it through the generator
        prob = model.generator(output[:,-1,:])
        print(prob.size())
        _, vocab_symbol = torch.max(prob, dim=1)
        vocab_symbol = vocab_symbol.detach().unsqueeze(0)
        ys = torch.cat([ys, vocab_symbol], dim=1)
        #print(ys.size())
        #print(ys.size(1))
model.eval()
src = Variable(torch.LongTensor([[1,2,3,4,5,6,7]]) )
src_mask = Variable(torch.ones(1, 1, 7) )
print(greedy_decode(model, src, src_mask, max_decode_len=7, start_symbol=1))
    

mha query shape:  torch.Size([1, 7, 512])
mha key shape:  torch.Size([1, 7, 512])
mha value shape:  torch.Size([1, 7, 512])
attention query size:  torch.Size([1, 7, 64])
attention key size:  torch.Size([1, 7, 64])
attention value size:  torch.Size([1, 7, 64])
attention mask size:  torch.Size([1, 1, 7])
attention query shape:  torch.Size([1, 7, 64])
attention key tranpose shape:  torch.Size([1, 64, 7])
mask size:  torch.Size([1, 1, 7])
attention_weights size  torch.Size([1, 7, 7])
attention query size:  torch.Size([1, 7, 64])
attention key size:  torch.Size([1, 7, 64])
attention value size:  torch.Size([1, 7, 64])
attention mask size:  torch.Size([1, 1, 7])
attention query shape:  torch.Size([1, 7, 64])
attention key tranpose shape:  torch.Size([1, 64, 7])
mask size:  torch.Size([1, 1, 7])
attention_weights size  torch.Size([1, 7, 7])
attention query size:  torch.Size([1, 7, 64])
attention key size:  torch.Size([1, 7, 64])
attention value size:  torch.Size([1, 7, 64])
attention mask siz

In [14]:
test_embedding = nn.Embedding(10, 5)
test_input = np.array([0,1,9])
embedded = test_embedding(torch.Tensor(test_input).long())

In [15]:
print(embedded)

tensor([[-0.6066,  1.6321, -0.7600, -1.2304, -0.5890],
        [ 0.6416, -0.3271,  1.3717,  0.2169,  0.8318],
        [-2.0341, -1.1354,  0.1077,  0.5974, -0.1197]],
       grad_fn=<EmbeddingBackward0>)


In [16]:
ys = torch.ones(1,1).fill_(1)

In [17]:
print(ys)


tensor([[1.]])


In [18]:
new_ys = torch.ones(1)

In [19]:
print(new_ys)

tensor([1.])


In [20]:
src = Variable(torch.LongTensor([[1,2,3,4,5,6,7,8,9,9]]) )
src_mask = Variable(torch.ones(1, 1, 10) )
encoder_outputs = model.encode(src, src_mask)

mha query shape:  torch.Size([1, 10, 512])
mha key shape:  torch.Size([1, 10, 512])
mha value shape:  torch.Size([1, 10, 512])
attention query size:  torch.Size([1, 10, 64])
attention key size:  torch.Size([1, 10, 64])
attention value size:  torch.Size([1, 10, 64])
attention mask size:  torch.Size([1, 1, 10])
attention query shape:  torch.Size([1, 10, 64])
attention key tranpose shape:  torch.Size([1, 64, 10])
mask size:  torch.Size([1, 1, 10])
attention_weights size  torch.Size([1, 10, 10])
attention query size:  torch.Size([1, 10, 64])
attention key size:  torch.Size([1, 10, 64])
attention value size:  torch.Size([1, 10, 64])
attention mask size:  torch.Size([1, 1, 10])
attention query shape:  torch.Size([1, 10, 64])
attention key tranpose shape:  torch.Size([1, 64, 10])
mask size:  torch.Size([1, 1, 10])
attention_weights size  torch.Size([1, 10, 10])
attention query size:  torch.Size([1, 10, 64])
attention key size:  torch.Size([1, 10, 64])
attention value size:  torch.Size([1, 10,

In [21]:
output_mask = future_mask(new_ys.size(0))
output = model.decode(new_ys,encoder_outputs, src_mask,
                              output_mask)

decoder x shape:  torch.Size([1, 1, 10, 512])
decoder encoder_outputs shape:  torch.Size([1])
decoder layer m (encoder outputs) size:  torch.Size([1])
mha query shape:  torch.Size([1, 1, 10, 512])
mha key shape:  torch.Size([1, 1, 10, 512])
mha value shape:  torch.Size([1, 1, 10, 512])
attention query size:  torch.Size([1, 1, 10, 64])
attention key size:  torch.Size([1, 1, 10, 64])
attention value size:  torch.Size([1, 1, 10, 64])
attention mask size:  torch.Size([1, 1, 1])
attention query shape:  torch.Size([1, 1, 10, 64])
attention key tranpose shape:  torch.Size([1, 1, 64, 10])
mask size:  torch.Size([1, 1, 1])
attention_weights size  torch.Size([1, 1, 10, 10])
attention query size:  torch.Size([1, 1, 10, 64])
attention key size:  torch.Size([1, 1, 10, 64])
attention value size:  torch.Size([1, 1, 10, 64])
attention mask size:  torch.Size([1, 1, 1])
attention query shape:  torch.Size([1, 1, 10, 64])
attention key tranpose shape:  torch.Size([1, 1, 64, 10])
mask size:  torch.Size([1,

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 512x64)

In [None]:
print(new_ys)


In [None]:
print(new_ys.long())