In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from queue import PriorityQueue
import numpy as np
import torchtext
import tqdm
from torchnlp.metrics import get_moses_multi_bleu
from torchtext.data import Field, BucketIterator
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

import tensorflow as tf
import tensorflow_datasets as tfds
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
from io import BytesIO

import linecache
import sys
import os
import re
import random
import time
import operator

In [4]:
!nvidia-smi

Thu Nov  7 14:28:53 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN RTX           Off  | 00000000:DA:00.0 Off |                  N/A |
| 40%   27C    P8     1W / 280W |      0MiB / 24220MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

## Loading the dataset

In [5]:
try:
    os.mkdir("./datasets")
except FileExistsError:
    print("Directories already exists")

# getting descriptions
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno -O ./datasets/all.desc

# getting code
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code -O ./datasets/all.code

Directories already exists
--2019-11-07 14:28:56--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1382085 (1.3M) [text/plain]
Saving to: './datasets/all.desc'


2019-11-07 14:28:56 (17.2 MB/s) - './datasets/all.desc' saved [1382085/1382085]

--2019-11-07 14:28:56--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 906732 (885K) [text/plain]
Saving to: './datasets/all.code'


2019-11-07 14:28:57 (10.4 MB/s) - './datasets/all.code' saved [906732

## Creating a token text encoder
An encoder will take a file and a splitting function and return an object able to encode and decode a string. It will also be able to save a vocab file and retrieve from file.

In [6]:
text = " append rel_to to string 'ForeignKey, (substitute the result for field_type.)"

# looks like code split need parenthesis to be matched in the same string, if not it gives an error...
def code_split(s):
    return [x.string for x in tokenize(BytesIO(s.encode('utf-8')).readline) if x.string != '' and x.string != "\n" and not x.string.isspace()][1:]

print(code_split(text))

['append', 'rel_to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'substitute', 'the', 'result', 'for', 'field_type', '.', ')']


In [7]:
text = " append rel_to to string 'ForeignKey, (subs__titute the result' for field_type."

def string_split(s):
    return list(filter(lambda x: x != '' and x != "\n" and not x.isspace(), re.split('(_|\W)', s))) # this will chunk all code properly by plits strings with quotes
#     return list(filter(lambda x: x != '' and x != "\n" and not x.isspace(), re.split('(\\\'.*?\\\'|\\\".*?\\\"|_|\W)', s))) # this keeps the strings intact

print(string_split(text))

['append', 'rel', '_', 'to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'subs', '_', '_', 'titute', 'the', 'result', "'", 'for', 'field', '_', 'type', '.']


## Making the input pipeline

In [8]:
def corpus_to_array(src_fp, tgt_fp):
    lines = []
    with open(src_fp, "r") as src_file, open(tgt_fp, "r") as tgt_file:
        for src, tgt in zip(src_file, tgt_file):
            lines.append((src, tgt))
    return lines

In [9]:
def filter_corpus(data, max_seq_length=200, tokenizer=string_split):
    return [(src, tgt) for src, tgt in data if len(string_split(src)) <= max_seq_length and len(string_split(tgt)) <= max_seq_length]

In [10]:
def samples_to_dataset(samples, src_field, tgt_field):
    """
    Args:
        samples: [(src_string),(tgt_string)]
        src/tgt_tokenizer: a func that takes a string and returns an array of strings
    """
    examples = []
    
    for sample in samples:
        src_string, tgt_string = sample
        examples.append(torchtext.data.Example.fromdict({"src":src_string, "tgt":tgt_string}, 
                                        fields={"src":("src",src_field), "tgt":("tgt",tgt_field)}))
        
    dataset = torchtext.data.Dataset(examples,fields={"src":src_field, "tgt":tgt_field})
    return dataset

In [11]:
data = corpus_to_array("datasets/all.desc", "datasets/all.code")
random.shuffle(data)

In [12]:
print("Max src length:", max([len(string_split(src)) for src, tgt in data]))
print("Max tgt length:", max([len(string_split(tgt)) for src, tgt in data]))

Max src length: 586
Max tgt length: 1087


In [13]:
print("Full dataset size:", len(data))
max_seq_length=200
data = filter_corpus(data, max_seq_length=200, tokenizer=string_split)
print("Limited dataset size:", len(data))

Full dataset size: 18805
Limited dataset size: 18781


In [14]:
SRC_TEXT = Field(sequential=True, tokenize=string_split, init_token='<sos>',eos_token='<eos>')
TGT_TEXT = Field(sequential=True, tokenize=string_split, init_token='<sos>',eos_token='<eos>')

dataset = samples_to_dataset(data, SRC_TEXT, TGT_TEXT)

train_dataset, val_dataset = dataset.split([0.9,0.1])

## Debugging dataset
This will be a small dataset of only 3 or 4 sentences to ensure the model can overfit.

In [15]:
# debug_data = [
#     ("my favourite foods are banana and toast","would you like banana and toast ?"),
#     ("my favourite foods are eggs and bacon and beans","would you like eggs and bacon and beans ?"),
#     ("my favourite food is chocolate","would you like chocolate ?"),
#     ("my favourite food is avocado","would you like avocado ?")
# ]

# other_data = [
#     ("what age is she ?", "she is 8 years old"),
#     ("what age is he ?", "he is 4 years old"),
#     ("how old are you ?", "i am 22 years old"),
#     ("how old am i ?", "you are 28 years old")
# ]

# SRC_TEXT = Field(sequential=True, tokenize=string_split, init_token='<sos>',eos_token='<eos>')
# TGT_TEXT = Field(sequential=True, tokenize=string_split, init_token='<sos>',eos_token='<eos>')

# train_dataset = val_dataset = samples_to_dataset(other_data, SRC_TEXT, TGT_TEXT)

# # train_dataset, val_dataset = dataset.split([0.7,0.3])

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(0) # choose GPU from nvidia-smi 
print("Using:", device)

Using: cuda


In [17]:
SRC_TEXT.build_vocab(train_dataset)
TGT_TEXT.build_vocab(train_dataset)


sample = dataset[2].src
for tok, id in zip(sample, SRC_TEXT.numericalize([sample])):
    print("{} -> {}".format(tok, id.numpy()[0]))

define -> 30
method -> 16
add -> 101
_ -> 5
arguments -> 24
with -> 9
self -> 10
class -> 31
instance -> 59
and -> 14
parser -> 102
as -> 41
arguments -> 24
. -> 4


In [18]:
batch_size = 32

train_iterator = BucketIterator(
    train_dataset,
    batch_size = batch_size,
    repeat=True,
#     shuffle=True,
    sort_key = lambda x: len(x.src)+len(x.tgt),
    device = device)

valid_iterator = BucketIterator(val_dataset,
    batch_size = batch_size,
    sort_key = lambda x: len(x.src)+len(x.tgt),
    device = device)

# The iterator generates batches with padded length for sequences with similar sizes, a batch is [seq_length, batch_size]

for i, batch in enumerate(train_iterator):
    idx = 0
    print([SRC_TEXT.vocab.itos[id] for id in batch.src.cpu().numpy()[:,idx]])
    print(batch.src.cpu().numpy()[:,idx])
    print(batch.tgt.cpu().numpy()[:,idx])
    break

['<sos>', 'pop', 'the', 'value', 'under', 'the', "'", 'file', '_', 'path', "'", 'key', 'of', 'kwargs', 'dictionary', ',', 'substitute', 'it', 'for', 'self', '.', 'file', '_', 'path', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[  2 917   7  25  50   7   8  76   5  65   8  35  18  89  36   6  21  42
  13  10   4  76   5  65   4   3   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1]
[  2  12   5  64   4  37  10  52   5 233   6   9  64   4  37   9   7   3
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1]


## Building the model


Sample transformer without positional encoding, it uses the built in transformer model

In [19]:
rand_transformer_model = nn.Transformer() # uses default hyperparameters
src = torch.rand((10, 32, 512)) # [src_seq_length, batch_size, embedding_size]
tgt = torch.rand((20, 32, 512)) # [tgt_seq_length, batch_size, embedding_size]
rand_transformer_model(src, tgt).shape # [tgt_seq_length, batch_size, embedding_size]

torch.Size([20, 32, 512])

In [20]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [21]:
class TransformerModel(nn.Module):

    def __init__(self, src_vocab_size, tgt_vocab_size, embedding_size=512, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        
        self.embedding_size = embedding_size
        self.pos_encoder = PositionalEncoding(embedding_size, dropout)
        self.src_encoder = nn.Embedding(src_vocab_size, embedding_size)
        self.tgt_encoder = nn.Embedding(tgt_vocab_size, embedding_size)
        
        self.transformer = nn.Transformer(d_model=embedding_size, nhead=8, num_encoder_layers=4, num_decoder_layers=4, dim_feedforward=1024)
        self.decoder = nn.Linear(embedding_size, tgt_vocab_size)

        self.init_weights()
        self.tgt_mask = None

    def _generate_square_subsequent_mask(self, sz):
#         noise_e = 0.05 if self.training else 0.0 # this is code to add noise to the decoding process during training
        noise_e = 0.0 if self.training else 0.0
        noise_mask = (torch.rand(sz,sz) > noise_e).float()

        mask = (torch.triu(torch.ones(sz,sz))).transpose(0, 1)
        mask = torch.mul(mask, noise_mask)
        v = (torch.sum(mask, dim=-1) == 0).float()

        fix_mask = torch.zeros(sz,sz)
        fix_mask[:,0] = 1.0
        v = v.repeat(sz, 1).transpose(0,1)
        fix_mask = torch.mul(fix_mask,v)

        mask += fix_mask
        
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.src_encoder.weight.data.uniform_(-initrange, initrange)
        self.tgt_encoder.weight.data.uniform_(-initrange, initrange)
        
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, tgt):
        self.tgt_mask = self._generate_square_subsequent_mask(len(tgt)).to(device)

        src = self.src_encoder(src) * math.sqrt(self.embedding_size)
        src = self.pos_encoder(src)
        
        tgt = self.tgt_encoder(tgt) * math.sqrt(self.embedding_size)
        tgt = self.pos_encoder(tgt)
        
        output = self.transformer(src, tgt, tgt_mask=self.tgt_mask)
        output = self.decoder(output)
        return output

In [22]:
src_vocab_size = len(SRC_TEXT.vocab.itos)
tgt_vocab_size = len(TGT_TEXT.vocab.itos)

model = TransformerModel(src_vocab_size, tgt_vocab_size, dropout=0.2).to(device) 

In [23]:
model._generate_square_subsequent_mask(4)

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [24]:
noise_e = 0.5
noise_mask = (torch.rand(10,10) > noise_e).float()

mask = (torch.triu(torch.ones(10,10))).transpose(0, 1)
mask = torch.mul(mask, noise_mask)
v = (torch.sum(mask, dim=-1) == 0).float()

fix_mask = torch.zeros(10,10)
fix_mask[:,0] = 1.0
v = v.repeat(10, 1).transpose(0,1)
fix_mask = torch.mul(fix_mask,v)

print(fix_mask)
mask += fix_mask


mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])


tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [-inf, 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [-inf, 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [-inf, -inf, -inf, 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [-inf, 0., 0., -inf, 0., 0., -inf, -inf, -inf, -inf],
        [0., -inf, -inf, -inf, 0., 0., 0., -inf, -inf, -inf],
        [0., -inf, 0., -inf, 0., -inf, 0., -inf, -inf, -inf],
        [0., -inf, -inf, 0., 0., 0., 0., -inf, 0., -inf],
        [0., 0., 0., 0., -inf, 0., -inf, -inf, -inf, 0.]])

In [25]:
model.transformer.generate_square_subsequent_mask(4)

tensor([[0., 0., 0., 0.],
        [-inf, 0., 0., 0.],
        [-inf, -inf, 0., 0.],
        [-inf, -inf, -inf, 0.]])

In [26]:
(torch.tensor([float('-inf'),0])).softmax(-1)

tensor([0., 1.])

In [27]:
src = torch.randint(0, src_vocab_size, (10,8), device=device) # [src_seq_length, batch_size]
tgt = torch.randint(0, tgt_vocab_size, (33,8), device=device) # [src_seq_length, batch_size]
model(src, tgt).shape 

torch.Size([33, 8, 4221])

In [28]:
def greedy_decode_batch_ids(encoder_input, max_seq_length=50):
    batch_len = encoder_input.shape[1]
    sos_id = TGT_TEXT.vocab.stoi["<sos>"]
    decoder_input = torch.zeros((1, batch_len), dtype=torch.long, device=device).fill_(sos_id)

    for i in range(max_seq_length):
        output = model(encoder_input, decoder_input)
        last_pred = output[-1:].argmax(dim=2)

        decoder_input = torch.cat((decoder_input, last_pred))
    return decoder_input

In [29]:
class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward

In [30]:
%load_ext line_profiler

In [31]:
def sum_of_lists(N):
    total = 0
    for i in range(5):
        L = [j ^ (j >> i) for j in range(N)]
        total += sum(L)
    return total

In [32]:
%lprun -f beam_decode beam_decode(model, batch_size=1, encoder_states=src_ids)

NameError: name 'src_ids' is not defined

In [33]:
torch.tensor([[1,2,3],[4,5,6]])[:, :2].view(-1,2)

tensor([[1, 2],
        [4, 5]])

In [34]:
for a, b in torch.tensor([[1,2],[3,4]]):
    print(a, b)

tensor(1) tensor(2)
tensor(3) tensor(4)


In [43]:
def beam_decode(model, encoder_states):
    '''
    :param target_tensor: target indexes tensor of shape [B, T] where B is the batch size and T is the maximum length of the output sentence
    :param decoder_hidden: input tensor of shape [1, B, H] for start of the decoding
    :param encoder_outputs: if you are using attention mechanism you can pass encoder outputs, [T, B, H] where T is the maximum length of input sentence
    :return: decoded_batch
    '''

    beam_width = 10
    topk = 3  # how many sentence do you want to generate
    decoded_batch = []
    
    batch_size = encoder_states.shape[1]
    
    SOS_token = TGT_TEXT.vocab.stoi["<sos>"]
    EOS_token = TGT_TEXT.vocab.stoi["<eos>"]
    MAX_LENGTH = 7

    # decoding goes all batches at the same time
    encoder_input = encoder_states

    # Start with the start of the sentence token
    decoder_input = torch.LongTensor([[SOS_token]*batch_size]).to(device)

    # Number of sentence to generate
    batch_endnodes = [[] for i in range(batch_size)]
    number_required = topk

    # starting node -  hidden vector, previous node, word id, logp, length
    batch_node = [BeamSearchNode(decoder_input, None, SOS_token, 0, 1) for i in range(batch_size)]
    batch_nodes = [PriorityQueue() for i in range(batch_size)]

    # start the queue
    for nodes, node in zip(batch_nodes, batch_node):
        nodes.put((-node.eval(), node))
        
    batch_qsize = [1 for i in range(batch_size)]

    # start beam search
    while True:
        # give up when decoding takes too long for the first batch, placeholder for now
        print(batch_qsize)
        if batch_qsize[0] > 200: break

        # fetch the best node
        best_nodes = [nodes.get() for nodes in batch_nodes]
#         score, n = nodes.get()
#             decoder_input = n.wordid
#         decoder_input = n.h
        
        finished_nodes = [True if n.wordid == EOS_token and n.prevNode != None else False for (score, n) in best_nodes]
        print(finished_nodes)
        
        working_nodes = []
        working_node_id = 0
        working_nodes_idx = []
        for endnodes, (score, n) in zip(batch_endnodes, best_nodes):
            if n.wordid == EOS_token and n.prevNode != None and len(endnodes) < number_required:
                endnodes.append((score, n))
            else:
                working_nodes.append((score, n))
                working_nodes_idx.append(working_node_id)
            working_node_id += 1
        
        if all([len(endnodes) >= number_required for endnodes in batch_endnodes]):
            break
        
        num_working_nodes = len(working_nodes)
        print(working_nodes)
        
        step_encoder_input = encoder_input[:,:num_working_nodes].view(-1,num_working_nodes)
        step_decoder_input = torch.cat([n.h for (score, n) in working_nodes])
#         print("step_encoder_input shape:", step_encoder_input.shape)
        step_decoder_output = model(step_encoder_input, step_decoder_input)
        step_token_logits = step_decoder_output[-1]
        
        print(step_token_logits)

        # PUT HERE REAL BEAM SEARCH OF TOP
        log_prob, indexes = torch.topk(step_token_logits, beam_width)
        print(log_prob.shape)
        
    
        nextnodes = []
        
        for batch_id in working_nodes_idx:
            for new_k in range(beam_width):
                decoded_t = indexes[batch_id][new_k]
                log_p = log_prob[batch_id][new_k].item()
#                 print("decoder_input shape", step_decoder_input.shape)
                print("step_decoder_input shape:", step_decoder_input[:,batch_id].view(-1,1))
                print("decoded_t shape:", decoded_t.view(1,-1))
                decoder_input = torch.cat((step_decoder_input[:,batch_id].view(-1,1),decoded_t.view(1,-1)))
                print("decoder_input shape:", decoder_input.shape)
                node = BeamSearchNode(decoder_input, working_nodes[batch_id], decoded_t.cpu().item(), n.logp + log_p, n.leng + 1)
                score = -node.eval()
                batch_nodes[batch_id].put((score, node))
                batch_qsize[batch_id] += 1
        
        # put them into queue
#         for i in range(len(nextnodes)):
#             score, nn = nextnodes[i]
#             nodes.put((score, nn))
#             # increase qsize
#         qsize += len(nextnodes) - 1

    # choose nbest paths, back trace them
    if len(endnodes) == 0:
        endnodes = [nodes.get() for _ in range(topk)]

    utterances = []
    for score, n in sorted(endnodes, key=operator.itemgetter(0)):
        utterance = []
        utterance.append(n.wordid)
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(n.wordid)

        utterance = utterance[::-1]
        utterances.append(utterance)

    decoded_batch.append(utterances)

    return decoded_batch

sent1 = ["<sos>"] + SRC_TEXT.preprocess('try,') + ["<eos>"] + ["<pad>"]
sent2 = ["<sos>"] + SRC_TEXT.preprocess("if not,") + ["<eos>"]
src_ids = SRC_TEXT.numericalize([sent1, sent2], device=device)
# print("input ids:", src_ids)
[len(x) for x in beam_decode(model, encoder_states=src_ids)[0]]

[1, 1]
[False, False]
[(-0.0, <__main__.BeamSearchNode object at 0x7f50c5c2b908>), (-0.0, <__main__.BeamSearchNode object at 0x7f50c5c2b358>)]
step_encoder_input shape: torch.Size([5, 2])
step_decoder_input shape: torch.Size([2, 2])
tensor([[-2.2347e+00, -1.6264e-01,  1.6521e+00,  ...,  3.6829e-01,
          1.2575e+00,  1.8096e-01],
        [-1.1575e+00,  4.1231e-01,  3.7134e-04,  ..., -2.6348e-01,
         -1.7003e+00, -8.2100e-02]], device='cuda:0', grad_fn=<SelectBackward>)
torch.Size([2, 10])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input shape: torch.Size([3, 1])
decoder_input sha

RuntimeError: the batch number of src and tgt must be equal

In [88]:
def beam_decode(model, batch_size, encoder_states):
    '''
    :param target_tensor: target indexes tensor of shape [B, T] where B is the batch size and T is the maximum length of the output sentence
    :param decoder_hidden: input tensor of shape [1, B, H] for start of the decoding
    :param encoder_outputs: if you are using attention mechanism you can pass encoder outputs, [T, B, H] where T is the maximum length of input sentence
    :return: decoded_batch
    '''

    beam_width = 10
    topk = 3  # how many sentence do you want to generate
    decoded_batch = []
    
    SOS_token = TGT_TEXT.vocab.stoi["<sos>"]
    EOS_token = TGT_TEXT.vocab.stoi["<eos>"]
    MAX_LENGTH = 7

    # decoding goes sentence by sentence
    for idx in range(batch_size):
        encoder_input = encoder_states[:, idx].view(-1,1)
        
        # Start with the start of the sentence token
        decoder_input = torch.LongTensor([[SOS_token]]).to(device)

        # Number of sentence to generate
        endnodes = []
        number_required = min((topk + 1), topk - len(endnodes))

        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_input, None, SOS_token, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1

        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 200: break

            # fetch the best node
            score, n = nodes.get()
#             decoder_input = n.wordid
            decoder_input = n.h

            if n.wordid == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue

            # decode for one step using decoder
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
#             print(encoder_input)
#             print(decoder_input)
            decoder_output = model(encoder_input, decoder_input)
            last_token_logits = decoder_output[-1]

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(last_token_logits, beam_width)
            nextnodes = []

            for new_k in range(beam_width):
                decoded_t = indexes[0][new_k]
                log_p = log_prob[0][new_k].item()
                decoder_input = torch.cat((decoder_input,decoded_t.view(1,-1)))
                node = BeamSearchNode(decoder_input, n, decoded_t.cpu().item(), n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nextnodes.append((score, node))

            # put them into queue
            for i in range(len(nextnodes)):
                score, nn = nextnodes[i]
                nodes.put((score, nn))
                # increase qsize
            qsize += len(nextnodes) - 1

        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(topk)]

        utterances = []
        for score, n in sorted(endnodes, key=operator.itemgetter(0)):
            utterance = []
            utterance.append(n.wordid)
            # back trace
            while n.prevNode != None:
                n = n.prevNode
                utterance.append(n.wordid)

            utterance = utterance[::-1]
            utterances.append(utterance)

        decoded_batch.append(utterances)

    return decoded_batch

sent1 = ["<sos>"] + SRC_TEXT.preprocess('try,') + ["<eos>"] + ["<pad>"]
# sent2 = ["<sos>"] + SRC_TEXT.preprocess("if not,") + ["<eos>"]
src_ids = SRC_TEXT.numericalize([sent1], device=device)
# print("input ids:", src_ids)
[len(x) for x in beam_decode(model, batch_size=1, encoder_states=src_ids)]

[20, 18, 20]

In [174]:
a = torch.tensor([1,2,3])
b = torch.tensor([1,2,3])
torch.cat((a,b))

torch.tensor(3).cpu().item()

3

In [205]:
def nltk_bleu(refrence, prediction):
    """
    Implementation from ReCode
    and moses multi belu script sets BLEU to 0.0 if len(toks) < 4
    """
    ngram_weights = [0.25] * min(4, len(refrence))
    return sentence_bleu([refrence], prediction, weights=ngram_weights, 
                          smoothing_function=SmoothingFunction().method3)

nltk_bleu(np.array([1,2,3,4,5,6]), np.array([1,2,5,6]))

0.2740311596835683

In [206]:
def evaluate():
    model.eval() # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
        sources = []
        results = []
        targets = []
        BLEU_scores = []
        for i, batch in enumerate(valid_iterator):
            encoder_inputs = batch.src
            target = batch.tgt
            predictions = greedy_decode_batch_ids(encoder_inputs, max_seq_length=20)
            
            sources += encoder_inputs.transpose(0,1).cpu().tolist()
            results += predictions.transpose(0,1).cpu().tolist()
            targets += target.transpose(0,1).cpu().tolist()
            if i % 50 == 0:
                print("| EVALUATION | {:5d}/{:5d} batches |".format(i, len(valid_iterator)))
        
        for r_ids, target in zip(results, targets):
            eos_id = TGT_TEXT.vocab.stoi["<eos>"]
            eos_index = r_ids.index(eos_id) if eos_id in r_ids else None
            cut_ids = r_ids[:eos_index]
            filtered_ids = [id for id in cut_ids if id not in [0,1,2,3]]
            filtered_target_ids = [id for id in target if id not in [0,1,2,3]]
            BLEU_scores.append(nltk_bleu(filtered_target_ids, filtered_ids))
        
        with open("out.txt", "w") as out_fp:
            for source, result, target, BLEU in zip(sources, results, targets, BLEU_scores):
                eos_id = TGT_TEXT.vocab.stoi["<eos>"]
                eos_index = result.index(eos_id) if eos_id in result else None
                cut_ids = result[:eos_index]
                filtered_ids = [id for id in cut_ids if id not in [0,1,2,3]]
                filtered_target_ids = [id for id in target if id not in [0,1,2,3]]
                filtered_source_ids = [id for id in source if id not in [0,1,2,3]]
                
                out_fp.write("SRC  :" + " ".join([SRC_TEXT.vocab.itos[id] for id in filtered_source_ids]) + "\n")
                out_fp.write("TGT  :" + " ".join([TGT_TEXT.vocab.itos[id] for id in filtered_target_ids]) + "\n")
                out_fp.write("PRED :" + " ".join([TGT_TEXT.vocab.itos[id] for id in filtered_ids]) + "\n")
                out_fp.write("BLEU :" + str(BLEU) + "\n")
                out_fp.write("\n")
                
        print("| EVALUATION | BLEU: {:5.2f} |".format(np.average(BLEU_scores)))
        

In [207]:
def train_step(batch):
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    tgt_vocab_size = len(TGT_TEXT.vocab.itos)
    encoder_input = batch.src
    decoder_input = batch.tgt[:-1]
    targets = batch.tgt[1:]

    optimizer.zero_grad()
    output = model(encoder_input, decoder_input)

    loss = criterion(output.view(-1, tgt_vocab_size), targets.view(-1))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    elapsed = time.time() - start_time
    return loss

In [208]:
criterion = nn.CrossEntropyLoss(ignore_index=TGT_TEXT.vocab.stoi['<pad>'])
lr = 0.005 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.99)

In [54]:
def train(steps=10000, log_interval=200, learning_interval=4000, eval_interval=1000):
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    step = 1
    for batch in train_iterator:
        loss = train_step(batch)
        total_loss += loss.item()
        
        if step % log_interval == 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| {:5d}/{:5d} steps | '
                  'lr {:02.4f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    step, steps, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        
        if step % eval_interval == 0:
            print("Evaluating model")
            evaluate()
            model.train()
        
        if step % learning_interval == 0:
            scheduler.step()
        
        step += 1
        if step >= steps:
            print("Finished training")
            return

train(steps=1000000,eval_interval=8000,log_interval=200)

|   200/1000000 steps | lr 0.0049 | ms/batch 44.83 | loss  4.11 | ppl    60.92
|   400/1000000 steps | lr 0.0049 | ms/batch 45.08 | loss  4.11 | ppl    61.20
|   600/1000000 steps | lr 0.0049 | ms/batch 46.17 | loss  4.10 | ppl    60.58
|   800/1000000 steps | lr 0.0049 | ms/batch 45.01 | loss  4.07 | ppl    58.77
|  1000/1000000 steps | lr 0.0049 | ms/batch 44.63 | loss  4.07 | ppl    58.62
|  1200/1000000 steps | lr 0.0049 | ms/batch 43.54 | loss  4.02 | ppl    55.96
|  1400/1000000 steps | lr 0.0049 | ms/batch 45.78 | loss  4.07 | ppl    58.60
|  1600/1000000 steps | lr 0.0049 | ms/batch 45.38 | loss  4.00 | ppl    54.63
|  1800/1000000 steps | lr 0.0049 | ms/batch 45.23 | loss  4.02 | ppl    55.71
|  2000/1000000 steps | lr 0.0049 | ms/batch 46.01 | loss  3.99 | ppl    54.26
|  2200/1000000 steps | lr 0.0049 | ms/batch 45.43 | loss  3.97 | ppl    53.13
|  2400/1000000 steps | lr 0.0049 | ms/batch 45.25 | loss  3.95 | ppl    52.08
|  2600/1000000 steps | lr 0.0049 | ms/batch 45.31 |

KeyboardInterrupt: 

In [55]:
torch.save(model.state_dict(), "./saved_model.pytorch")

In [39]:
src_vocab_size = len(SRC_TEXT.vocab.itos)
tgt_vocab_size = len(TGT_TEXT.vocab.itos)

model = TransformerModel(src_vocab_size, tgt_vocab_size, dropout=0.2).to(device) 
model.load_state_dict(torch.load("./saved_model.pytorch"))
model.eval()

RuntimeError: Error(s) in loading state_dict for TransformerModel:
	size mismatch for src_encoder.weight: copying a param with shape torch.Size([4566, 512]) from checkpoint, the shape in current model is torch.Size([4548, 512]).
	size mismatch for tgt_encoder.weight: copying a param with shape torch.Size([4198, 512]) from checkpoint, the shape in current model is torch.Size([4183, 512]).
	size mismatch for decoder.weight: copying a param with shape torch.Size([4198, 512]) from checkpoint, the shape in current model is torch.Size([4183, 512]).
	size mismatch for decoder.bias: copying a param with shape torch.Size([4198]) from checkpoint, the shape in current model is torch.Size([4183]).

In [60]:
evaluate()

| EVALUATION |     0/   59 batches |
| EVALUATION |    50/   59 batches |
| EVALUATION | BLEU:  0.21 |


### Evaluating one sample

In [32]:
" ".join([SRC_TEXT.vocab.itos[i] for i in [ 2,21,83,13,10, 4, 5, 5,83, 4, 3]])

'<sos> substitute args for self . _ _ args . <eos>'

In [33]:
" ".join([TGT_TEXT.vocab.itos[i] for i in [ 2,12,5,4]])

'<sos> self . _'

In [91]:
def translate(s):
    src_ids = SRC_TEXT.numericalize([["<sos>"] + SRC_TEXT.preprocess(s) + ["<eos>"]], device=device)
#     src_ids = torch.tensor([ [2],[21],[83],[13],[10], [4], [5], [5],[83], [4], [3]], device=device)
    print("SRC ids shape:",src_ids)
    model.eval
    with torch.no_grad():
        sos_id = TGT_TEXT.vocab.stoi["<sos>"]
#         decoder_input = torch.zeros((1, 1), dtype=torch.long, device=device).fill_(sos_id)
        decoder_input = torch.tensor(np.array([ [2]]), device=device)
#         print("Decoder input shape:", decoder_input.shape)
        
        for i in range(10):
#             print("Decoder input", decoder_input)
            output = model(src_ids, decoder_input)
#             print(model.tgt_mask)
            print("output:", output)
#             print("predicted ids:", output.argmax(dim=-1))
            last_pred = output[-1:].argmax(dim=2)
#             decoder_input[i+1][0] = last_pred
#             print("last pred:", TGT_TEXT.vocab.itos[last_pred.cpu().numpy()[0][0]], last_pred.cpu().numpy()[0][0])
            print(TGT_TEXT.vocab.itos[last_pred.cpu().numpy()[0][0]],'', end = '')
            
            decoder_input = torch.cat((decoder_input, last_pred))
#             print("Decoder input", decoder_input)
#             break

translate("append value to results .")

SRC ids shape: tensor([[  2],
        [ 72],
        [ 25],
        [ 17],
        [301],
        [  4],
        [  3]], device='cuda:0')
output: tensor([[[nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0')
<unk> output: tensor([[[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0')
<unk> output: tensor([[[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0')
<unk> output: tensor([[[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0')
<unk> output: tensor([[[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan

In [37]:
np.array([torch.tensor([1.0]),torch.tensor([2.0])])

array([1., 2.], dtype=float32)

Moses Multi-BLEU perl script returns 0.0 for any sentence less than 4 tokens long.
It will be best to use a function by NLTK

In [39]:
get_moses_multi_bleu(["this is a test"], ["this is a for"])

0.0