In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from queue import PriorityQueue
import numpy as np
import torchtext
import tqdm
from torchnlp.metrics import get_moses_multi_bleu
from torchtext.data import Field, BucketIterator
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

import tensorflow as tf
import tensorflow_datasets as tfds
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
from io import BytesIO

import linecache
import sys
import os
import re
import random
import time
import operator

from base_transformer import TransformerModel

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device == "cuda":
    torch.cuda.set_device(0) # choose GPU from nvidia-smi 
print("Using:", device)

Using: cuda


In [3]:
!nvidia-smi

Wed Nov 13 12:00:12 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN RTX           Off  | 00000000:B2:00.0 Off |                  N/A |
| 41%   36C    P8     5W / 280W |     10MiB / 24220MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

## Loading the dataset

In [4]:
try:
    os.mkdir("./datasets")
except FileExistsError:
    print("Directories already exists")

# getting descriptions
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno -O ./datasets/all.desc

# getting code
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code -O ./datasets/all.code

Directories already exists
--2019-11-13 12:00:12--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1382085 (1.3M) [text/plain]
Saving to: './datasets/all.desc'


2019-11-13 12:00:12 (15.4 MB/s) - './datasets/all.desc' saved [1382085/1382085]

--2019-11-13 12:00:13--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 906732 (885K) [text/plain]
Saving to: './datasets/all.code'


2019-11-13 12:00:13 (11.6 MB/s) - './datasets/all.code' saved [906732

## Creating a token text encoder
An encoder will take a file and a splitting function and return an object able to encode and decode a string. It will also be able to save a vocab file and retrieve from file.

In [5]:
text = " append rel_to to string 'ForeignKey, (substitute the result for field_type.)"

# looks like code split need parenthesis to be matched in the same string, if not it gives an error...
def code_split(s):
    return [x.string for x in tokenize(BytesIO(s.encode('utf-8')).readline) if x.string != '' and x.string != "\n" and not x.string.isspace()][1:]

print(code_split(text))

['append', 'rel_to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'substitute', 'the', 'result', 'for', 'field_type', '.', ')']


In [6]:
text = " append rel_to to string 'ForeignKey, (subs__titute the result' for field_type."

def string_split(s):
    return list(filter(lambda x: x != '' and x != "\n" and not x.isspace(), re.split('(_|\W)', s))) # this will chunk all code properly by plits strings with quotes
#     return list(filter(lambda x: x != '' and x != "\n" and not x.isspace(), re.split('(\\\'.*?\\\'|\\\".*?\\\"|_|\W)', s))) # this keeps the strings intact

print(string_split(text))

['append', 'rel', '_', 'to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'subs', '_', '_', 'titute', 'the', 'result', "'", 'for', 'field', '_', 'type', '.']


## Making the input pipeline

In [7]:
def corpus_to_array(src_fp, tgt_fp):
    lines = []
    with open(src_fp, "r") as src_file, open(tgt_fp, "r") as tgt_file:
        for src, tgt in zip(src_file, tgt_file):
            lines.append((src, tgt))
    return lines

In [8]:
def filter_corpus(data, max_seq_length=200, tokenizer=string_split):
    return [(src, tgt) for src, tgt in data if len(string_split(src)) <= max_seq_length and len(string_split(tgt)) <= max_seq_length]

In [9]:
def samples_to_dataset(samples, src_field, tgt_field):
    """
    Args:
        samples: [(src_string),(tgt_string)]
        src/tgt_tokenizer: a func that takes a string and returns an array of strings
    """
    examples = []
    
    for sample in samples:
        src_string, tgt_string = sample
        examples.append(torchtext.data.Example.fromdict({"src":src_string, "tgt":tgt_string}, 
                                        fields={"src":("src",src_field), "tgt":("tgt",tgt_field)}))
        
    dataset = torchtext.data.Dataset(examples,fields={"src":src_field, "tgt":tgt_field})
    return dataset

In [10]:
data = corpus_to_array("datasets/all.desc", "datasets/all.code")
random.shuffle(data)

In [11]:
print("Max src length:", max([len(string_split(src)) for src, tgt in data]))
print("Max tgt length:", max([len(string_split(tgt)) for src, tgt in data]))

Max src length: 586
Max tgt length: 1087


In [12]:
print("Full dataset size:", len(data))
max_seq_length=200
data = filter_corpus(data, max_seq_length=200, tokenizer=string_split)
print("Limited dataset size:", len(data))

Full dataset size: 18805
Limited dataset size: 18781


In [13]:
try:
    SRC_TEXT = torch.load("./src_vocab.vcb")
except:
    SRC_TEXT = Field(sequential=True, tokenize=string_split, init_token='<sos>',eos_token='<eos>')

try:
    TGT_TEXT = torch.load("./tgt_vocab.vcb")
except:
    TGT_TEXT = Field(sequential=True, tokenize=string_split, init_token='<sos>',eos_token='<eos>')

dataset = samples_to_dataset(data, SRC_TEXT, TGT_TEXT)

train_dataset, val_dataset = dataset.split([0.9,0.1])

In [14]:
if not hasattr(SRC_TEXT, "vocab"):
    print("creating src vocab")
    SRC_TEXT.build_vocab(train_dataset)
if not hasattr(TGT_TEXT, "vocab"):
    print("creating tgt vocab")
    TGT_TEXT.build_vocab(train_dataset)


sample = dataset[2].src
for tok, id in zip(sample, SRC_TEXT.numericalize([sample])):
    print("{} -> {}".format(tok, id.numpy()[0]))

call -> 19
the -> 7
method -> 16
operator -> 764
. -> 4
attrgetter -> 1799
with -> 9
an -> 12
argument -> 20
_ -> 5
func -> 142
_ -> 5
code -> 141
, -> 6
substitute -> 21
the -> 7
result -> 22
for -> 13
get -> 50
_ -> 5
method -> 16
_ -> 5
code -> 141
. -> 4


## Creating the dataset iterator
This will create a finction returning a different batch. The `train_iterator` is infinitely repeating. while the validation one is not.

In [15]:
batch_size = 32

train_iterator = BucketIterator(
    train_dataset,
    batch_size = batch_size,
    repeat=True,
#     shuffle=True,
    sort_key = lambda x: len(x.src)+len(x.tgt),
    device = device)

valid_iterator = BucketIterator(val_dataset,
    batch_size = batch_size,
    sort_key = lambda x: len(x.src)+len(x.tgt),
    device = device)

# The iterator generates batches with padded length for sequences with similar sizes, a batch is [seq_length, batch_size]

for i, batch in enumerate(train_iterator):
    idx = 0
    print([SRC_TEXT.vocab.itos[id] for id in batch.src.cpu().numpy()[:,idx]])
    print(batch.src.cpu().numpy()[:,idx])
    print(batch.tgt.cpu().numpy()[:,idx])
    break

['<sos>', 'if', 'format', 'is', 'not', 'contained', 'in', '_', 'serializers', ',', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[  2  15 108  11  39 106  38   5 416   6   3   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1]
[  2  14  81  29  24   4 326  11   3   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1]


Sample transformer without positional encoding, it uses the built in transformer model

In [16]:
rand_transformer_model = nn.Transformer() # uses default hyperparameters
src = torch.rand((10, 32, 512)) # [src_seq_length, batch_size, embedding_size]
tgt = torch.rand((20, 32, 512)) # [tgt_seq_length, batch_size, embedding_size]
rand_transformer_model(src, tgt).shape # [tgt_seq_length, batch_size, embedding_size]

torch.Size([20, 32, 512])

## Building the model


In [113]:
src_vocab_size = len(SRC_TEXT.vocab.itos)
tgt_vocab_size = len(TGT_TEXT.vocab.itos)

model = TransformerModel(src_vocab_size, tgt_vocab_size, dropout=0.2).to(device) 

In [18]:
def greedy_decode_batch_ids(encoder_input, max_seq_length=50):
    batch_len = encoder_input.shape[1]
    sos_id = TGT_TEXT.vocab.stoi["<sos>"]
    decoder_input = torch.zeros((1, batch_len), dtype=torch.long, device=device).fill_(sos_id)

    for i in range(max_seq_length):
        output = model(encoder_input, decoder_input)
        last_pred = output[-1:].argmax(dim=2)

        decoder_input = torch.cat((decoder_input, last_pred))
    return decoder_input

In [21]:
%load_ext line_profiler

In [22]:
# %lprun -f beam_decode beam_decode(model, batch_size=1, encoder_states=src_ids)

In [263]:
s = "for every log in existing ,"
sent1 = ["<sos>"] + SRC_TEXT.preprocess(s) + ["<eos>"]
src_ids = SRC_TEXT.numericalize([sent1], device=device)

decode_ids = SRC_TEXT.numericalize([['<sos>', 'self', '.', 'name']], device=device)

output = model(src_ids, decode_ids)
print(output)
print([TGT_TEXT.vocab.itos[f] for f in output.argmax(dim=-1).view(-1)])

tensor([[[ 3.1062,  0.5920, -1.4175,  ...,  0.2437,  0.2524, -1.2225]],

        [[ 2.4309,  0.8538, -1.8844,  ...,  0.6236, -0.1974, -1.6820]],

        [[ 2.7053,  0.8010, -0.7450,  ...,  1.4385, -0.7941, -1.2559]],

        [[ 1.7970,  0.0554, -0.6959,  ...,  0.8729,  0.0657, -1.1797]]],
       device='cuda:0', grad_fn=<AddBackward0>)
['def', 'self', 'raise', '(']


In [60]:
output = greedy_decode_batch_ids(src_ids, max_seq_length=20)
print(output)
print([TGT_TEXT.vocab.itos[f] for f in output.view(-1)])

tensor([[ 2,  2],
        [17, 17],
        [10, 10],
        [12, 12],
        [ 5,  5],
        [ 4,  4],
        [20, 20],
        [ 6,  6],
        [12, 12],
        [ 7,  8],
        [ 3, 25],
        [10,  8],
        [ 9, 25],
        [ 7,  7],
        [ 3, 11],
        [ 7,  3],
        [ 3,  8],
        [ 3, 25],
        [ 3,  7],
        [ 3, 11],
        [ 3,  3]], device='cuda:0')
['<sos>', '<sos>', 'def', 'def', '=', '=', 'self', 'self', '.', '.', '_', '_', 'name', 'name', '(', '(', 'self', 'self', ')', ',', '<eos>', '*', '=', ',', "'", '*', ')', ')', '<eos>', ':', ')', '<eos>', '<eos>', ',', '<eos>', '*', '<eos>', ')', '<eos>', ':', '<eos>', '<eos>']


In [48]:
%lprun -f beam_search.beam_search_decode beam_search.beam_search_decode(model,TGT_TEXT, batch_encoder_ids=src_ids, SOS_token=SOS_token, EOS_token=EOS_token, PAD_token=PAD_token, beam_size=20, max_length=20, num_out=1)

FOUND, -8.140298843383789
FOUND, -8.911083221435547


Timer unit: 1e-06 s

Total time: 4.829 s
File: /nfs/phd_by_carlos/notebooks/beam_search.py
Function: beam_search_decode at line 44

Line #      Hits         Time  Per Hit   % Time  Line Contents
    44                                           def beam_search_decode(model,TGT_TEXT, batch_encoder_ids, beam_size=3, num_out=3, max_length=10, SOS_token=1,EOS_token=2, PAD_token=3):
    45                                               '''
    46                                               :param target_tensor: target indexes tensor of shape [B, T] where B is the batch size and T is the maximum length of the output sentence
    47                                               :param decoder_hidden: input tensor of shape [1, B, H] for start of the decoding
    48                                               :param encoder_outputs: if you are using attention mechanism you can pass encoder outputs, [T, B, H] where T is the maximum length of input sentence
    49                               

In [83]:
import beam_search
import importlib
importlib.reload(beam_search)

sent1 = ["<sos>"] + SRC_TEXT.preprocess("try,") + ["<eos>"] + ["<pad>"]
sent2 = ["<sos>"] + SRC_TEXT.preprocess("for every log in existing ,") + ["<eos>"]
src_ids = SRC_TEXT.numericalize([sent1], device=device)

SOS_token = TGT_TEXT.vocab.stoi["<sos>"]
EOS_token = TGT_TEXT.vocab.stoi["<eos>"]
PAD_token = TGT_TEXT.vocab.stoi["<pad>"]

outputs = beam_search.beam_search_decode(model,TGT_TEXT,
                              batch_encoder_ids=src_ids,
                              SOS_token=SOS_token,
                              EOS_token=EOS_token,
                              PAD_token=PAD_token,
                              beam_size=4,
                              max_length=20,
                              num_out=1)

print(outputs)
print([t[0].view(-1).cpu().tolist() for t in outputs])

for out in outputs:
    for sent in out:
        print([TGT_TEXT.vocab.itos[id] for id in sent.view(-1).cpu().tolist()])
    print()

[[tensor([[ 2],
        [50],
        [11],
        [ 3]], device='cuda:0')]]
[[2, 50, 11, 3]]
['<sos>', 'try', ':', '<eos>']



In [75]:
def nltk_bleu(refrence, prediction):
    """
    Implementation from ReCode
    and moses multi belu script sets BLEU to 0.0 if len(toks) < 4
    """
    ngram_weights = [0.25] * min(4, len(refrence))
    return sentence_bleu([refrence], prediction, weights=ngram_weights, 
                          smoothing_function=SmoothingFunction().method3)

nltk_bleu(np.array([1,2,3,4,5,6]), np.array([1,2,5,6]))

0.2740311596835683

In [77]:
def evaluate(beam_size=1):
    model.eval() # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
        sources = []
        results = []
        targets = []
        BLEU_scores = []
        for i, batch in enumerate(valid_iterator):
            encoder_inputs = batch.src
            target = batch.tgt
            
            predictions = beam_search.beam_search_decode(model,TGT_TEXT,
                              batch_encoder_ids=encoder_inputs,
                              SOS_token=TGT_TEXT.vocab.stoi["<sos>"],
                              EOS_token=TGT_TEXT.vocab.stoi["<eos>"],
                              PAD_token=TGT_TEXT.vocab.stoi["<pad>"],
                              beam_size=beam_size,
                              max_length=20,
                              num_out=1)
            results += [t[0].view(-1).cpu().tolist() for t in predictions]
#             if beam_size == 1:
#                 predictions = greedy_decode_batch_ids(encoder_inputs, max_seq_length=20)
#                 results += predictions.transpose(0,1).cpu().tolist()
#             else:
#                 predictions = beam_decode(model, encoder_inputs)
#                 results += [sent[0] for sent in predictions]
            
            sources += encoder_inputs.transpose(0,1).cpu().tolist()
            targets += target.transpose(0,1).cpu().tolist()
            if i % 20 == 0:
                print("| EVALUATION | {:5d}/{:5d} batches |".format(i, len(valid_iterator)))
        
        for r_ids, target in zip(results, targets):
            eos_id = TGT_TEXT.vocab.stoi["<eos>"]
            eos_index = r_ids.index(eos_id) if eos_id in r_ids else None
            cut_ids = r_ids[:eos_index]
            filtered_ids = [id for id in cut_ids if id not in [0,1,2,3]]
            filtered_target_ids = [id for id in target if id not in [0,1,2,3]]
            BLEU_scores.append(nltk_bleu(filtered_target_ids, filtered_ids))
        
        with open("out.txt", "w") as out_fp:
            for source, result, target, BLEU in zip(sources, results, targets, BLEU_scores):
                eos_id = TGT_TEXT.vocab.stoi["<eos>"]
                eos_index = result.index(eos_id) if eos_id in result else None
                cut_ids = result[:eos_index]
                filtered_ids = [id for id in cut_ids if id not in [0,1,2,3]]
                filtered_target_ids = [id for id in target if id not in [0,1,2,3]]
                filtered_source_ids = [id for id in source if id not in [0,1,2,3]]
                
                out_fp.write("SRC  :" + " ".join([SRC_TEXT.vocab.itos[id] for id in filtered_source_ids]) + "\n")
                out_fp.write("TGT  :" + " ".join([TGT_TEXT.vocab.itos[id] for id in filtered_target_ids]) + "\n")
                out_fp.write("PRED :" + " ".join([TGT_TEXT.vocab.itos[id] for id in filtered_ids]) + "\n")
                out_fp.write("BLEU :" + str(BLEU) + "\n")
                out_fp.write("\n")
            out_fp.write("\n\n| EVALUATION | BLEU: {:5.2f} |\n".format(np.average(BLEU_scores)))
                
        print("| EVALUATION | BLEU: {:5.2f} |".format(np.average(BLEU_scores)))
        

In [110]:
def train_step(batch):
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    tgt_vocab_size = len(TGT_TEXT.vocab.itos)
    encoder_input = batch.src
    decoder_input = batch.tgt[:-1]
    targets = batch.tgt[1:]

    optimizer.zero_grad()
    output = model(encoder_input, decoder_input)

    loss = criterion(output.view(-1, tgt_vocab_size), targets.view(-1))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    elapsed = time.time() - start_time
    return loss

In [111]:
criterion = nn.CrossEntropyLoss(ignore_index=TGT_TEXT.vocab.stoi['<pad>'])
lr = 0.005 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.99)

In [None]:
def train(steps=10000, log_interval=200, learning_interval=4000, eval_interval=1000):
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    step = 1
    for batch in train_iterator:
        loss = train_step(batch)
        total_loss += loss.item()
        
        if step % log_interval == 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| {:5d}/{:5d} steps | '
                  'lr {:02.4f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    step, steps, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        
        if step % eval_interval == 0:
            print("Evaluating model")
            evaluate()
            model.train()
        
        if step % learning_interval == 0:
            scheduler.step()
        
        step += 1
        if step >= steps:
            print("Finished training")
            return

train(steps=1000000,eval_interval=1000000,log_interval=200)

|   200/1000000 steps | lr 0.0050 | ms/batch 41.16 | loss  5.40 | ppl   222.04
|   400/1000000 steps | lr 0.0050 | ms/batch 41.97 | loss  5.32 | ppl   203.64
|   600/1000000 steps | lr 0.0050 | ms/batch 41.82 | loss  5.24 | ppl   188.33
|   800/1000000 steps | lr 0.0050 | ms/batch 43.00 | loss  5.17 | ppl   175.42
|  1000/1000000 steps | lr 0.0050 | ms/batch 41.55 | loss  5.09 | ppl   162.05
|  1200/1000000 steps | lr 0.0050 | ms/batch 41.10 | loss  5.04 | ppl   153.72
|  1400/1000000 steps | lr 0.0050 | ms/batch 42.02 | loss  4.98 | ppl   145.27
|  1600/1000000 steps | lr 0.0050 | ms/batch 44.35 | loss  4.92 | ppl   137.28
|  1800/1000000 steps | lr 0.0050 | ms/batch 43.12 | loss  4.88 | ppl   132.25
|  2000/1000000 steps | lr 0.0050 | ms/batch 43.36 | loss  4.83 | ppl   124.93
|  2200/1000000 steps | lr 0.0050 | ms/batch 39.92 | loss  4.77 | ppl   117.64
|  2400/1000000 steps | lr 0.0050 | ms/batch 43.29 | loss  4.75 | ppl   115.23
|  2600/1000000 steps | lr 0.0050 | ms/batch 41.98 |



|  4200/1000000 steps | lr 0.0050 | ms/batch 42.66 | loss  4.45 | ppl    85.24
|  4400/1000000 steps | lr 0.0050 | ms/batch 42.49 | loss  4.40 | ppl    81.18
|  4600/1000000 steps | lr 0.0050 | ms/batch 42.43 | loss  4.37 | ppl    78.90
|  4800/1000000 steps | lr 0.0050 | ms/batch 44.00 | loss  4.37 | ppl    78.79
|  5000/1000000 steps | lr 0.0050 | ms/batch 41.27 | loss  4.32 | ppl    74.82
|  5200/1000000 steps | lr 0.0050 | ms/batch 43.67 | loss  4.31 | ppl    74.57
|  5400/1000000 steps | lr 0.0050 | ms/batch 43.12 | loss  4.29 | ppl    73.20
|  5600/1000000 steps | lr 0.0050 | ms/batch 42.30 | loss  4.27 | ppl    71.47
|  5800/1000000 steps | lr 0.0050 | ms/batch 43.55 | loss  4.23 | ppl    68.62
|  6000/1000000 steps | lr 0.0050 | ms/batch 42.09 | loss  4.20 | ppl    66.81
|  6200/1000000 steps | lr 0.0050 | ms/batch 44.15 | loss  4.20 | ppl    66.62
|  6400/1000000 steps | lr 0.0050 | ms/batch 44.26 | loss  4.19 | ppl    66.01
|  6600/1000000 steps | lr 0.0050 | ms/batch 43.40 |

In [108]:
torch.save((model, optimizer, scheduler), "./saved_model.pytorch")

In [114]:
src_vocab_size = len(SRC_TEXT.vocab.itos)
tgt_vocab_size = len(TGT_TEXT.vocab.itos)

# model = TransformerModel(src_vocab_size, tgt_vocab_size, dropout=0.2).to(device) 
# model.load_state_dict(torch.load("./saved_model.pytorch"))
(model, optimizer, scheduler) = torch.load("./saved_model.pytorch")
# model.eval()

In [88]:
evaluate(beam_size=1)

| EVALUATION |     0/   59 batches |
| EVALUATION |    20/   59 batches |
| EVALUATION |    40/   59 batches |
| EVALUATION | BLEU:  0.32 |


### Evaluating one sample

In [32]:
" ".join([SRC_TEXT.vocab.itos[i] for i in [ 2,21,83,13,10, 4, 5, 5,83, 4, 3]])

'<sos> substitute args for self . _ _ args . <eos>'

In [33]:
" ".join([TGT_TEXT.vocab.itos[i] for i in [ 2,12,5,4]])

'<sos> self . _'

In [117]:
def translate(s):
    src_ids = SRC_TEXT.numericalize([["<sos>"] + SRC_TEXT.preprocess(s) + ["<eos>"]], device=device)
#     src_ids = torch.tensor([ [2],[21],[83],[13],[10], [4], [5], [5],[83], [4], [3]], device=device)
    print("SRC ids shape:",src_ids)
    model.eval()
    with torch.no_grad():
        sos_id = TGT_TEXT.vocab.stoi["<sos>"]
#         decoder_input = torch.zeros((1, 1), dtype=torch.long, device=device).fill_(sos_id)
        decoder_input = torch.tensor(np.array([ [2]]), device=device)
#         print("Decoder input shape:", decoder_input.shape)
        
        for i in range(10):
#             print("Decoder input", decoder_input)
            output = model(src_ids, decoder_input)
#             print(model.tgt_mask)
#             print("output:", output)
#             print("predicted ids:", output.argmax(dim=-1))
            last_pred = output[-1:].argmax(dim=2)
#             decoder_input[i+1][0] = last_pred
#             print("last pred:", TGT_TEXT.vocab.itos[last_pred.cpu().numpy()[0][0]], last_pred.cpu().numpy()[0][0])
            print(TGT_TEXT.vocab.itos[last_pred.cpu().numpy()[0][0]],'', end = '')
            
            decoder_input = torch.cat((decoder_input, last_pred))
#             print("Decoder input", decoder_input)
#             break

translate("if PY3 is true ,")

SRC ids shape: tensor([[  2],
        [ 15],
        [533],
        [ 11],
        [ 53],
        [  6],
        [  3]], device='cuda:0')
if if if = None , name , name , 

In [37]:
np.array([torch.tensor([1.0]),torch.tensor([2.0])])

array([1., 2.], dtype=float32)

Moses Multi-BLEU perl script returns 0.0 for any sentence less than 4 tokens long.
It will be best to use a function by NLTK

In [39]:
get_moses_multi_bleu(["this is a test"], ["this is a for"])

0.0