In [1]:
from models_and_trainers.copy_gen_transformer import CopyGeneratorTransformer
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

import numpy as np
import time
import math
import torch
import torch.nn as nn
import torchtext
from torchtext.data import Field, BucketIterator

from utils.useful_utils import string_split_v3, string_split_v2, string_split_v1, nltk_bleu
from utils.dataset_loaders import SRC_TGT_pairs
import utils.beam_search as beam_search
import tqdm.notebook as tqdm 

from models_and_trainers.trainers import Model_Trainer

from utils.vocab_classes import Shared_Vocab
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device == "cuda":
    torch.cuda.set_device(0) # choose GPU from nvidia-smi 
print("Using:", device)

Using: cuda


In [3]:
src_train_fp = "datasets/django_folds/django.fold1-10.train.src"
tgt_train_fp = "datasets/django_folds/django.fold1-10.train.tgt"
src_test_fp = "datasets/django_folds/django.fold1-10.test.src"
tgt_test_fp = "datasets/django_folds/django.fold1-10.test.tgt"

max_seq_len = 50

train_samples = SRC_TGT_pairs(src_train_fp, tgt_train_fp, max_seq_len=max_seq_len).samples
test_samples = SRC_TGT_pairs(src_test_fp, tgt_test_fp, max_seq_len=max_seq_len).samples

In [4]:
vocab_size = 850

In [5]:
vocab = Shared_Vocab(train_samples, vocab_size, string_split_v3, use_OOVs=True)

In [6]:
def data2dataset(data):
    TEXT_FIELD = Field(sequential=True, use_vocab=False, unk_token=0, init_token=1,eos_token=2, pad_token=3)
    OOV_TEXT_FIELD = Field(sequential=True, use_vocab=False, pad_token=3)

    OOV_stoi = {}
    OOV_itos = {}
    OOV_starter_count = 30000
    OOV_count = OOV_starter_count

    examples = []

    for (src, tgt) in data:
        src_ids, OOV_ids = vocab.encode_input(src)
#         print(src, src_ids)
        tgt_ids = vocab.encode_output(tgt, OOV_ids)
#         OOV_ids = []

#         for OOV in OOVs:
#             try:
#                 idx = OOV_stoi[OOV]
#                 OOV_ids.append(idx)
#             except KeyError as e:
#                 OOV_count += 1
#                 OOV_stoi[OOV] = OOV_count
#                 OOV_itos[OOV_count] = OOV
#                 OOV_ids.append(OOV_count)

        examples.append(torchtext.data.Example.fromdict({"src":src_ids, "tgt":tgt_ids, "OOVs":OOV_ids}, 
                                                        fields={"src":("src",TEXT_FIELD), "tgt":("tgt",TEXT_FIELD), "OOVs":("OOVs", OOV_TEXT_FIELD)}))
    dataset = torchtext.data.Dataset(examples,fields={"src":TEXT_FIELD, "tgt":TEXT_FIELD, "OOVs":OOV_TEXT_FIELD})
    return dataset

In [7]:
train_dataset = data2dataset(train_samples)
test_dataset = data2dataset(test_samples)

In [8]:
batch_size = 32

train_iterator = BucketIterator(
    train_dataset,
    batch_size = batch_size,
    repeat=True,
    shuffle=True,
    sort=True,
    sort_key = lambda x: len(x.src)+len(x.tgt),
    device = device)

test_iterator = BucketIterator(
    test_dataset,
    batch_size = batch_size,
    sort=True,
    sort_key = lambda x: len(x.src)+len(x.tgt),
    device = device)

In [9]:
sample = next(iter(train_dataset))
vocab.decode_input(sample.tgt, sample.OOVs)

'self . _request(COPY) = datastructures(COPY) . MergeDict(COPY) ( self . POST(COPY) , self . GET(COPY) )'

In [10]:
for i, batch in enumerate(train_iterator):
    for idx in range(batch_size):
        encoder_input = batch.src.cpu()[:,idx].tolist()
        decoder_input = batch.tgt.cpu()[:,idx].tolist()
        OOVs = batch.OOVs.cpu()[:,idx].tolist()

        print("encoder_input    :",vocab.decode_input(encoder_input, OOVs))
        print("decoder_input    :",vocab.decode_output(decoder_input, OOVs))
        print()
        break
    break

encoder_input    : <sos> try , <eos> <pad>
decoder_input    : <sos> try : <eos>



In [11]:
output_vocab_size = vocab.size+max_seq_len
model = CopyGeneratorTransformer(vocab_size=output_vocab_size, embed_dim=512, att_heads=8, layers=4, dim_feedforward=1024).to(device)

In [12]:
output_vocab_size

900

In [13]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi['<pad>'])
params = model.parameters()

def train_step(batch):
    total_loss = 0.
    start_time = time.time()
    encoder_input = batch.src
    decoder_input = batch.tgt[:-1]
    targets = batch.tgt[1:]

    optimizer.zero_grad()
    output = model(encoder_input, decoder_input)

    loss = criterion(output.view(-1, output_vocab_size), targets.view(-1))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(params, 0.5)
    optimizer.step()
    elapsed = time.time() - start_time
    return loss

In [14]:
def batch_filter_ids(batch_list):
    SOS_token=vocab.stoi["<sos>"]
    EOS_token=vocab.stoi["<eos>"]
    PAD_token=vocab.stoi["<pad>"]
    return [[id for id in l if id not in [SOS_token,EOS_token,PAD_token]] for l in batch_list]

def evaluate(iterator):
    model.eval() # Turn on the evaluation mode
    with torch.no_grad(), open("BERT_code_generator.out", "w", encoding="utf-8") as out_fp:
        BLEU_scores = []
        pbar = tqdm.tqdm(enumerate(iterator), total=len(iterator))
        for i, batch in pbar:
            batch_size = batch.src.shape[1]
            
            encoder_inputs = batch.src
            predictions = beam_search.beam_search_decode(model,
                              batch_encoder_ids=encoder_inputs,
                              SOS_token=vocab.stoi["<sos>"],
                              EOS_token=vocab.stoi["<eos>"],
                              PAD_token=vocab.stoi["<pad>"],
                              beam_size=1,
                              max_length=max_seq_len,
                              num_out=1)
            
            sources = encoder_inputs.transpose(0,1).cpu().tolist()
            sources = batch_filter_ids(sources)
            
            predictions = [t[0].view(-1).cpu().tolist() for t in predictions]
            predictions = batch_filter_ids(predictions)
            
            targets = batch.tgt.transpose(0,1).cpu().tolist()
            targets = batch_filter_ids(targets)
            
            for j in range(batch_size):
                BLEU = nltk_bleu(targets[j], predictions[j])
                BLEU_scores.append(BLEU)
                OOV_ids = batch.OOVs.cpu()[:,idx].tolist()
                
                out_fp.write("SRC  :" + vocab.decode(sources[j],OOV_ids) + "\n")
                out_fp.write("TGT  :" + vocab.decode(targets[j],OOV_ids) + "\n")
                out_fp.write("PRED :" + vocab.decode(predictions[j],OOV_ids) + "\n")
                out_fp.write("BLEU :" + str(BLEU) + "\n")
                out_fp.write("\n")
            pbar.set_description(f"BLEU:{np.average(BLEU_scores):5.2f}")
        final_BLEU = np.average(BLEU_scores)
        
        out_fp.write("\n\n| EVALUATION | BLEU: {:5.2f} |\n".format(final_BLEU))
        print("| EVALUATION | BLEU: {:5.3f} |".format(final_BLEU))
    return (final_BLEU)

In [15]:
lr = 0.005 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.99)

trainer = Model_Trainer(optimizer, scheduler)

In [None]:
scores = trainer.train(model,train_iterator,train_step,500000,test_iterator=test_iterator,eval_fn=evaluate, eval_interval=10000)

HBox(children=(IntProgress(value=0, max=500000), HTML(value='')))

Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.331 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.399 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.553 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.594 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.646 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.719 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.757 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.783 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.787 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.796 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.806 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.806 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.809 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.809 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.811 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.820 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.818 |
Evaluating model


HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.819 |


## Performance profiling

In [37]:
def func1():
    a = 3
    time.sleep(0.1)
    return a
    
def func2():
    b = "foo"
    c = func1()
    return c

In [38]:
%lprun -f func1 -f func2 func2()

Timer unit: 1e-06 s

Total time: 0.100124 s
File: <ipython-input-37-f0ccff687a43>
Function: func1 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def func1():
     2         1          0.0      0.0      0.0      a = 3
     3         1     100121.0 100121.0    100.0      time.sleep(0.1)
     4         1          3.0      3.0      0.0      return a

Total time: 0.100139 s
File: <ipython-input-37-f0ccff687a43>
Function: func2 at line 6

Line #      Hits         Time  Per Hit   % Time  Line Contents
     6                                           def func2():
     7         1          3.0      3.0      0.0      b = "foo"
     8         1     100135.0 100135.0    100.0      c = func1()
     9         1          1.0      1.0      0.0      return c

In [17]:
%lprun -f model.transformer.forward trainer.train(model,train_iterator,train_step,50) 

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Finished training


Timer unit: 1e-06 s

Total time: 0.529808 s
File: /nfs/phd_by_carlos/notebooks/models_and_trainers/exposed_transformer.py
Function: forward at line 73

Line #      Hits         Time  Per Hit   % Time  Line Contents
    73                                               def forward(self, src, tgt, src_mask=None, tgt_mask=None,
    74                                                           memory_mask=None, src_key_padding_mask=None,
    75                                                           tgt_key_padding_mask=None, memory_key_padding_mask=None):
    76                                                   r"""Take in and process masked source/target sequences.
    77                                           
    78                                                   Args:
    79                                                       src: the sequence to the encoder (required).
    80                                                       tgt: the sequence to the decoder (required).
   

In [56]:
it = iter(train_iterator)

In [71]:
batch = next(it)
batch


[torchtext.data.batch.Batch of size 32]
	[.src]:[torch.cuda.LongTensor of size 28x32 (GPU 0)]
	[.tgt]:[torch.cuda.LongTensor of size 22x32 (GPU 0)]
	[.OOVs]:[torch.cuda.LongTensor of size 2x32 (GPU 0)]

In [52]:
%lprun -f model.forward train_step(batch) 

Timer unit: 1e-06 s

Total time: 0.14072 s
File: /nfs/phd_by_carlos/notebooks/models_and_trainers/copy_gen_transformer.py
Function: forward at line 202

Line #      Hits         Time  Per Hit   % Time  Line Contents
   202                                               def forward(self, src, tgt):
   203         1      49978.0  49978.0     35.5          self.tgt_mask = self._generate_square_subsequent_mask(len(tgt)).to(self.device) if self.masked_look_ahead_att else None
   204                                                   
   205                                           
   206         1        993.0    993.0      0.7          src_emb = self.src_embedder(src) * math.sqrt(self.embedding_size)
   207                                                   
   208         1       1350.0   1350.0      1.0          tgt_emb = self.tgt_embedder(tgt) * math.sqrt(self.embedding_size)
   209                                                   
   210         1      86841.0  86841.0     61.7        

In [24]:
evaluate(test_iterator)

HBox(children=(IntProgress(value=0, max=59), HTML(value='')))


| EVALUATION | BLEU: 0.611 |


0.6105721971123488

In [22]:
vocab.decode([50, 40],[])

'into define'