# Copy Transformer
This architecture allows the transformer model to either generate a new token from a pre-existing vocabulary or copy a word directly from the input.

An important note about this architecture: since words need to be copied from the source to the target language, it is easiest to use a common vocabulary and thus share token IDs.

As a first step, the words common to the input and the output need to be identified. This is challenging because we want to cap the vocabulary size: frequent tokens become part of the vocabulary, while infrequent tokens that appear in both the input and the output are given a copyable token ID that is unique within the sentence.

### Imports

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from queue import PriorityQueue
import numpy as np
import torchtext
import tqdm
from torchnlp.metrics import get_moses_multi_bleu
from torchtext.data import Field, BucketIterator
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

import tensorflow as tf
import tensorflow_datasets as tfds
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
from io import BytesIO

import linecache
import sys
import os
import re
import random
import time
import operator
import collections

from base_transformer import TransformerModel
import beam_search
from IPython.core.debugger import set_trace as tr
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
def super_print(filename):
    """Build a decorator that logs a function's positional arguments to a file.

    Every call to the decorated function first appends one line to
    ``filename`` of the form ``[dd/mm/YYYY HH:MM:SS] arg1 arg2 ...`` and then
    calls the original function with the unchanged arguments.

    Args:
        filename: path of the log file; it is opened in append mode.

    Returns:
        A decorator taking the function to wrap (typically ``print``).
    """
    import functools  # local import so this cell does not rely on the import cell

    def wrap(func):
        """Return ``func`` wrapped so that each call is logged before it runs."""
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapped_func(*args, **kwargs):
            # Honour print()'s ``sep`` keyword so the logged line matches what
            # is printed; anything that is not a string falls back to a space.
            sep = kwargs.get("sep")
            if not isinstance(sep, str):
                sep = " "
            line = sep.join(str(x) for x in args)
            dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            # Explicit encoding: tokens taken from source code may be non-ASCII.
            with open(filename, 'a', encoding="utf-8") as outputfile:
                outputfile.write("[{}] {}\n".format(dt_string, line))
            # now the original function is executed with its arguments as normal
            return func(*args, **kwargs)
        return wrapped_func
    return wrap

# Shadow the built-in print for the rest of the notebook: every print() call is
# also appended, with a timestamp, to logs-copy-gen.txt.
print = super_print('logs-copy-gen.txt')(print)

Setting the device to use: CPU or GPU

In [3]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Test the device *type*: a torch.device object does not compare equal to the
# plain string "cuda", so a direct comparison would never select the GPU.
if device.type == "cuda":
    torch.cuda.set_device(0)  # choose GPU from nvidia-smi
print("Using:", device)

Using: cpu


### Helper functions

In [4]:
text = "create variable student_names with string 'foo bar baz'"

# Split on a quoted string literal (kept whole) or on any single non-word
# character. The capture group makes re.split() return the delimiters as well,
# so punctuation survives as tokens. "_" is a word character, so identifiers
# such as student_names stay in one piece. Raw string: "\W" is not a valid
# escape in a normal string literal.
_TOKEN_SPLIT_RE = re.compile(r"""('.*?'|".*?"|\W)""")


def string_split(s):
    """Tokenize a line of text or code.

    Args:
        s: the string to tokenize.

    Returns:
        A list of tokens: words/identifiers, single punctuation characters and
        intact single- or double-quoted strings. Empty and whitespace-only
        pieces are dropped.
    """
    return [tok for tok in _TOKEN_SPLIT_RE.split(s) if tok and not tok.isspace()]


print(string_split(text))

['create', 'variable', 'student_names', 'with', 'string', "'foo bar baz'"]


In [4]:
def corpus_to_array(src_fp, tgt_fp):
    """Read two parallel text files into a list of line pairs.

    Args:
        src_fp: path of the source-side file.
        tgt_fp: path of the target-side file.

    Returns:
        A list of ``(src_line, tgt_line)`` tuples, one per line number. Lines
        keep their trailing newline, and pairing stops at the end of the
        shorter file.
    """
    with open(src_fp, "r") as src_file, open(tgt_fp, "r") as tgt_file:
        return list(zip(src_file, tgt_file))

In [5]:
def filter_corpus(data, max_seq_length=200, tokenizer=string_split):
    """Drop sentence pairs whose source or target is too long.

    Args:
        data: iterable of ``(src, tgt)`` string pairs.
        max_seq_length: maximum number of tokens allowed on either side.
        tokenizer: callable mapping a string to a list of tokens; it is used
            to measure sequence length.

    Returns:
        A list with the ``(src, tgt)`` pairs where both sides have at most
        ``max_seq_length`` tokens.
    """
    return [
        (src, tgt)
        for src, tgt in data
        # Measure with the tokenizer the caller passed in.
        if len(tokenizer(src)) <= max_seq_length and len(tokenizer(tgt)) <= max_seq_length
    ]

In [6]:
def samples_to_dataset(samples):
    """Wrap (source, target) pairs in a torchtext ``Dataset``.

    Args:
        samples: iterable of ``(src_string, tgt_string)`` pairs.

    Returns:
        A ``torchtext.data.Dataset`` with a ``src`` and a ``tgt`` column that
        share a single ``Field``.

    NOTE(review): the field is created with ``use_vocab=False`` but with string
    ``init_token``/``eos_token`` values; presumably the samples must already be
    numeric for batching to work. Confirm before relying on this helper.
    """
    TEXT_FIELD = Field(sequential=True, use_vocab=False, init_token='<sos>',eos_token='<eos>')
    example_fields = {"src": ("src", TEXT_FIELD), "tgt": ("tgt", TEXT_FIELD)}

    examples = [
        torchtext.data.Example.fromdict({"src": src_string, "tgt": tgt_string},
                                        fields=example_fields)
        for src_string, tgt_string in samples
    ]

    # Both columns use TEXT_FIELD: it is the only field this function creates.
    dataset = torchtext.data.Dataset(examples, fields={"src": TEXT_FIELD, "tgt": TEXT_FIELD})
    return dataset

In [7]:
# Load the parallel description/code corpus and shuffle it in place.
data = corpus_to_array("datasets/all-fixed.desc", "datasets/all.code")
random.shuffle(data)

# Longest source / target in tokens, before any filtering.
print("Max src length:", max(len(string_split(src_line)) for src_line, _ in data))
print("Max tgt length:", max(len(string_split(tgt_line)) for _, tgt_line in data))

print("Full dataset size:", len(data))
# NOTE(review): max_seq_length is set to 200 here, but the filter below is
# called with the literal 50 — confirm which limit is intended.
max_seq_length=200
data = filter_corpus(data, max_seq_length=50, tokenizer=string_split)
print("Limited dataset size:", len(data))

Max src length: 557
Max tgt length: 527
Full dataset size: 18805
Limited dataset size: 18632


## Making a shared vocabulary
The idea of the copy generator network is to give the model a chance to copy words from the input to the output; some of them it might already know, but others might be completely unknown to it.

In [8]:
# The four special tokens always take the first IDs of the shared vocabulary.
stoi = {"<unk>":0, "<sos>":1, "<eos>":2, "<pad>":3}
# Room left for regular tokens so that the whole vocabulary has 1000 entries.
max_vocab = 1000 - len(stoi)

# Collect every token from both sides of the corpus.
all_toks = []
for src, tgt in data:
    all_toks.extend(string_split(src))
    all_toks.extend(string_split(tgt))

# Keep only the most frequent tokens; the rest are handled as OOV/copy tokens.
most_freq = collections.Counter(all_toks).most_common(max_vocab)

for tok, _ in most_freq:
    stoi[tok] = len(stoi)

# Inverse mapping: the token at position i is the token whose ID is i.
itos = sorted(stoi, key=stoi.get)

In [55]:
# Persist both vocabulary mappings (token -> id dict and id -> token list) to disk.
torch.save((stoi, itos), "combined_vocab.vcb")

In [9]:
def encode_input(string, token_to_id=None, tokenizer=None):
    """Encode a source sentence into extended-vocabulary IDs.

    In-vocabulary words map to their vocabulary ID. Each distinct
    out-of-vocabulary (OOV) word gets the ID ``len(token_to_id) + k``, where
    ``k`` is its position in the returned OOV list. A repeated OOV word reuses
    the same ID, which keeps the source consistent with ``encode_output``
    (it looks a word up by its first position in the OOV list).

    Args:
        string: the sentence to encode.
        token_to_id: token -> ID mapping; defaults to the notebook-level
            ``stoi``.
        tokenizer: callable mapping a string to a list of tokens; defaults to
            the notebook-level ``string_split``.

    Returns:
        ``(IDs, OOVs)``: the list of token IDs and the list of distinct OOV
        words in order of first appearance.
    """
    if token_to_id is None:
        token_to_id = stoi
    if tokenizer is None:
        tokenizer = string_split

    OOVs = []
    oov_position = {}  # OOV word -> index in OOVs, for O(1) duplicate lookup
    IDs = []
    for word in tokenizer(string):
        if word in token_to_id:
            IDs.append(token_to_id[word])
            continue
        # word is OOV: register it once, then reuse its slot
        if word not in oov_position:
            oov_position[word] = len(OOVs)
            OOVs.append(word)
        IDs.append(len(token_to_id) + oov_position[word])
    return IDs, OOVs

In [10]:
# Sanity check on the sample sentence: words missing from the vocabulary get
# IDs starting at len(stoi) and are returned in the second list.
encode_input(text)

([646, 1000, 1001, 12, 29, 1002],
 ['variable', 'student_names', "'foo bar baz'"])

In [11]:
def encode_output(string, OOVs, token_to_id=None, tokenizer=None):
    """Encode a target sentence using the OOV list of its source sentence.

    Args:
        string: the target sentence to encode.
        OOVs: list of OOV words produced by ``encode_input`` for the matching
            source sentence.
        token_to_id: token -> ID mapping; defaults to the notebook-level
            ``stoi``.
        tokenizer: callable mapping a string to a list of tokens; defaults to
            the notebook-level ``string_split``.

    Returns:
        A list of IDs. In-vocabulary words use their vocabulary ID, words found
        in ``OOVs`` use ``len(token_to_id) + OOVs.index(word)`` (a copyable
        ID), and every other word maps to the ``<unk>`` ID.
    """
    if token_to_id is None:
        token_to_id = stoi
    if tokenizer is None:
        tokenizer = string_split

    IDs = []
    for word in tokenizer(string):
        if word in token_to_id:
            IDs.append(token_to_id[word])
        elif word in OOVs:
            # OOV word that also occurs in the source: it can be copied
            IDs.append(len(token_to_id) + OOVs.index(word))
        else:
            # unknown word that cannot be copied from the source
            IDs.append(token_to_id["<unk>"])
    return IDs

In [12]:
# Sanity check: encoding the sample sentence as a target, with the OOV list of
# the same sentence, maps each OOV word to its copyable ID.
encode_output(text,['variable', 'student_names', "'foo bar baz'"])

[646, 1000, 1001, 12, 29, 1002]

In [30]:
def decode(ids, OOVs, id_to_token=None):
    """Turn a sequence of extended-vocabulary IDs back into text.

    Args:
        ids: iterable of token IDs.
        OOVs: OOV words of the sentence; ID ``len(id_to_token) + k`` decodes to
            ``OOVs[k]`` followed by the marker ``(COPY)``.
        id_to_token: list mapping ID -> token; defaults to the notebook-level
            ``itos``. It is not modified.

    Returns:
        The decoded tokens joined by single spaces. IDs beyond the vocabulary
        plus this sentence's OOV list are silently skipped.
    """
    if id_to_token is None:
        id_to_token = itos

    # Per-sentence lookup table: the shared vocabulary followed by the OOVs.
    extended_itos = list(id_to_token) + [OOV + "(COPY)" for OOV in OOVs]
    limit = len(extended_itos)
    return " ".join(extended_itos[token_id] for token_id in ids if token_id < limit)

In [14]:
# Sanity check: IDs at or above len(itos) decode to the matching OOV word
# followed by the "(COPY)" marker.
decode([1,639, 1000, 1001, 12, 29, 1002,2], ['variable', 'student_names', "'foo bar baz'"])

"<sos> validate_key variable(COPY) student_names(COPY) with string 'foo bar baz'(COPY) <eos>"

In [15]:
# Fields for already-numericalised sequences (use_vocab=False): TEXT_FIELD adds
# the <sos>/<eos> IDs and pads with 3; OOV_TEXT_FIELD only pads with 3.
TEXT_FIELD = Field(sequential=True, use_vocab=False, unk_token=0, init_token=1,eos_token=2, pad_token=3)
OOV_TEXT_FIELD = Field(sequential=True, use_vocab=False, pad_token=3)

# Corpus-wide numbering of OOV words, so that each example can carry its OOV
# list as integers. The first ID handed out is OOV_starter_count + 1.
OOV_stoi = {}
OOV_itos = {}
OOV_starter_count = 30000
OOV_count = OOV_starter_count

# Column name -> (attribute name, field); identical for every example.
example_fields = {"src":("src",TEXT_FIELD), "tgt":("tgt",TEXT_FIELD), "OOVs":("OOVs", OOV_TEXT_FIELD)}

examples = []

for src, tgt in data:
    src_ids, OOVs = encode_input(src)
    tgt_ids = encode_output(tgt, OOVs)

    OOV_ids = []
    for OOV in OOVs:
        if OOV not in OOV_stoi:
            # first time this word is seen anywhere: give it a fresh global ID
            OOV_count += 1
            OOV_stoi[OOV] = OOV_count
            OOV_itos[OOV_count] = OOV
        OOV_ids.append(OOV_stoi[OOV])

    example = torchtext.data.Example.fromdict(
        {"src":src_ids, "tgt":tgt_ids, "OOVs":OOV_ids},
        fields=example_fields)
    examples.append(example)

In [16]:
# Wrap the encoded examples in a torchtext Dataset with the three columns
# built above (src, tgt, OOVs).
dataset = torchtext.data.Dataset(examples,fields={"src":TEXT_FIELD, "tgt":TEXT_FIELD, "OOVs":OOV_TEXT_FIELD})
# Split ratios 0.9 / 0.1 — presumably train / validation in that order, as the
# unpacking suggests.
train_dataset, val_dataset = dataset.split([0.9,0.1])

In [17]:
batch_size = 16

# Endless (repeat=True), shuffled training iterator; examples are bucketed by
# the combined source + target length.
train_iterator = BucketIterator(
    train_dataset,
    batch_size=batch_size,
    repeat=True,
    shuffle=True,
    sort_key=lambda example: len(example.src) + len(example.tgt),
    device=device,
)

# The iterator generates batches with padded length for sequences with similar sizes, a batch is [seq_length, batch_size]

# Decode one sample from the first batch as a sanity check, then stop.
for batch in train_iterator:
    idx = 5  # column (sample) of the batch to show

    # Drop the padding (3 is the <pad> token) and map the global OOV IDs back to words.
    oov_column = batch.OOVs.cpu()[:,idx].tolist()
    OOVs = [OOV_itos[OOV] for OOV in oov_column if OOV != 3]

    # Cut each sequence right after its first <eos> (ID 2).
    src_ids = batch.src.cpu()[:,idx].tolist()
    src_ids = src_ids[:src_ids.index(2)+1]
    tgt_ids = batch.tgt.cpu()[:,idx].tolist()
    tgt_ids = tgt_ids[:tgt_ids.index(2)+1]

    print("SOURCE:",decode(src_ids, OOVs))
    print()
    print("TARGET:",decode(tgt_ids, OOVs))
    break

SOURCE: <sos> if use_l10n is true , or use_l10n is None and settings . USE_L10N(COPY) is true , <eos>

TARGET: <sos> if use_l10n or ( use_l10n is None and settings . USE_L10N(COPY) ) : <eos>


## Building the model
### A simple model only relying on the input
This model is a simple version of the copy generator network: it is not able to copy tokens based on the attention given to the input sequence. Instead, OOV tokens are treated as generic placeholder tokens to be substituted later. The advantage is that this model can be the same as the previous version, with a hard cap on the number of possible OOV tokens.

In [18]:
# Shared vocabulary plus 50 extra slots for per-sentence copyable (OOV) tokens:
# the cap on the number of copyable tokens is set to the max sequence length.
vocab_size = len(itos) + 50 # set a maximum cap on the number of copyable tokens to the max sequence length

# Source and target use the same (extended) vocabulary size.
model = TransformerModel(vocab_size, vocab_size, dropout=0.2).to(device) 

In [19]:
def nltk_bleu(refrence, prediction):
    """Sentence-level BLEU between one reference and one prediction.

    Implementation from ReCode. The Moses multi-bleu script sets BLEU to 0.0
    for references shorter than 4 tokens; here the n-gram order is instead
    reduced to the reference length, with a weight of 0.25 per order.

    Args:
        refrence: list of reference tokens (or token IDs).
        prediction: list of predicted tokens (or token IDs).

    Returns:
        The smoothed (NLTK method3) sentence BLEU score, or 0.0 when the
        reference is empty.
    """
    if len(refrence) == 0:
        # No n-gram order can be scored, and the weight list would be empty;
        # presumably not every NLTK version accepts that, so report no overlap.
        return 0.0
    ngram_weights = [0.25] * min(4, len(refrence))
    return sentence_bleu([refrence], prediction, weights=ngram_weights,
                         smoothing_function=SmoothingFunction().method3)

In [28]:
# Validation iterator: larger batches than training, bucketed by the combined
# source + target length of each example.
valid_iterator = BucketIterator(val_dataset,
    batch_size = 128,
    sort_key = lambda x: len(x.src)+len(x.tgt),
    device = device)

def batch_filter_ids(batch_list):
    """Strip the special-token IDs (0, 1, 2 and 3) from every sequence.

    Args:
        batch_list: list of ID sequences.

    Returns:
        A new list of lists, in the same order, without the special IDs.
    """
    special_ids = (0, 1, 2, 3)
    filtered = []
    for sequence in batch_list:
        filtered.append([token for token in sequence if token not in special_ids])
    return filtered

def evaluate(beam_size=1, max_length=20):
    """Decode the validation set with beam search and report the mean BLEU.

    For every validation sample the decoded source, target and prediction and
    the sentence BLEU are written to ``out.txt`` (overwritten on each call).
    The average BLEU is appended to that file and printed.

    Args:
        beam_size: beam width passed to ``beam_search.beam_search_decode``.
        max_length: ``max_length`` passed to ``beam_search.beam_search_decode``.
    """
    model.eval() # Turn on the evaluation mode
    num_batches = len(valid_iterator)
    # Report progress about three times per run; max() keeps the modulus
    # non-zero when there are fewer than three batches.
    report_every = max(1, num_batches // 3)

    with torch.no_grad(), open("out.txt", "w") as out_fp:
        BLEU_scores = []
        for i, batch in enumerate(valid_iterator):
            # batches are [seq_length, batch_size]
            batch_size = batch.src.shape[1]

            encoder_inputs = batch.src
            predictions = beam_search.beam_search_decode(model,
                              batch_encoder_ids=encoder_inputs,
                              SOS_token=stoi["<sos>"],
                              EOS_token=stoi["<eos>"],
                              PAD_token=stoi["<pad>"],
                              beam_size=beam_size,
                              max_length=max_length,
                              num_out=1)

            # One ID list per sample, with the special tokens removed.
            sources = batch_filter_ids(encoder_inputs.transpose(0,1).cpu().tolist())
            # Keep only the first returned hypothesis of each sample.
            predictions = batch_filter_ids([t[0].view(-1).cpu().tolist() for t in predictions])
            targets = batch_filter_ids(batch.tgt.transpose(0,1).cpu().tolist())

            # Per-sample OOV words; move the tensor to the CPU once per batch.
            oov_columns = batch.OOVs.cpu()
            OOVss = [[OOV_itos[OOV] for OOV in oov_columns[:,idx].tolist() if OOV != 3]  # 3 is the <pad> token
                     for idx in range(batch_size)]

            if i % report_every == 0:
                print("| EVALUATION | {:5d}/{:5d} batches |".format(i, num_batches))

            for j in range(batch_size):
                BLEU = nltk_bleu(targets[j], predictions[j])
                BLEU_scores.append(BLEU)

                out_fp.write("SRC  :" + decode(sources[j], OOVss[j]) + "\n")
                out_fp.write("TGT  :" + decode(targets[j], OOVss[j]) + "\n")
                out_fp.write("PRED :" + decode(predictions[j], OOVss[j]) + "\n")
                out_fp.write("BLEU :" + str(BLEU) + "\n")
                out_fp.write("\n")

        mean_bleu = np.average(BLEU_scores)
        out_fp.write("\n\n| EVALUATION | BLEU: {:5.2f} |\n".format(mean_bleu))
        print("| EVALUATION | BLEU: {:5.2f} |".format(mean_bleu))

In [38]:
def train_step(batch):
    """Run one optimisation step on ``batch`` and return its loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``criterion``.

    Args:
        batch: a batch with ``src`` and ``tgt`` ID tensors (the iterator
            produces them as [seq_length, batch_size]).

    Returns:
        The loss tensor of this step.
    """
    model.train() # Turn on the train mode
    encoder_input = batch.src
    # The decoder is fed the target without its last position and is scored
    # against the target shifted one position to the left.
    decoder_input = batch.tgt[:-1]
    targets = batch.tgt[1:]

    optimizer.zero_grad()
    output = model(encoder_input, decoder_input)

    # Flatten to (positions, vocab) using the model's own output width rather
    # than re-deriving the vocabulary size here.
    loss = criterion(output.view(-1, output.size(-1)), targets.view(-1))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    return loss

In [39]:
# Positions holding the <pad> ID do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=stoi['<pad>'])
lr = 0.005 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# NOTE(review): step_size is passed as 1.0 and gamma as 0.99, so each
# scheduler.step() call presumably lowers the learning rate by 1% — confirm
# against the StepLR documentation.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.99)

In [43]:
def train(steps=10000, log_interval=200, learning_interval=4000, eval_interval=1000):
    """Train the model for ``steps`` batches drawn from ``train_iterator``.

    Args:
        steps: number of training steps (batches) to run.
        log_interval: print the mean loss and perplexity every this many steps.
        learning_interval: call ``scheduler.step()`` every this many steps.
        eval_interval: run ``evaluate()`` every this many steps.
    """
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    for step, batch in enumerate(train_iterator, start=1):
        loss = train_step(batch)
        total_loss += loss.item()

        if step % log_interval == 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # Read the learning rate actually in use from the optimizer.
            current_lr = optimizer.param_groups[0]["lr"]
            print('| {:5d}/{:5d} steps | '
                  'lr {:02.4f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    step, steps, current_lr,
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

        if step % eval_interval == 0:
            print("Evaluating model")
            evaluate()
            model.train()  # evaluate() switches the model to eval mode

        if step % learning_interval == 0:
            scheduler.step()

        # Stop only after step number `steps` has been fully processed, so the
        # final step is trained, logged and evaluated like any other.
        if step >= steps:
            print("Finished training")
            return

# Long training run that logs every 50 steps. eval_interval is set to the same
# value as steps, so no evaluation happens during the bulk of the run; the
# captured output below shows it was interrupted manually.
train(steps=1000000,eval_interval=1000000,log_interval=50)

|    50/1000000 steps | lr 0.0004 | ms/batch 33.48 | loss  0.59 | ppl     1.80
|   100/1000000 steps | lr 0.0004 | ms/batch 32.49 | loss  0.60 | ppl     1.83
|   150/1000000 steps | lr 0.0004 | ms/batch 31.89 | loss  0.63 | ppl     1.88
|   200/1000000 steps | lr 0.0004 | ms/batch 31.95 | loss  0.64 | ppl     1.89


KeyboardInterrupt: 

In [54]:
# Checkpoint the model, optimizer and scheduler objects together in one file.
# NOTE(review): this saves the whole objects rather than their state_dict()s,
# so loading presumably needs the same class definitions to be importable.
torch.save((model, optimizer, scheduler), "./saved_copy_72BLEU_model.pytorch")

## Evaluation and testing

In [48]:
# Score the validation set with a beam width of 5 (the in-training default is 1).
evaluate(beam_size=5)

| EVALUATION |     0/   15 batches |


KeyboardInterrupt: 

In [50]:
# Encode a single hand-written source/target pair for manual testing.
sent = "try,"
targ = "try:"

# The target is encoded against the OOV list of its own source sentence.
src_ids, OOVs = encode_input(sent)
tgt_ids = encode_output(targ,OOVs)



([57, 5], [])