In [8]:
%load_ext autoreload
%autoreload 2
from utils import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### The original Multi30k (German, English) download links no longer work, so replace them with a working mirror

In [9]:
from torchtext.datasets import Multi30k
from functools import partial
from torchtext.datasets import multi30k

# Redirect each Multi30k split to a mirrored archive, overriding the
# download locations registered in torchtext's multi30k module.
multi30k.URL.update(
    train="https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz",
    valid="https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz",
    test="https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt16_task1_test.tar.gz",
)

# Checksums that go with the mirrored archives.
# NOTE: the table is called MD5, but every digest below is 64 hex characters long.
multi30k.MD5.update(
    train="20140d013d05dd9a72dfde46478663ba05737ce983f478f960c1123c6671be5e",
    valid="a7aa20e9ebd5ba5adce7909498b94410996040857154dab029851af3a866da8c",
    test="6d1ca1dba99e2c5dd54cae1226ff11c2551e6ce63527ebb072a1f70f72a5cd36",
)

# Load HuggingFace Tokenizers

In [10]:
# These (tokenizers and the model) all have global scope
from transformers import  AutoTokenizer,GPT2Tokenizer
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The same four special tokens are registered on both tokenizers.
_SPECIAL_TOKENS = {'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token' : '[EOS]', 'unk_token':'[UNK]'}


def _first_id(tokenizer, token):
    """Tokenize `token` with `tokenizer` and return the first id it produces."""
    return tokenizer(token)['input_ids'][0]


# German (source) tokenizer.
tknzr_de = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
tknzr_de.add_special_tokens(dict(_SPECIAL_TOKENS))

# English (target) tokenizer.
tknzr_en = GPT2Tokenizer.from_pretrained('gpt2')
tknzr_en.add_special_tokens(dict(_SPECIAL_TOKENS))

# Padding ids as plain Python ints.
padSrc = _first_id(tknzr_de, tknzr_de.pad_token)
padTgt = _first_id(tknzr_en, tknzr_en.pad_token)

# Tokenizer sizes, measured after the special tokens were added.
lenSrc = len(tknzr_de)
lenTgt = len(tknzr_en)

# Special-token ids as tensors placed on `device` -- German side.
pad_IdDe = torch.tensor(padSrc, device=device)
bs_idDe = torch.tensor(_first_id(tknzr_de, tknzr_de.bos_token), device=device)
eos_idDe = torch.tensor(_first_id(tknzr_de, tknzr_de.eos_token), device=device)

# Special-token ids as tensors placed on `device` -- English side.
pad_IdEn = torch.tensor(padTgt, device=device)
bs_idEn = torch.tensor(_first_id(tknzr_en, tknzr_en.bos_token), device=device)
eos_idEn = torch.tensor(_first_id(tknzr_en, tknzr_en.eos_token), device=device)

## Load Transformer model trained with HuggingFace tokenization

In [11]:
# Build the HuggingFace-tokenized model; lenSrc / lenTgt are the tokenizer
# sizes measured after the special tokens were added.
modelHF = make_model(lenSrc, lenTgt, N=6, HF = 1)
# This notebook only decodes with the model, so leave training mode before
# using it: eval() switches off train-only behaviour such as dropout.
# Assumes make_model returns a torch.nn.Module (it exposes load_state_dict).
modelHF.eval()
print("Loading trained Huggingface model")
# NOTE(review): torch.load unpickles the checkpoint -- only load files you trust.
# Kept as the last expression so the cell still displays the key-matching report.
modelHF.load_state_dict(
    torch.load("multi30k_model_final_HF.pt", map_location=torch.device("cpu"))
)

Loading trained Huggingface model


<All keys matched successfully>

# Load spaCy tokenizers and vocabulary

In [12]:
# Master switch for the example helpers: set to False to skip every example.
RUN_EXAMPLES = True


def show_example(fn, args=()):
    """Call ``fn(*args)`` and return its result when examples are enabled.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. Defaults to an
            empty tuple (immutable, so the default can never be shared
            and mutated between calls).

    Returns:
        Whatever ``fn(*args)`` returns when this module runs as
        ``__main__`` and ``RUN_EXAMPLES`` is truthy; otherwise ``None``
        and ``fn`` is not called.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None

def execute_example(fn, args=()):
    """Call ``fn(*args)`` for its side effects when examples are enabled.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. Defaults to an
            empty tuple (immutable, so the default can never be shared
            and mutated between calls).

    Returns:
        Always ``None``; the result of ``fn`` is discarded. ``fn`` is only
        called when this module runs as ``__main__`` and ``RUN_EXAMPLES``
        is truthy.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)

# global variables used later in the script
# NOTE(review): show_example returns None when RUN_EXAMPLES is falsy or when
# this code is not running as __main__; unpacking None into two names raises
# TypeError, so these two lines rely on the examples being enabled.
# The second call depends on the tokenizers produced by the first.
spacy_de, spacy_en = show_example(load_tokenizers)
vocab_src, vocab_tgt = show_example(load_vocab, args=[spacy_de, spacy_en])

Finished.
Vocabulary sizes:
8316
6384


In [13]:
def check_outputs(
    valid_dataloader,
    model,
    vocab_src,
    vocab_tgt, #
    n_examples=15,
    pad_idx=2,
    eos_string="</s>", 
    max_len=72,
    start_symbol=0,
    return_hf=False,
):
    """Greedy-decode validation examples with both models and print them.

    For every example the Spacy-vocabulary ``model`` and the module-level
    HuggingFace model ``modelHF`` each decode the same batch; source, ground
    truth and model output are printed for both.

    Args:
        valid_dataloader: Iterable of batches. Each batch is indexed as
            ``b[0], b[1]`` (source/target for the Spacy model) and
            ``b[2], b[3]`` (source/target token ids for the HF model), so the
            collate function must return all four.
        model: Model decoded with the Spacy vocabularies.
        vocab_src: Source vocabulary; only ``get_itos()`` is used.
        vocab_tgt: Target vocabulary; only ``get_itos()`` is used.
        n_examples: Maximum number of examples to decode.
        pad_idx: Padding index of the Spacy vocabularies.
        eos_string: End-of-sentence marker at which the Spacy output is cut.
        max_len: Maximum decode length passed to ``greedy_decode`` for both
            models (previously the hard-coded value 72).
        start_symbol: Start symbol passed to ``greedy_decode`` for the Spacy
            model (previously the hard-coded value 0).
        return_hf: When True, return ``(results, resultsHF)`` instead of only
            ``results``.

    Returns:
        A list of ``(batch, src_tokens, tgt_tokens, model_out, model_txt)``
        tuples for the Spacy model, one per decoded example. The list is
        shorter than ``n_examples`` if the dataloader runs out of batches.
        With ``return_hf=True`` a pair of such lists (Spacy, HF) is returned.

    Side effects:
        Prints the comparison and switches ``model`` and ``modelHF`` to eval
        mode.
    """
    # Build the id -> token tables once instead of once per token.
    itos_src = vocab_src.get_itos()
    itos_tgt = vocab_tgt.get_itos()
    # Special-token ids of the HF tokenizers are loop-invariant as well.
    padSrcHF = tknzr_de(tknzr_de.pad_token)['input_ids'][0]
    padTgtHF = tknzr_en(tknzr_en.pad_token)['input_ids'][0]
    bosTgtHF = tknzr_en(tknzr_en.bos_token)['input_ids'][0]

    # Inference only: turn off train-only behaviour such as dropout.
    model.eval()
    modelHF.eval()

    results = []
    resultsHF = []
    with torch.no_grad():  # no gradients are needed for decoding
        # Walk the dataloader with ONE iterator so every example is a
        # different batch; calling next(iter(...)) per example restarts the
        # loader each time and can yield the same batch repeatedly.
        # range() comes first so zip stops without pulling an extra batch.
        for idx, b in zip(range(n_examples), valid_dataloader):
            print("\nExample %d ========\n" % idx)
            ###
            # SPACY
            ###
            # b[0], b[1] feed the Spacy model; b[2], b[3] are the source and
            # target token ids for the HF model.
            rb = Batch(b[0], b[1], pad_idx)

            src_tokens = [itos_src[x] for x in rb.src[0] if x != pad_idx]
            tgt_tokens = [itos_tgt[x] for x in rb.tgt[0] if x != pad_idx]

            print(
                "Source Spacy Text (Input)        : "
                + " ".join(src_tokens).replace("\n", "")
            )
            print(
                "Target Spacy Text (Ground Truth) : "
                + " ".join(tgt_tokens).replace("\n", "")
            )
            model_out = greedy_decode(
                model, rb.src, rb.src_mask, max_len, start_symbol
            )[0]
            # Keep everything up to the first end-of-sentence marker.
            model_txt = (
                " ".join(
                    [itos_tgt[x] for x in model_out if x != pad_idx]
                ).split(eos_string, 1)[0]
                + eos_string
            )
            print("Model Spacy Output               : " + model_txt.replace("\n", ""))
            results.append((rb, src_tokens, tgt_tokens, model_out, model_txt))
            print("========\n")
            ###
            # HUGGING FACE
            ###
            rb_hf = BatchHF(b[2], b[3], padSrcHF, padTgtHF)
            src_tokens_hf = [
                tknzr_de.decode(x) for x in rb_hf.src[0] if x != padSrcHF
            ]
            tgt_tokens_hf = [
                tknzr_en.decode(x) for x in rb_hf.tgt[0] if x != padTgtHF
            ]
            print(
                "Source HF Text  (Input)        : "
                + " ".join(src_tokens_hf).replace("\n", "")
            )
            print(
                "Target HF Text  (Ground Truth) : "
                + " ".join(tgt_tokens_hf).replace("\n", "")
            )
            model_out_hf = greedy_decode(
                modelHF, rb_hf.src, rb_hf.src_mask, max_len, start_symbol=bosTgtHF
            )[0]
            model_txt_hf = (
                " ".join(
                    [tknzr_en.decode(x) for x in model_out_hf if x != padTgtHF]
                ).split(tknzr_en.eos_token, 1)[0]
                + tknzr_en.eos_token
            )
            print("Model HF Output: " + model_txt_hf.replace("\n", ""))

            resultsHF.append(
                (rb_hf, src_tokens_hf, tgt_tokens_hf, model_out_hf, model_txt_hf)
            )

    if return_hf:
        return results, resultsHF
    return results


def run_model_example(n_examples=25):
    """Load the trained Spacy model and compare its output with the HF model.

    Reads the module-level ``vocab_src``, ``vocab_tgt``, ``spacy_de``,
    ``spacy_en``, ``tknzr_de`` and ``tknzr_en``; none of them is reassigned
    here, so no ``global`` declaration is needed.

    Args:
        n_examples: Number of validation examples passed on to
            ``check_outputs``.

    Returns:
        ``(model, example_data)``: the loaded Spacy model and the value
        returned by ``check_outputs``.
    """
    print("Preparing Data ...")
    # NOTE(review): the result is used directly as the validation dataloader;
    # confirm create_dataloaders returns a single loader rather than a pair.
    valid_dataloader = create_dataloaders(
        torch.device("cpu"),
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        batch_size=1,
        is_distributed=False,
        tknzr_de=tknzr_de,
        tknzr_en=tknzr_en
    )

    print("Loading Trained Spacy Model ...")
    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    # NOTE(review): torch.load unpickles the checkpoint -- only load files you trust.
    model.load_state_dict(
        torch.load("multi30k_model_final_Spacy.pt", map_location=torch.device("cpu"))
    )
    # The model is only used for decoding below: leave training mode so
    # train-only behaviour such as dropout does not perturb the output.
    model.eval()
    print("Comparing Model Outputs:")
    example_data = check_outputs(
        valid_dataloader, model, vocab_src, vocab_tgt, n_examples=n_examples
    )
    return model, example_data

# Run the comparison with run_model_example's default arguments; execute_example
# discards the returned (model, example_data) pair, so only the printed output remains.
execute_example(run_model_example)

Preparing Data ...
Loading Trained Spacy Model ...
Comparing Model Outputs:


Source Spacy Text (Input)        : <s> Ein Polizist hält ein Fahrzeug am Straßenrand an . </s>
Target Spacy Text (Ground Truth) : <s> A policeman stopping a vehicle on the side of the road . </s>
Model Spacy Output               : <s> A policeman holds a vehicle by the side of the road . </s>
Source Text HF (Input)        : [BOS] Ein  Polizist  hält  ein  Fahrzeug  am  Straßen rand  an . [EOS]
Target Text HF (Ground Truth) : [BOS] A  policeman  stopping  a  vehicle  on  the  side  of  the  road . [EOS]
Model HF Output               : [BOS] A  policeman  is  holding  a  vehicle  along  a  side  of  the  road . [EOS]


Source Spacy Text (Input)        : <s> Ein kleines barfüßiges Mädchen mit einem Helm geht über einen mit <unk> bedeckten umgefallenen Baum . </s>
Target Spacy Text (Ground Truth) : <s> A young barefoot girl wearing a helmet walks down a mossy covered fallen down tree . </s>
Model Spacy Output    