In [3]:
%env HF_DATASETS_CACHE /Volumes/Workspace/.cache/huggingface/datasets

from datasets import concatenate_datasets, load_dataset, load_from_disk, DatasetDict
from transformers import (AutoConfig, AutoTokenizer, DataCollatorForSeq2Seq,
                          EncoderDecoderConfig, EncoderDecoderModel,
                          HfArgumentParser, PfeifferConfig, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments, default_data_collator,
                          set_seed)
import random


env: HF_DATASETS_CACHE=/Volumes/Workspace/.cache/huggingface/datasets


In [4]:
# IWSLT de-en corpus, built by the local loader script from the files in data_dir.
iwslt_dataset = load_dataset(
    "../dataset/iwslt14/iwslt_loader.py",
    "de-en",
    data_dir="../dataset/iwslt14",
)

# One tokenizer per side: enc_tokenizer is applied to the German source text,
# dec_tokenizer to the English target text.
enc_tokenizer = AutoTokenizer.from_pretrained("bert-base-german-dbmdz-uncased")
dec_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Using custom data configuration de-en-data_dir=..%2Fdataset%2Fiwslt14
Reusing dataset iwslt217 (/Volumes/Workspace/.cache/huggingface/datasets/iwslt217/de-en-data_dir=..%2Fdataset%2Fiwslt14/1.0.0/dfd22eb448d0ab29dc9f514ee237642604aa514fb962fd35c4da74a9f7e13bac)
100%|██████████| 3/3 [00:00<00:00, 631.04it/s]
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 9.26kB/s]
Downloading: 100%|██████████| 242k/242k [00:00<00:00, 616kB/s] 
Downloading: 100%|██████████| 475k/475k [00:00<00:00, 964kB/s] 
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 14.9kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 569kB/s] 
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 920kB/s] 


In [5]:

def preprocess_function(examples):
    """Tokenize one batch of de->en translation pairs.

    Args:
        examples: Batch dict whose "translation" entry is a list of records
            with a "de" (source) and an "en" (target) string.

    Returns:
        The output of ``enc_tokenizer`` on the German sentences, extended with
        "decoder_input_ids" and "decoder_attention_mask" (the ``dec_tokenizer``
        output for the English sentences) and "labels" (the same target ids
        with every pad token replaced by -100).
    """
    pairs = examples["translation"]
    german = [pair["de"] for pair in pairs]
    english = [pair["en"] for pair in pairs]

    # The source prefix is empty, so the sentences are passed through unchanged.
    source_prefix = ""
    german = [source_prefix + sentence for sentence in german]

    model_inputs = enc_tokenizer(
        german, max_length=512, padding=True, truncation=True
    )
    target_encoding = dec_tokenizer(
        english, max_length=128, padding=True, truncation=True
    )

    # The decoder inputs keep the real pad ids; only the labels are masked below.
    target_ids = target_encoding["input_ids"]
    model_inputs["decoder_input_ids"] = target_ids
    model_inputs["decoder_attention_mask"] = target_encoding["attention_mask"]

    # -100 marks the padded positions that should be ignored in the loss.
    pad_id = dec_tokenizer.pad_token_id
    masked_labels = []
    for sequence in target_ids:
        masked_labels.append([-100 if token == pad_id else token for token in sequence])

    model_inputs["labels"] = masked_labels
    return model_inputs

# Tokenize the test split; the raw columns are dropped so that only the
# fields produced by preprocess_function remain.
column_names = iwslt_dataset["test"].column_names
predict_dataset = iwslt_dataset["test"].map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns=column_names,
    desc="Running tokenizer on prediction dataset",
)

# Return the model fields as torch tensors when the dataset is indexed.
tensor_columns = [
    "input_ids",
    "attention_mask",
    "decoder_input_ids",
    "decoder_attention_mask",
    "labels",
]
predict_dataset.set_format(type="torch", columns=tensor_columns)

Running tokenizer on prediction dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





Running tokenizer on prediction dataset #1: 100%|██████████| 1/1 [00:01<00:00,  1.76s/ba]


Running tokenizer on prediction dataset #3: 100%|██████████| 1/1 [00:01<00:00,  1.85s/ba]





Running tokenizer on prediction dataset #6: 100%|██████████| 1/1 [00:01<00:00,  1.83s/ba]

Running tokenizer on prediction dataset #2: 100%|██████████| 1/1 [00:01<00:00,  1.93s/ba]




Running tokenizer on prediction dataset #5: 100%|██████████| 1/1 [00:01<00:00,  1.92s/ba]






Running tokenizer on prediction dataset #7: 100%|██████████| 1/1 [00:01<00:00,  1.92s/ba]



Running tokenizer on prediction dataset #4: 100%|██████████| 1/1 [00:02<00:00,  2.11s/ba]
Running tokenizer on prediction dataset #0: 100%|██████████| 1/1 [00:02<00:00,  2.23s/ba]


In [9]:
# Sanity check: decode a few tokenized test inputs back into German text.
# Slicing the dataset first avoids decoding and dumping the whole test split
# into the cell output.
enc_tokenizer.batch_decode(
    predict_dataset[:3]["input_ids"],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

['wissen sie, eines der großen vernugen beim reisen und eine der freuden bei der ethnographischen forschung ist, gemeinsam mit den menschen zu leben, die sich noch an die alten tage erinnern konnen. die ihre vergangenheit noch immer im wind spuren, sie auf vom regen geglatteten steinen beruhren, sie in den bitteren blattern der pflanzen schmecken.',
 'einfach das wissen, dass jaguar - schamanen noch immer jenseits der milchstraße reisen oder die bedeutung der mythen der altesten der inuit noch voller bedeutung sind, oder dass im himalaya die buddhisten noch immer den atem des dharma verfolgen, bedeutet, sich die zentrale offenbarung der anthropologie ins gedachtnis zu rufen, das ist der gedanke, dass die welt, in der wir leben, nicht in einem absoluten sinn existiert, sondern nur als ein modell der realitat, als eine folge einer gruppe von bestimmten moglichkeiten der anpassung die unsere ahnen, wenngleich erfolgreich, vor vielen generationen wahlten.',
 'und naturlich teilen wir alle 

In [20]:
# Reduce the validation split to a single "text" column holding the English side.
val_ds = iwslt_dataset["validation"].map(
    lambda batch: {"text": [pair["en"] for pair in batch["translation"]]},
    batched=True,
    remove_columns=["translation"],
)
val_ds

100%|██████████| 8/8 [00:00<00:00, 107.33ba/s]


Dataset({
    features: ['text'],
    num_rows: 7283
})

In [23]:
# Small WMT19 de-en sample: the split spec "train[:100]" requests only the first
# 100 training rows, and passing `split` makes this a single split rather than
# a dict of splits.
wmt_dataset = load_dataset("wmt19", "de-en", split="train[:100]")

Reusing dataset wmt19 (/Volumes/Workspace/.cache/huggingface/datasets/wmt19/de-en/1.0.0/fae232cf0c13b62b26731bafce0810bc652fb5799189790bed836db0cee28056)


In [25]:
# Append the WMT sample to the IWSLT training split.
concat_ds = concatenate_datasets([iwslt_dataset["train"], wmt_dataset])

# Rebuild every translation record with exactly the keys "en" and "de".
normalized_ds = concat_ds.map(
    lambda examples: {
        "translation": [{"en": ex["en"], "de": ex["de"]} for ex in examples["translation"]]
    },
    batched=True,
    num_proc=2,
)

# Preview only the first few records: printing every row floods the notebook
# output and loads the whole column into memory.
for datum in normalized_ds[:5]["translation"]:
    print(datum)

 #0: 100%|██████████| 81/81 [00:02<00:00, 29.88ba/s]
 #1: 100%|██████████| 81/81 [00:02<00:00, 29.17ba/s]


In [10]:
# Training data is IWSLT train plus the WMT sample; validation and test stay IWSLT-only.
concat_ds = concatenate_datasets([iwslt_dataset["train"], wmt_dataset])
dataset = DatasetDict(
    {
        "train": concat_ds,
        "validation": iwslt_dataset["validation"],
        "test": iwslt_dataset["test"],
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 160339
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7283
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 6750
    })
})

In [18]:
# Persist the assembled train/validation/test splits under ../dataset/sample.
dataset.save_to_disk("../dataset/sample")

In [19]:
# Reload the dataset stored under ../dataset/sample.
ds = load_from_disk("../dataset/sample")
# Disabled experiment: replace the "translation" column with an English-only
# "text" column.
# NOTE(review): applied to `ds` as a whole this presumably yields one mapped
# dataset per split rather than a single Dataset — confirm before re-enabling.
# en_ds = ds.map(lambda examples: {"text": [ex["en"] for ex in examples["translation"]]}, batched=True, remove_columns=["translation"])
# en_ds
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 160339
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7283
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 6750
    })
})

In [38]:
# Define en_ds in this cell so it also runs on a fresh kernel; it was
# previously only defined in commented-out code.
# NOTE(review): assumes the English side of the *train* split was intended — confirm.
en_ds = ds["train"].map(
    lambda examples: {"text": [ex["en"] for ex in examples["translation"]]},
    batched=True,
    remove_columns=["translation"],
)
# Show a short preview instead of dumping the whole column.
en_ds[:10]["text"]

['and it can be a very complicated thing , what human health is .',
 'and bringing those two together might seem a very daunting task , but what i &apos;m going to try to say is that even in that complexity , there &apos;s some simple themes that i think , if we understand , we can really move forward .',
 'and those simple themes aren &apos;t really themes about the complex science of what &apos;s going on , but things that we all pretty well know .',
 'and i &apos;m going to start with this one : if momma ain &apos;t happy , ain &apos;t nobody happy .',
 'we know that , right ? we &apos;ve experienced that .',
 'and if we just take that and we build from there , then we can go to the next step , which is that if the ocean ain &apos;t happy , ain &apos;t nobody happy .',
 'that &apos;s the theme of my talk .',
 'and we &apos;re making the ocean pretty unhappy in a lot of different ways .',
 'this is a shot of cannery row in 1932 .',
 'cannery row , at the time , had the biggest indust

In [20]:
# Lower-cased German and English sides of every IWSLT training pair, kept as
# two parallel lists.
train_pairs = iwslt_dataset["train"]["translation"]
de_data = [pair["de"].lower() for pair in train_pairs]
en_data = [pair["en"].lower() for pair in train_pairs]

KeyError: "Column train not in the dataset. Current columns in the dataset: ['translation']"

In [24]:
total_data = 500_000
SAMPLE_SEED = 42  # fixed seed so the WMT subsample is the same on every run

# wmt_dataset may be a DatasetDict or a bare Dataset (when loaded with
# `split=...`); indexing a bare Dataset with "train" raises a KeyError.
wmt_train = wmt_dataset["train"] if isinstance(wmt_dataset, DatasetDict) else wmt_dataset

# Top the parallel lists up to total_data pairs, never asking for more rows
# than the WMT split actually contains (and for none if already full).
n_extra = min(max(total_data - len(de_data), 0), len(wmt_train))

# Sorted so the appended pairs stay in corpus order.
random_idx = sorted(random.Random(SAMPLE_SEED).sample(range(len(wmt_train)), n_extra))

# select() fetches only the sampled rows. This avoids testing every row index
# against a list (quadratic) and materializing the whole translation column.
# NOTE(review): the IWSLT pairs were lower-cased when collected, these are
# not — confirm that is intended.
if random_idx:
    for data in wmt_train.select(random_idx)["translation"]:
        de_data.append(data["de"])
        en_data.append(data["en"])

In [None]:
# The two lists are filled in lockstep (one entry per sentence pair), so both
# lengths should match.
len(de_data), len(en_data)