In [21]:
%env HF_DATASETS_CACHE /Volumes/Workspace/.cache/huggingface/datasets

from datasets import concatenate_datasets, load_dataset, load_from_disk, DatasetDict
from tokenizers import BertWordPieceTokenizer
import random


env: HF_DATASETS_CACHE=/Volumes/Workspace/.cache/huggingface/datasets


In [22]:
# Load the "de-en" configuration through the local loader script; `data_dir` points the
# script at the IWSLT14 files. The trailing expression displays the resulting splits.
iwslt_dataset = load_dataset("../dataset/iwslt14/iwslt_loader.py", "de-en", data_dir="../dataset/iwslt14")
iwslt_dataset

Using custom data configuration de-en-data_dir=..%2Fdataset%2Fiwslt14
Reusing dataset iwslt217 (/Volumes/Workspace/.cache/huggingface/datasets/iwslt217/de-en-data_dir=..%2Fdataset%2Fiwslt14/1.0.0/dfd22eb448d0ab29dc9f514ee237642604aa514fb962fd35c4da74a9f7e13bac)
100%|██████████| 3/3 [00:00<00:00, 26.97it/s]


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 160239
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7283
    })
})

In [20]:
# Reduce the validation split to a single "text" column holding the English side of
# each translation pair; the original "translation" column is dropped.
val_ds = iwslt_dataset["validation"].map(
    lambda batch: {"text": [pair["en"] for pair in batch["translation"]]},
    batched=True,
    remove_columns=["translation"],
)
val_ds

100%|██████████| 8/8 [00:00<00:00, 107.33ba/s]


Dataset({
    features: ['text'],
    num_rows: 7283
})

In [23]:
# Load only the first 100 rows of the WMT19 de-en training split (`train[:100]`).
# NOTE(review): because `split=` is given, the result appears to be a single Dataset
# rather than a DatasetDict keyed by split name — confirm before indexing it by "train".
wmt_dataset = load_dataset("wmt19", "de-en", split="train[:100]")

Reusing dataset wmt19 (/Volumes/Workspace/.cache/huggingface/datasets/wmt19/de-en/1.0.0/fae232cf0c13b62b26731bafce0810bc652fb5799189790bed836db0cee28056)


In [25]:
# Stack the IWSLT training split and the WMT sample into one dataset.
concat_ds = concatenate_datasets([iwslt_dataset["train"], wmt_dataset])

# Rebuild every translation dict so it carries exactly the "en" and "de" keys.
normalized_ds = concat_ds.map(
    lambda examples: {
        "translation": [{"en": ex["en"], "de": ex["de"]} for ex in examples["translation"]]
    },
    batched=True,
    num_proc=2,
)

# Show a handful of rows as the cell result; printing every one of the ~160k pairs
# floods the notebook output and bloats the saved file.
normalized_ds[:5]["translation"]

 #0: 100%|██████████| 81/81 [00:02<00:00, 29.88ba/s]
 #1: 100%|██████████| 81/81 [00:02<00:00, 29.17ba/s]


In [10]:
# Training data is IWSLT train plus the WMT sample; validation and test are the
# IWSLT splits unchanged.
concat_ds = concatenate_datasets([iwslt_dataset["train"], wmt_dataset])
dataset = DatasetDict(
    {
        "train": concat_ds,
        "validation": iwslt_dataset["validation"],
        "test": iwslt_dataset["test"],
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 160339
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7283
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 6750
    })
})

In [18]:
# Persist the assembled splits under ../dataset/sample so later cells can reload them
# with load_from_disk.
dataset.save_to_disk("../dataset/sample")

In [19]:
# Reload the splits that were written to ../dataset/sample and display them.
ds = load_from_disk("../dataset/sample")
# Disabled experiment: would swap the "translation" column for a "text" column that
# holds only the English side of each pair.
# en_ds = ds.map(lambda examples: {"text": [ex["en"] for ex in examples["translation"]]}, batched=True, remove_columns=["translation"])
# en_ds
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 160339
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7283
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 6750
    })
})

In [38]:
# Preview the first few English sentences only; displaying the entire "text" column
# dumps every row into the notebook output.
# NOTE(review): `en_ds` is assigned only in a commented-out line of the preceding cell,
# so this cell relies on leftover kernel state — confirm where it should be defined.
en_ds[:10]["text"]

['and it can be a very complicated thing , what human health is .',
 'and bringing those two together might seem a very daunting task , but what i &apos;m going to try to say is that even in that complexity , there &apos;s some simple themes that i think , if we understand , we can really move forward .',
 'and those simple themes aren &apos;t really themes about the complex science of what &apos;s going on , but things that we all pretty well know .',
 'and i &apos;m going to start with this one : if momma ain &apos;t happy , ain &apos;t nobody happy .',
 'we know that , right ? we &apos;ve experienced that .',
 'and if we just take that and we build from there , then we can go to the next step , which is that if the ocean ain &apos;t happy , ain &apos;t nobody happy .',
 'that &apos;s the theme of my talk .',
 'and we &apos;re making the ocean pretty unhappy in a lot of different ways .',
 'this is a shot of cannery row in 1932 .',
 'cannery row , at the time , had the biggest indust

In [20]:
# Collect the lower-cased German and English sides of every IWSLT training pair
# into two parallel lists.
train_pairs = iwslt_dataset["train"]["translation"]
de_data = [pair["de"].lower() for pair in train_pairs]
en_data = [pair["en"].lower() for pair in train_pairs]

KeyError: "Column train not in the dataset. Current columns in the dataset: ['translation']"

In [24]:
# Top the parallel corpus up to `total_data` sentence pairs with a random sample of WMT rows.
total_data = 500_000
SAMPLE_SEED = 42  # fixed seed so the same WMT rows are drawn on every run

# `load_dataset(..., split=...)` yields a bare Dataset while a split-less call yields a
# DatasetDict; indexing the former with "train" raises
# KeyError("Column train not in the dataset"), so accept either shape.
wmt_train = wmt_dataset["train"] if isinstance(wmt_dataset, DatasetDict) else wmt_dataset

# Derive both sizes from the data instead of hard-coding them, and never request more
# rows than exist (random.sample raises ValueError when k exceeds the population).
n_needed = max(total_data - len(iwslt_dataset["train"]), 0)
n_sampled = min(n_needed, len(wmt_train))

# Sorted so the selected pairs are appended in dataset order, as the original scan did.
random_idx = sorted(random.Random(SAMPLE_SEED).sample(range(len(wmt_train)), n_sampled))

# Select the sampled rows directly rather than scanning every row and testing
# membership in a list, which costs O(rows * sample size).
# NOTE(review): the IWSLT sentences are lower-cased when collected but these are not —
# confirm whether that asymmetry is intended.
for data in wmt_train.select(random_idx)["translation"]:
    de_data.append(data["de"])
    en_data.append(data["en"])

In [None]:
# Sanity check: the two lists are appended to in lockstep, so both lengths should match.
len(de_data), len(en_data)