In [1]:
import json
import os

from datasets import load_dataset, Dataset
from PIL import Image
from tqdm import tqdm
from transformers import DonutProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the CORD v1 dataset from the Hugging Face Hub; the export loop further
# down reads its "train", "validation" and "test" splits.
cord_repo_id = "naver-clova-ix/cord-v1"
dataset = load_dataset(cord_repo_id)

In [4]:
# Load the pretrained Donut processor and keep a direct handle on its tokenizer.
# `json2token` later adds the per-key <s_key>/</s_key> tokens to this tokenizer
# object, and the export loop uses it to produce `labels` / `input_ids`.
donut_checkpoint = "naver-clova-ix/donut-base"
processor = DonutProcessor.from_pretrained(donut_checkpoint)
tokenizer = processor.tokenizer

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [5]:
    # Tokens known to this notebook: every "<s_key>" / "</s_key>" pair that
    # json2token has registered on the tokenizer. Categorical value tokens of the
    # form "<value/>" may also be placed here so that json2token emits them.
    added_tokens = set()

    def json2token(obj, update_special_tokens_for_json_key, sort_json_key: bool = True):
        """
        Convert an ordered JSON object into a Donut-style token sequence.

        Args:
            obj: Parsed JSON value (dict, list, or scalar).
            update_special_tokens_for_json_key: When true, each dict key ``k`` that
                has not been seen yet gets its ``<s_k>`` / ``</s_k>`` tokens added
                to the module-level ``tokenizer`` and recorded in ``added_tokens``.
            sort_json_key: When true, dict keys are emitted in reverse-sorted
                order; otherwise in the dict's own iteration order.

        Returns:
            For a dict: the concatenation of ``<s_k>`` + encoded value + ``</s_k>``
            for every key (or the raw ``"text_sequence"`` value when that is the
            dict's only key). For a list: the encoded items joined by ``<sep/>``.
            For any other value: ``str(obj)``, rewritten to ``<obj/>`` when that
            token is present in ``added_tokens``.
        """
        if isinstance(obj, dict):
            if len(obj) == 1 and "text_sequence" in obj:
                return obj["text_sequence"]
            keys = sorted(obj.keys(), reverse=True) if sort_json_key else obj.keys()
            output = ""
            for k in keys:
                start_token, end_token = fr"<s_{k}>", fr"</s_{k}>"
                # Only touch the tokenizer the first time a key is seen; calling
                # add_tokens again for a known key would be redundant work.
                if update_special_tokens_for_json_key and not (
                    start_token in added_tokens and end_token in added_tokens
                ):
                    tokenizer.add_tokens([start_token, end_token])
                    added_tokens.update((start_token, end_token))
                output += (
                    start_token
                    + json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
                    + end_token
                )
            return output
        if isinstance(obj, list):
            return r"<sep/>".join(
                [json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
            )
        obj = str(obj)
        # Categorical values: look up the *wrapped* form. Testing the bare value
        # (as before) only matched leaves that already looked like "<s_key>" and
        # turned them into "<<s_key>/>".
        if f"<{obj}/>" in added_tokens:
            obj = f"<{obj}/>"  # for categorical special tokens
        return obj

In [6]:
# Export every CORD split to `cord_{split}.jsonl` (one JSON object per line) and
# save each example's image as `cord_images/{image_id}.jpg`.
#
# Fields written per record:
#   task         - the constant "cord"
#   image_path   - relative path of the saved JPEG
#   ground_truth - the example's `gt_parse` object re-serialised as a JSON string
#   labels       - token ids of the json2token sequence followed by "</s>"
#   input_ids    - train: ids of "<s>" + token sequence; other splits: ids of "<s>" only
#
# json2token(..., True) adds unseen <s_key>/</s_key> tokens to `tokenizer` as a
# side effect, so an example's keys are always registered before it is tokenized.

# Image.save does not create missing directories; without this a fresh run fails.
os.makedirs("cord_images", exist_ok=True)

# The prompt for non-train splits is constant, so tokenize it once.
start_prompt_ids = tokenizer("<s>", add_special_tokens=False).input_ids

image_id = 0  # running counter shared by all splits so file names never collide
for split in ["test", "train", "validation"]:
    dataset_split = dataset[split]
    with open(f"cord_{split}.jsonl", "w", encoding="utf-8") as f:
        # Iterate row by row instead of materialising the whole image column.
        for example in tqdm(dataset_split):
            image_path = f"cord_images/{image_id}.jpg"
            example["image"].save(image_path)
            image_id += 1

            # Parse the ground-truth JSON once and reuse it for both fields.
            gt_parse = json.loads(example["ground_truth"])["gt_parse"]
            gt_tokens = json2token(gt_parse, True)

            line = {
                "task": "cord",
                "image_path": image_path,
                "ground_truth": json.dumps(gt_parse),
                "labels": tokenizer(gt_tokens + "</s>", add_special_tokens=False).input_ids,
            }
            if split == "train":
                line["input_ids"] = tokenizer("<s>" + gt_tokens, add_special_tokens=False).input_ids
            else:
                line["input_ids"] = start_prompt_ids
            f.write(json.dumps(line) + "\n")

100%|██████████| 100/100 [00:06<00:00, 14.78it/s]
100%|██████████| 800/800 [00:58<00:00, 13.60it/s]
100%|██████████| 100/100 [00:07<00:00, 13.92it/s]


In [25]:
# Persist the tokenizer, including the <s_key>/</s_key> tokens added during the
# export, into the local "cord_tokenizer" directory. Kept as the final expression
# so the notebook displays the tuple of written file paths.
tokenizer.save_pretrained(save_directory="cord_tokenizer")

('cord_tokenizer/tokenizer_config.json',
 'cord_tokenizer/special_tokens_map.json',
 'cord_tokenizer/sentencepiece.bpe.model',
 'cord_tokenizer/added_tokens.json',
 'cord_tokenizer/tokenizer.json')

In [7]:
# Reload the exported JSONL files as a dataset with the three named splits.
jsonl_by_split = {name: f"cord_{name}.jsonl" for name in ("train", "test", "validation")}
ds = load_dataset("json", data_files=jsonl_by_split)

Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 6105.25it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1361.49it/s]
Generating train split: 800 examples [00:00, 40829.42 examples/s]
Generating test split: 100 examples [00:00, 23248.73 examples/s]
Generating validation split: 100 examples [00:00, 19240.81 examples/s]


In [20]:
# Publish the reloaded splits to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable rather than
# being written into the notebook. When the variable is unset, token=None is
# passed and the library falls back to the locally stored login credentials.
# SECURITY: a token that was previously hardcoded in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
ds.push_to_hub("cord_donut_multitask", token=os.environ.get("HF_TOKEN"))

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 122.11ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 563.83ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:09<00:00,  9.13s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 509.08ba/s]
'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3.us-east-1.amazonaws.com', port=443): Max retries exceeded with url: /repos/da/2e/da2e3325cca3dc46dd681a6573ef98c743e93c1467ba69d8ec0b5520ebdefdd6/8d611b70910dd56ea3a3a8c433add419a58dd10b40fbbe9c19d3cb5a06e8cb91?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20231107%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231107T044451Z&X-Amz-Expires=900&X-Amz-Signature=0d6eda50e496f47cdb9e7179dc3d7951ec577a024eaaba881e299769e44d9d5b&X-Amz-SignedHead