In [1]:
from datasets import DatasetDict, Dataset
from io import BytesIO
from PIL import Image
from tqdm import tqdm
import base64
import json
import os

Using TensorFlow backend.





In [None]:
def getImageData(JsonFilePath):
    """Return the first page's embedded image from a JSON file as an RGB PIL image.

    The file is parsed as UTF-8 JSON, the base64 string stored under
    ``pages[0].image.content`` is decoded, and the resulting bytes are opened
    with PIL and converted to RGB.

    Args:
        JsonFilePath: Path of the JSON file to read.

    Returns:
        The decoded image, or ``None`` if any step raised an exception (the
        error is printed rather than propagated).
    """
    picture = None
    try:
        with open(JsonFilePath, 'r', encoding="utf-8") as handle:
            first_page = json.load(handle)["pages"][0]

        # Wrap the decoded bytes in an in-memory buffer so PIL can read them
        # like a file.
        payload = BytesIO(base64.b64decode(first_page["image"]["content"]))
        picture = Image.open(payload).convert("RGB")
    except Exception as exc:
        # Best-effort loader: report the failure and fall through to None.
        print(f"Error loading image from {JsonFilePath}: {exc}")
    return picture


In [None]:

def load_docling_split(split_dir):
    """Load one split of the Docling dataset from a directory tree.

    ``split_dir`` is walked recursively. Every directory that directly
    contains exactly two files is treated as one candidate sample:

    * a ``.json`` file, whose image is decoded with ``getImageData``;
    * a ``.txt`` file, whose entire UTF-8 content becomes the ``output``.

    A candidate is kept only when both an image and an output were obtained;
    directories missing either part, or whose image failed to decode, are
    skipped.

    Args:
        split_dir: Root directory of the split. A trailing path separator is
            tolerated.

    Returns:
        A ``Dataset`` built from ``{"image": ..., "output": ...}`` records,
        ordered by sample directory path.
    """
    # Reuse the file names os.walk already collected instead of re-listing
    # each directory: os.listdir would also return sub-directories, which are
    # not sample files. Sorting makes the sample order independent of the
    # filesystem's enumeration order, so repeated runs build the same dataset.
    leaf_dirs = sorted(
        (root, sorted(files))
        for root, _, files in os.walk(split_dir)
        if len(files) == 2
    )

    # normpath drops a trailing separator; without it basename("dir/") is "".
    split_name = os.path.basename(os.path.normpath(split_dir))

    data = []
    for root, files in tqdm(leaf_dirs, desc=f"Loading split: {split_name}"):
        image = None
        output = None

        for fname in files:
            fpath = os.path.join(root, fname)

            if fname.endswith(".json"):
                image = getImageData(fpath)

            elif fname.endswith(".txt"):
                with open(fpath, 'r', encoding="utf-8") as f:
                    output = f.read()

        # Only complete (image, text) pairs become samples.
        if image is not None and output is not None:
            data.append({
                "image": image,
                "output": output
            })

    return Dataset.from_list(data)

# Build the train/validation splits and persist them to disk.
print("Loading Dataset")

# Note: the "validation" split is read from the ./Data/test directory.
# The comprehension evaluates the pairs in order, so "train" loads first.
dataset = DatasetDict({
    split_name: load_docling_split(split_path)
    for split_name, split_path in (
        ("train", "./Data/train"),
        ("validation", "./Data/test"),
    )
})

dataset.save_to_disk("SmolDoclingDataNoLoc")

print("Done.")

Loading Dataset


Loading split: train: 100%|██████████| 366/366 [00:24<00:00, 14.72it/s]
Loading split: test: 100%|██████████| 87/87 [00:05<00:00, 14.65it/s]
Saving the dataset (5/5 shards): 100%|██████████| 366/366 [00:04<00:00, 85.71 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 87/87 [00:01<00:00, 81.46 examples/s]

Done.



