In [1]:
import os
import glob
import json
from datasets import Dataset, Features, Value, Image as HFImage

# Extensions (without the dot) that are treated as images; matched in any casing.
_IMAGE_EXTENSIONS = ("jpg", "jpeg", "png", "webp")


def _find_image_files(base_dir):
    """Return the sorted paths of the image files located directly in *base_dir*."""
    # Escape the directory part so glob metacharacters in its name ("[", "*",
    # "?") are taken literally; only the file-name part is a pattern.
    escaped_dir = glob.escape(os.fspath(base_dir))
    found = set()
    for ext in _IMAGE_EXTENSIONS:
        # "jpg" -> "[jJ][pP][gG]": a single pattern covers every casing
        # (".jpg", ".JPG", ".Jpg"), so no file can be listed twice.
        any_case = "".join(f"[{ch.lower()}{ch.upper()}]" for ch in ext)
        found.update(glob.glob(os.path.join(escaped_dir, f"*.{any_case}")))
    # Sorted so the row order does not depend on directory listing order.
    return sorted(found)


def _read_txt_label(txt_file):
    """Return the stripped content of *txt_file*, or "" if missing or unreadable."""
    try:
        with open(txt_file, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        # A missing sidecar is expected and simply means "no label".
        return ""
    except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
        print(f"Error reading {txt_file}: {e}")
        return ""


def _read_json_metadata(json_file):
    """Return the top-level JSON object of *json_file* as a ``{key: str(value)}`` dict.

    A missing file yields an empty dict silently; an unreadable, malformed or
    non-object file is reported on stdout and also yields an empty dict.
    """
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as e:  # ValueError covers JSONDecodeError
        print(f"Error reading {json_file}: {e}")
        return {}
    if not isinstance(data, dict):
        print(
            f"Error reading {json_file}: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        return {}
    return {k: str(v) for k, v in data.items()}


def create_image_dataset(base_dir: str, mode: str = "json") -> "Dataset":
    """
    Creates a HuggingFace dataset from images and optional label/metadata files.

    Images are the ``.jpg``/``.jpeg``/``.png``/``.webp`` files (any casing)
    located directly in ``base_dir``; rows are ordered by sorted file path.
    Sidecar files share the image's base name:

    * ``mode="txt"``: ``<name>.txt`` is read into a ``label`` column.
    * ``mode="json"``: every top-level key of ``<name>.json`` becomes a string
      column (values are converted with ``str``).
    * ``mode="image_only"``: no extra columns.

    A missing or unreadable sidecar contributes empty strings, so every column
    always has exactly one entry per image.

    Args:
        base_dir (str): Directory containing image files and corresponding label files
        mode (str): One of ['json', 'txt', 'image_only']

    Returns:
        datasets.Dataset: A HuggingFace dataset with an ``image`` column (file
        paths cast to the ``Image`` feature), a ``filename`` column, and the
        extra string columns described above.

    Raises:
        AssertionError: If ``mode`` is not one of the supported values.
    """
    # Kept as an assert so callers keep seeing AssertionError; note that the
    # check is skipped when Python runs with -O.
    assert mode in {"json", "txt", "image_only"}, f"Unsupported mode: {mode}"

    image_files = _find_image_files(base_dir)

    # One metadata dict per image, in the same order as image_files.
    records = []
    for image_file in image_files:
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        if mode == "txt":
            txt_file = os.path.join(base_dir, f"{base_name}.txt")
            records.append({"label": _read_txt_label(txt_file)})
        elif mode == "json":
            json_file = os.path.join(base_dir, f"{base_name}.json")
            records.append(_read_json_metadata(json_file))
        else:
            records.append({})

    # Build the columns only after all records are known: a key that first
    # appears on a later image is back-filled with "" for the earlier ones,
    # which keeps every column the same length as the image list.
    column_names = dict.fromkeys(key for record in records for key in record)
    extra_columns = {
        name: [record.get(name, "") for record in records]
        for name in column_names
    }

    # Assemble dataset dict
    # NOTE(review): a JSON key named "image" or "filename" overrides the
    # built-in column here (and its feature below) -- behavior kept as in the
    # original; confirm whether such keys should be renamed or rejected.
    dataset_dict = {
        "image": image_files,
        "filename": [os.path.basename(path) for path in image_files],
        **extra_columns,
    }

    # Define features
    features = {
        "image": HFImage(),
        "filename": Value("string"),
    }
    for k in extra_columns:
        features[k] = Value("string")

    dataset = Dataset.from_dict(dataset_dict)
    dataset = dataset.cast(Features(features))
    return dataset

if __name__ == "__main__":
    # Script entry point: build the dataset from a local folder, report its
    # size and columns, then upload it to the Hub as a private repository.
    source_dir = "/local/yada/apps/SimpleTuner-a/data/b74444"
    sidecar_mode = "txt"  # one of: "json", "txt", "image_only"

    built = create_image_dataset(source_dir, mode=sidecar_mode)

    row_count = len(built)
    print(f"Dataset created with {row_count} images")
    print(f"Columns: {built.column_names}")

    built.push_to_hub("incantor/qft-b74444-anime-400p", private=True)


Casting the dataset:   0%|          | 0/447 [00:00<?, ? examples/s]

Dataset created with 447 images
Columns: ['image', 'filename', 'label']


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]