In [1]:
from datasets import load_dataset, Dataset, Features, Image, Value
from pathlib import Path
import json

from huggingface_hub import HfApi

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build one training record per non-blank line of the JSON Lines metadata file.
#
# The file is streamed line by line rather than via `read_text().splitlines()`:
# `str.splitlines()` also breaks on U+2028, U+2029 and U+0085, which may appear
# unescaped inside a JSON string, so a caption containing one of them would be
# cut into two fragments that fail to parse. Iterating the file handle only
# splits on real newlines. UTF-8 is requested explicitly so non-ASCII text
# decodes the same way regardless of the platform's locale encoding.
records = []
with Path("metadata.jsonl").open(encoding="utf-8") as metadata_file:
    for line_number, line in enumerate(metadata_file, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. an extra empty line at the end of the
            # file) instead of crashing inside json.loads("").
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError as exc:
            # Same exception type as before, but the message now says which
            # line of the metadata file is malformed.
            raise json.JSONDecodeError(
                f"metadata.jsonl, line {line_number}: {exc.msg}", exc.doc, exc.pos
            ) from exc
        records.append({
            "image": item["image"],                     # target image
            "conditioning_image": item["conditioning_image"],  # conditioning image
            "text": item["text"],
            # Optional key: fall back to an empty string when it is absent.
            "negative_prompt": item.get("negative_prompt", "")
        })

# Explicit dataset schema: the two image columns are declared as RGB `Image`
# features and the two prompt columns as plain strings.
features = Features(
    image=Image(mode="RGB"),
    conditioning_image=Image(mode="RGB"),
    text=Value(dtype="string"),
    negative_prompt=Value(dtype="string"),
)


In [3]:
# Turn the list of record dicts into a Dataset, passing the explicit schema
# held in `features`.
ds = Dataset.from_list(
    records,
    features=features,
)

In [5]:
# Sanity check: print the schema attached to `ds` before uploading.
print(ds.features)

{'image': Image(mode='RGB', decode=True), 'conditioning_image': Image(mode='RGB', decode=True), 'text': Value('string'), 'negative_prompt': Value('string')}


In [6]:
# Upload `ds` to the Hugging Face Hub as the "train" split of the dataset repo,
# requesting a non-private repository. No token is passed here, so this
# presumably relies on credentials already configured in the environment
# (TODO confirm). The call is left as the cell's last expression so its return
# value is displayed.
ds.push_to_hub(
    "justacoderwhocodes/snacksy_enh_12k_1300_celebs",
    commit_message="Trying something else",
    private=False,
    split="train",
)


Map: 100%|██████████| 3334/3334 [00:01<00:00, 1913.60 examples/s]ards/s]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 11.69ba/s]
Processing Files (1 / 1): 100%|██████████|  437MB /  437MB, 87.8MB/s  
New Data Upload: 100%|██████████|  495kB /  495kB,  108kB/s  
Map: 100%|██████████| 3333/3333 [00:01<00:00, 1855.55 examples/s]13.81s/ shards]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 11.36ba/s]
Processing Files (1 / 1): 100%|██████████|  437MB /  437MB, 86.6MB/s  
New Data Upload: 100%|██████████|  625kB /  625kB,  184kB/s  
Map: 100%|██████████| 3333/3333 [00:01<00:00, 1894.46 examples/s]12.90s/ shards]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 12.15ba/s]
Processing Files (1 / 1): 100%|██████████|  436MB /  436MB, 92.1MB/s  
New Data Upload: 100%|██████████| 1.19MB / 1.19MB,  373kB/s  
Uploading the dataset shards: 100%|██████████| 3/3 [00:37<00:00, 12.63s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/justacoderwhocodes/snacksy_enh_12k_1300_celebs/commit/19d29c71f0d9cf148e9e4bfb6c64b6e7d42ce44c', commit_message='Trying something else', commit_description='', oid='19d29c71f0d9cf148e9e4bfb6c64b6e7d42ce44c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/justacoderwhocodes/snacksy_enh_12k_1300_celebs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='justacoderwhocodes/snacksy_enh_12k_1300_celebs'), pr_revision=None, pr_num=None)

In [15]:
# Load the "train" split of the Hub repository into a separate variable so the
# uploaded copy can be inspected independently of the local `ds`.
check_ds = load_dataset(
    path="justacoderwhocodes/snacksy_enh_12k_1300_celebs",
    split="train",
)

Generating train split: 100%|██████████| 10000/10000 [00:00<00:00, 10230.97 examples/s]


In [16]:
# Display the schema of the dataset loaded from the Hub, for comparison with
# the schema printed for the local `ds` before the upload.
check_ds.features

{'image': Image(mode='RGB', decode=True),
 'conditioning_image': Image(mode='RGB', decode=True),
 'text': Value('string'),
 'negative_prompt': Value('string')}