In [3]:
import os, shutil
import pandas as pd
import numpy as np
from huggingface_hub import hf_hub_download, snapshot_download

parquet_path = hf_hub_download(
    repo_id="poloclub/diffusiondb",
    filename="metadata.parquet",
    repo_type="dataset"
)

meta = pd.read_parquet(parquet_path,
                     columns=["image_name", "prompt", "part_id"])

mask = meta["prompt"].str.contains(r"\bposter\b", case=False, na=False)
posters = meta[mask].reset_index(drop=True)

print(f"Found {len(posters)} candidate poster prompts")

Found 36171 candidate poster prompts


In [None]:
sampled = posters.sample(n=5, random_state=42)  # Change to desired number

out_dir = "/Users/tsigall/repositories/WatchdogAI/data/ai_posters"
os.makedirs(out_dir, exist_ok=True)

patterns = [
    f"images/part-{int(pid):06d}/{img}" 
    for pid, img in zip(sampled.part_id, sampled.image_name)
]

print("Patterns to download:")
for pattern in patterns:
    print(pattern)

print(len(patterns), "patterns to download")

# Download only those files into a cache dir
cache_dir = snapshot_download(
    repo_id="poloclub/diffusiondb",
    repo_type="dataset"
)
print(f"Downloaded {len(os.listdir(cache_dir))} files to {cache_dir}")

Patterns to download:
images/part-000690/bce5ad7c-fc36-4fe7-902a-57ba31863903.png
images/part-001708/c1a63dda-60eb-4ce9-aa89-2ae9724797c7.png
images/part-001993/7f4ec28e-8cbb-4944-95b2-6a94de784d39.png
images/part-000976/49f0f46f-c46b-4927-8964-97884ff48f19.png
images/part-001302/587daf5e-68f4-43d0-9663-cb43c32d53db.png
5 patterns to download


Fetching 16006 files:   0%|          | 0/16006 [00:00<?, ?it/s]

.gitignore:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

part-000001.zip:   0%|          | 0.00/528M [00:00<?, ?B/s]

part-000002.zip:   0%|          | 0.00/512M [00:00<?, ?B/s]

part-000005.zip:   0%|          | 0.00/458M [00:00<?, ?B/s]

part-000004.zip:   0%|          | 0.00/451M [00:00<?, ?B/s]

part-000006.zip:   0%|          | 0.00/424M [00:00<?, ?B/s]

part-000003.zip:   0%|          | 0.00/524M [00:00<?, ?B/s]

part-000007.zip:   0%|          | 0.00/454M [00:00<?, ?B/s]

part-000008.zip:   0%|          | 0.00/530M [00:00<?, ?B/s]

part-000009.zip:   0%|          | 0.00/519M [00:00<?, ?B/s]

part-000010.zip:   0%|          | 0.00/459M [00:00<?, ?B/s]

part-000011.zip:   0%|          | 0.00/477M [00:00<?, ?B/s]

part-000012.zip:   0%|          | 0.00/476M [00:00<?, ?B/s]

part-000013.zip:   0%|          | 0.00/387M [00:00<?, ?B/s]

part-000014.zip:   0%|          | 0.00/401M [00:00<?, ?B/s]

part-000015.zip:   0%|          | 0.00/442M [00:00<?, ?B/s]

part-000016.zip:   0%|          | 0.00/417M [00:00<?, ?B/s]

part-000017.zip:   0%|          | 0.00/433M [00:00<?, ?B/s]

part-000018.zip:   0%|          | 0.00/490M [00:00<?, ?B/s]

part-000019.zip:   0%|          | 0.00/457M [00:00<?, ?B/s]

part-000020.zip:   0%|          | 0.00/429M [00:00<?, ?B/s]

part-000021.zip:   0%|          | 0.00/446M [00:00<?, ?B/s]

part-000022.zip:   0%|          | 0.00/417M [00:00<?, ?B/s]