In [1]:
import os, shutil
from tqdm import tqdm
import pandas as pd
import zipfile
from huggingface_hub import hf_hub_download, snapshot_download

num_images_to_download = 5000
out_dir = "/Users/tsigall/repositories/WatchdogAI/data/ai_posters"
repo_id = "poloclub/diffusiondb"
repo_type = "dataset"

parquet_path = hf_hub_download(
    repo_id=repo_id,
    filename="metadata.parquet",
    repo_type=repo_type
)

meta = pd.read_parquet(parquet_path,
                     columns=["image_name", "prompt", "part_id"])

mask = meta["prompt"].str.contains(r"\bposter\b", case=False, na=False)
posters = meta[mask].reset_index(drop=True)

print(f"Found {len(posters)} candidate poster prompts")

Found 36171 candidate poster prompts


In [2]:
sampled = posters.sample(n=num_images_to_download, random_state=621)

required_part_ids = sampled['part_id'].unique()
zip_patterns = [f"part-{int(pid):06d}.zip" for pid in required_part_ids]
print(f"Identified {len(zip_patterns)} zip files needed for {num_images_to_download} images.")



cache_dir = snapshot_download(
    repo_id=repo_id,
    repo_type=repo_type,
    allow_patterns=zip_patterns,
    local_dir_use_symlinks=False, # Keep this for reliability
)
print(f"Zip files downloaded to cache: {cache_dir}")
# Assume zips are in a subdirectory based on previous info
zip_location_in_cache = os.path.join(cache_dir, "diffusiondb-large-part-1")
if not os.path.isdir(zip_location_in_cache):
    zip_location_in_cache = cache_dir # Fallback
print(f"Looking for zips in: {zip_location_in_cache}")

Identified 1600 zip files needed for 5000 images.


Fetching 0 files: 0it [00:00, ?it/s]

Zip files downloaded to cache: /Users/tsigall/.cache/huggingface/hub/datasets--poloclub--diffusiondb/snapshots/fb620fbe49fa4420e0734bd9c0df11f51176b61f
Looking for zips in: /Users/tsigall/.cache/huggingface/hub/datasets--poloclub--diffusiondb/snapshots/fb620fbe49fa4420e0734bd9c0df11f51176b61f/diffusiondb-large-part-1


In [3]:
os.makedirs(out_dir, exist_ok=True)
extracted_count = 0
print(f"Extracting {len(sampled)} images to {out_dir}...")

for _, row in tqdm(sampled.iterrows(), total=len(sampled), desc="Extracting"):
    part_id = int(row['part_id'])
    image_name = row['image_name']

    zip_filename = f"part-{part_id:06d}.zip"
    zip_file_path = os.path.join(zip_location_in_cache, zip_filename)

    # Path of the image *inside* the zip archive
    image_path_in_zip = f"images/part-{part_id:06d}/{image_name}"

    # Destination path for the extracted image
    output_image_path = os.path.join(out_dir, image_name)

    # Simplified extraction - assumes zip exists and contains the image
    if os.path.exists(zip_file_path) and not os.path.exists(output_image_path):
        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zf:
                with zf.open(image_path_in_zip) as source, open(output_image_path, "wb") as target:
                    shutil.copyfileobj(source, target)
                extracted_count += 1
        except (KeyError, zipfile.BadZipFile, FileNotFoundError) as e:
             # Catch specific, common extraction errors but keep it brief
             # print(f"Skipping {image_name}: Error accessing {zip_filename} or image within. Details: {e}")
             pass # Silently skip on error in concise version
        except Exception as e:
             # Catch unexpected errors during extraction
             print(f"Warning: Unexpected error extracting {image_name}: {e}")


print(f"\nExtraction complete. Successfully extracted {extracted_count} new images.")


Extracting 5000 images to /Users/tsigall/repositories/WatchdogAI/data/ai_posters...


Extracting: 100%|██████████| 5000/5000 [00:00<00:00, 8060.44it/s]


Extraction complete. Successfully extracted 0 new images.



