# load geometry3k


Instructions:
- If the dataset requires authentication, run `huggingface-cli login` in a terminal and login with your token before running the next cell.
- Run the code cell that follows to download the dataset metadata and save the first 20 images from the `train` split into `data/geometry3k_images`.
- You can change `N` in the code cell to save more or fewer images.

In [None]:
from datasets import load_dataset
from pathlib import Path
from PIL import Image
import io

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("hiyouga/geometry3k")

# Directory to save images
out_dir = Path("data/geometry3k_images")
out_dir.mkdir(parents=True, exist_ok=True)

def save_image_field(img_field, out_path):
    """Save an image field from the dataset to disk.
    Supports PIL Image, bytes, and URL strings (if already loaded)."""
    if img_field is None:
        return False
    # If already a PIL Image
    if isinstance(img_field, Image.Image):
        img_field.save(out_path)
        return True
    # If bytes (e.g., b'\x89PNG...')
    if isinstance(img_field, (bytes, bytearray)):
        try:
            Image.open(io.BytesIO(img_field)).save(out_path)
            return True
        except Exception:
            return False
    # If it's a dict with 'bytes' or 'array' fields (common in datasets Image)
    if isinstance(img_field, dict):
        # try 'bytes' then 'array' (numpy) conversions
        if 'bytes' in img_field and img_field['bytes'] is not None:
            try:
                Image.open(io.BytesIO(img_field['bytes'])).save(out_path)
                return True
            except Exception:
                pass
        if 'array' in img_field and img_field['array'] is not None:
            try:
                Image.fromarray(img_field['array']).save(out_path)
                return True
            except Exception:
                pass
    # If it's a string, it may be a local path or URL already downloaded by the dataset
    if isinstance(img_field, str):
        try:
            # try opening as a file path
            Image.open(img_field).save(out_path)
            return True
        except Exception:
            return False
    return False

# Save first N images from train split as an example
N = 20
saved = 0
for i, item in enumerate(ds['train']):
    if saved >= N:
        break
    images = item.get('images') if isinstance(item, dict) else None
    if not images:
        continue
    # images can be a list; iterate
    if isinstance(images, (list, tuple)):
        for j, im in enumerate(images):
            out_path = out_dir / f'train_{i:06d}_{j}.png'
            if save_image_field(im, out_path):
                saved += 1
                if saved >= N:
                    break
    else:
        out_path = out_dir / f'train_{i:06d}.png'
        if save_image_field(images, out_path):
            saved += 1

print(f'Saved {saved} images to {out_dir}')

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 2101/2101 [00:00<00:00, 7253.44 examples/s]
Generating validation split: 100%|██████████| 300/300 [00:00<00:00, 10717.80 examples/s]
Generating test split: 100%|██████████| 601/601 [00:00<00:00, 12539.49 examples/s]


In [6]:
ds['train'][1]['images'].

AttributeError: 'list' object has no attribute 'save'