In [2]:
import os
import shutil
import numpy as np
import random

# Seed NumPy's and the stdlib's random generators with one shared value
# so that results are reproducible between runs
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# File extensions (compared case-insensitively) that mark a file as an image.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

# Annotation extensions probed for each image, in priority order. The empty
# string matches annotation files that are stored without any extension.
_ANNOTATION_EXTENSIONS = ('.xml', '.json', '.mat', '.txt', '')


def _find_annotation(anno_src_dir, base_name):
    """Return the name of the first annotation file for base_name, or None."""
    for ext in _ANNOTATION_EXTENSIONS:
        candidate = f"{base_name}{ext}"
        # isfile (not exists): with the empty extension a same-named
        # directory would otherwise be handed to shutil.copy2 and crash.
        if os.path.isfile(os.path.join(anno_src_dir, candidate)):
            return candidate
    return None


def split_caltech_dogs_dataset(
    dataset_path='./dataset',
    output_path='./dogs_split',
    train_ratio=0.7,
    val_ratio=0.15
):
    """
    Split Caltech Dogs dataset with breed subfolders into train/val/test sets.

    Each breed folder under ``<dataset_path>/images`` is shuffled with the
    global ``random`` module and split independently, so every breed is
    represented in each split according to the requested ratios. Images are
    copied (not moved) to ``<output_path>/<split>/images/<breed>/`` and a
    matching annotation file, if one is found, is copied to
    ``<output_path>/<split>/annotation/<breed>/``.

    Files already present under ``output_path`` are not removed, so re-running
    into the same directory with a different shuffle can leave images from an
    earlier run in other splits.

    Args:
        dataset_path: Path to dataset with images/ and annotation/ folders
        output_path: Where to save the split dataset
        train_ratio: Proportion for training (default: 0.7)
        val_ratio: Proportion for validation (default: 0.15). Whatever is
            left after train and val goes to the test split.
    Returns:
        Path to the organized dataset
    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
        FileNotFoundError: If ``<dataset_path>/images`` does not exist.
    """

    # Reject ratios that would silently yield a truncated or empty split
    # (small tolerance so float sums such as 0.7 + 0.3 are still accepted)
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    # Define paths
    images_dir = os.path.join(dataset_path, 'images')
    annotation_dir = os.path.join(dataset_path, 'annotation')

    # Check if directories exist (annotations are optional, images are not)
    if not os.path.exists(images_dir):
        raise FileNotFoundError(f"Images directory not found: {images_dir}")
    if not os.path.exists(annotation_dir):
        print(f"Warning: Annotation directory not found: {annotation_dir}")

    # Create output directory structure
    splits = ['train', 'val', 'test']
    for split in splits:
        os.makedirs(os.path.join(output_path, split, 'images'), exist_ok=True)
        os.makedirs(os.path.join(output_path, split, 'annotation'), exist_ok=True)

    # Get all breed folders
    breed_folders = sorted(d for d in os.listdir(images_dir)
                           if os.path.isdir(os.path.join(images_dir, d)))

    print(f"Found {len(breed_folders)} breed folders")
    print("-" * 50)

    # Running totals per split, reported at the end
    totals = {split: 0 for split in splits}

    # Process each breed
    for breed in breed_folders:
        breed_images_path = os.path.join(images_dir, breed)

        # os.listdir returns names in arbitrary order; sort first so the
        # shuffle below gives the same split for the same random state.
        image_files = sorted(f for f in os.listdir(breed_images_path)
                             if f.lower().endswith(_IMAGE_EXTENSIONS))

        if not image_files:
            print(f"Warning: No images found in {breed}")
            continue

        # Shuffle the images
        random.shuffle(image_files)

        # Calculate split sizes; the test split takes the remainder
        n_total = len(image_files)
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)

        split_images = {
            'train': image_files[:n_train],
            'val': image_files[n_train:n_train + n_val],
            'test': image_files[n_train + n_val:],
        }

        # The annotation source folder is the same for every image of a breed
        anno_src_path = os.path.join(annotation_dir, breed)
        has_annotations = os.path.isdir(anno_src_path)

        # Copy images and annotation to split directories
        for split_name in splits:
            # Create breed subdirectories for images and annotations
            img_dest_dir = os.path.join(output_path, split_name, 'images', breed)
            os.makedirs(img_dest_dir, exist_ok=True)
            anno_dest_dir = os.path.join(output_path, split_name, 'annotation', breed)
            os.makedirs(anno_dest_dir, exist_ok=True)

            for img_file in split_images[split_name]:
                # Copy image
                shutil.copy2(os.path.join(breed_images_path, img_file),
                             os.path.join(img_dest_dir, img_file))

                if not has_annotations:
                    continue

                # Copy the annotation sharing the image's base name, if any
                base_name = os.path.splitext(img_file)[0]
                anno_file = _find_annotation(anno_src_path, base_name)
                if anno_file is not None:
                    shutil.copy2(os.path.join(anno_src_path, anno_file),
                                 os.path.join(anno_dest_dir, anno_file))

        # Update statistics
        for split_name in splits:
            totals[split_name] += len(split_images[split_name])

        print(f"{breed}: {len(split_images['train'])} train, "
              f"{len(split_images['val'])} val, {len(split_images['test'])} test")

    print("-" * 50)
    print(f"TOTAL: {totals['train']} train, {totals['val']} val, {totals['test']} test")
    print(f"Split complete! Organized dataset saved to: {output_path}")

    return output_path

# Execute the split; point dataset_path at your actual dataset location
organized_path = split_caltech_dogs_dataset(
    output_path='./dogs_split',
    dataset_path='./dataset',
)

Found 120 breed folders
--------------------------------------------------
n02085620-Chihuahua: 106 train, 22 val, 24 test
n02085782-Japanese_spaniel: 129 train, 27 val, 29 test
n02085936-Maltese_dog: 176 train, 37 val, 39 test
n02086079-Pekinese: 104 train, 22 val, 23 test
n02086240-Shih-Tzu: 149 train, 32 val, 33 test
n02086646-Blenheim_spaniel: 131 train, 28 val, 29 test
n02086910-papillon: 137 train, 29 val, 30 test
n02087046-toy_terrier: 120 train, 25 val, 27 test
n02087394-Rhodesian_ridgeback: 120 train, 25 val, 27 test
n02088094-Afghan_hound: 167 train, 35 val, 37 test
n02088238-basset: 122 train, 26 val, 27 test
n02088364-beagle: 136 train, 29 val, 30 test
n02088466-bloodhound: 130 train, 28 val, 29 test
n02088632-bluetick: 119 train, 25 val, 27 test
n02089078-black-and-tan_coonhound: 111 train, 23 val, 25 test
n02089867-Walker_hound: 107 train, 22 val, 24 test
n02089973-English_foxhound: 109 train, 23 val, 25 test
n02090379-redbone: 103 train, 22 val, 23 test
n02090622-borzoi: