In [1]:
import os
import shutil
import numpy as np
from tqdm import tqdm

def _copy_split(files, split, dtype, img_dir, mask_dir, output_dir):
    """Copy one split's images (and masks, when present); return the count of images lacking a mask."""
    missing_masks = 0
    for f in tqdm(files, desc=f"Copying {split} files for {dtype}", leave=False):
        # Copy image
        src_img = os.path.join(img_dir, f)
        dst_img = os.path.join(output_dir, split, 'images', dtype, f)
        shutil.copy2(src_img, dst_img)

        # Copy mask (assumes same filename stem with a .png extension)
        mask_name = os.path.splitext(f)[0] + '.png'
        src_mask = os.path.join(mask_dir, mask_name)
        if os.path.exists(src_mask):
            dst_mask = os.path.join(output_dir, split, 'masks', dtype, mask_name)
            shutil.copy2(src_mask, dst_mask)
        else:
            # Best-effort: the image is still copied, the gap is reported by the caller.
            missing_masks += 1
    return missing_masks


def split_dataset(
    input_dir,
    output_dir,
    defect_types,
    train_ratio=0.83,
    val_ratio=0.10,
    test_ratio=0.07,
    seed=42
):
    """
    Split dataset into train/val/test sets while preserving directory structure

    For every defect type the images found in ``<input_dir>/images/<type>`` are
    sorted, shuffled with ``seed`` and copied to
    ``<output_dir>/<split>/images/<type>``. A mask named ``<image stem>.png`` in
    ``<input_dir>/masks/<type>`` is copied alongside when it exists.

    Args:
        input_dir: Path to original dataset (with images/ and masks/ folders)
        output_dir: Path to save split dataset
        defect_types: List of defect category folders
        train_ratio: Proportion of data for training
        val_ratio: Proportion of data for validation
        test_ratio: Proportion of data for testing (only validated; the test
            split receives whatever remains after train and val)
        seed: Random seed for reproducibility

    Raises:
        ValueError: If any ratio is negative or the ratios do not sum to 1.
    """
    import warnings

    # Validate ratios (explicit raise: an assert would vanish under `python -O`)
    if train_ratio < 0 or val_ratio < 0 or test_ratio < 0:
        raise ValueError("Ratios must be non-negative")
    # Written as `not (... < tol)` so that NaN ratios are rejected as well.
    if not abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6:
        raise ValueError("Ratios must sum to 1")

    # Create output directories
    splits = ['train', 'val', 'test']
    for split in splits:
        for dtype in defect_types:
            os.makedirs(os.path.join(output_dir, split, 'images', dtype), exist_ok=True)
            os.makedirs(os.path.join(output_dir, split, 'masks', dtype), exist_ok=True)

    # Process each defect type
    for dtype in tqdm(defect_types, desc="Processing defect types"):
        img_dir = os.path.join(input_dir, 'images', dtype)
        mask_dir = os.path.join(input_dir, 'masks', dtype)

        # os.listdir returns entries in arbitrary order; sort first so the same
        # seed always yields the same split regardless of the filesystem.
        images = sorted(
            f for f in os.listdir(img_dir)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )
        # A local generator keeps the shuffle reproducible without reseeding
        # NumPy's process-global random state.
        np.random.RandomState(seed).shuffle(images)

        # Calculate split indices
        total = len(images)
        train_end = int(train_ratio * total)
        val_end = train_end + int(val_ratio * total)

        # Split files
        partitions = (
            ('train', images[:train_end]),
            ('val', images[train_end:val_end]),
            ('test', images[val_end:]),
        )

        # Copy files to split directories
        missing_masks = 0
        for split, files in partitions:
            missing_masks += _copy_split(files, split, dtype, img_dir, mask_dir, output_dir)

        if missing_masks:
            warnings.warn(
                f"{missing_masks} image(s) of type '{dtype}' have no matching mask in {mask_dir}"
            )

# --- Run configuration -------------------------------------------------------
# Source dataset (expects images/ and masks/ sub-folders) and destination root.
INPUT_DIR = "/kaggle/input/pcb-masks-images/pcb_dataset"
OUTPUT_DIR = "/kaggle/working/"

# Defect categories; each name is a sub-folder under images/ and masks/.
DEFECT_TYPES = [
    "Missing_hole", "Mouse_bite", "Open_circuit",
    "Short", "Spur", "Spurious_copper",
]

# Perform a 70/15/15 train/val/test split with a fixed seed.
split_dataset(
    input_dir=INPUT_DIR, output_dir=OUTPUT_DIR, defect_types=DEFECT_TYPES,
    train_ratio=0.7, val_ratio=0.15, test_ratio=0.15,
    seed=42,
)

print("Dataset splitting completed successfully!")

Processing defect types:   0%|          | 0/6 [00:00<?, ?it/s]
Copying train files for Missing_hole:   0%|          | 0/80 [00:00<?, ?it/s][A
Copying train files for Missing_hole:   6%|▋         | 5/80 [00:00<00:01, 39.37it/s][A
Copying train files for Missing_hole:  11%|█▏        | 9/80 [00:00<00:02, 35.12it/s][A
Copying train files for Missing_hole:  16%|█▋        | 13/80 [00:00<00:02, 32.11it/s][A
Copying train files for Missing_hole:  21%|██▏       | 17/80 [00:00<00:01, 33.47it/s][A
Copying train files for Missing_hole:  26%|██▋       | 21/80 [00:00<00:01, 31.97it/s][A
Copying train files for Missing_hole:  31%|███▏      | 25/80 [00:00<00:01, 33.80it/s][A
Copying train files for Missing_hole:  36%|███▋      | 29/80 [00:00<00:01, 35.02it/s][A
Copying train files for Missing_hole:  41%|████▏     | 33/80 [00:00<00:01, 35.62it/s][A
Copying train files for Missing_hole:  46%|████▋     | 37/80 [00:01<00:01, 33.99it/s][A
Copying train files for Missing_hole:  51%|█████▏    | 41/

Dataset splitting completed successfully!



