In [2]:
import os
import shutil
from sklearn.model_selection import train_test_split

def split_data(original_folder, output_folder, train_size=0.7, val_size=0.15, test_size=0.15):
    """
    Splits data into training, validation, and testing sets while preserving class subfolders.

    Parameters:
    - original_folder: Path to the original dataset folder.
    - output_folder: Path to the output folder where the splits will be stored.
    - train_size: Proportion of data to include in the training set.
    - val_size: Proportion of data to include in the validation set.
    - test_size: Proportion of data to include in the testing set.

    The sum of train_size, val_size, and test_size must equal 1.0.
    """
    # Ensure the split sizes sum to 1.0
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Split sizes must sum to 1.0"

    # Get class names from the subfolders in the original dataset
    class_names = [d for d in os.listdir(original_folder) if os.path.isdir(os.path.join(original_folder, d))]

    # Create directories for splits
    for split in ['train', 'val', 'test']:
        split_path = os.path.join(output_folder, split)
        os.makedirs(split_path, exist_ok=True)
        for class_name in class_names:
            os.makedirs(os.path.join(split_path, class_name), exist_ok=True)

    # Process each class separately
    for class_name in class_names:
        class_path = os.path.join(original_folder, class_name)
        files = [f for f in os.listdir(class_path) if os.path.isfile(os.path.join(class_path, f))]
        file_paths = [os.path.join(class_path, f) for f in files]

        # Split files for test set
        train_val_files, test_files = train_test_split(
            file_paths, test_size=test_size, random_state=42
        )
        # Further split for validation set
        train_files, val_files = train_test_split(
            train_val_files, test_size=val_size / (train_size + val_size), random_state=42
        )

        # Copy files to respective folders
        for src in train_files:
            dst = os.path.join(output_folder, 'train', 'images', class_name, os.path.basename(src))
            shutil.copyfile(src, dst)
        for src in val_files:
            dst = os.path.join(output_folder, 'val', 'images', class_name, os.path.basename(src))
            shutil.copyfile(src, dst)
        for src in test_files:
            dst = os.path.join(output_folder, 'test', 'images', class_name, os.path.basename(src))
            shutil.copyfile(src, dst)

    print("Data successfully split into training, validation, and testing sets.")

# Replace these paths with your actual paths
original_folder = './classification_data/images'      # Path to your original dataset folder
output_folder = './classification_data/'   # Path where you want the splits to be stored

# Call the function with default split sizes (70% train, 15% val, 15% test)
split_data(original_folder, output_folder)


FileNotFoundError: [Errno 2] No such file or directory: './classification_data/train/images/fish_17/fish_000023950001_02301.png'