In [4]:
import os
import shutil
import random

# Define paths
# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\C", "\L" and "\F" are invalid escape sequences (Python warns about them),
# and a folder name starting with e.g. "t" or "n" would silently turn into a
# tab or newline character.
dataset_dir = r"D:\Capstone Project\Leaf_coffe_augmented"   # source images, one sub-folder per class
output_dir = r"D:\Capstone Project\Final coffee leaf_Dataset"  # root of the train/val/test tree
# Class names; each one is used as a sub-folder name under both directories
categories = ["Cerscospora", "Healthy", "Miner", "Phoma", "Leaf rust"]

# Make sure every <output_dir>/<split>/<category> folder exists before any
# file is copied into it (already-existing folders are left untouched).
for split in ("train", "val", "test"):
    for category in categories:
        split_category_dir = os.path.join(output_dir, split, category)
        os.makedirs(split_category_dir, exist_ok=True)

# Split ratios.
# - test_ratio is measured against ALL images of a class (originals + augmented),
#   but the test images themselves are drawn from the originals only.
# - train_ratio is applied to what is left after the test images are removed,
#   and validation receives the rest. With 0.8 / 0.1 this yields roughly
#   72% train / 18% val / 10% test of the whole class, not 80/10/10.
# - val_ratio is kept for reference; the split below does not read it.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Dedicated, seeded generator so that re-running this cell reproduces the same
# split. Without it, a second run into the same output folders (which are
# created with exist_ok=True and never emptied) could copy one image into
# several splits.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)


def _copy_images(file_names, src_dir, dst_dir):
    """Copy every file named in `file_names` from `src_dir` into `dst_dir`."""
    for file_name in file_names:
        shutil.copy(os.path.join(src_dir, file_name), os.path.join(dst_dir, file_name))


# Process each category
for category in categories:
    category_path = os.path.join(dataset_dir, category)

    # os.listdir returns entries in arbitrary order, so sort first; otherwise
    # the seeded shuffle would still depend on the file-system order.
    images = sorted(
        img for img in os.listdir(category_path)
        if img.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    # Files with "aug_" anywhere in their name are treated as augmented,
    # everything else as an original image.
    original_images = [img for img in images if "aug_" not in img]
    augmented_images = [img for img in images if "aug_" in img]

    # Shuffle images for randomness
    split_rng.shuffle(original_images)
    split_rng.shuffle(augmented_images)

    # Test set: 10% of the class total, taken from the original images only
    num_test = int(test_ratio * len(images))
    if num_test > len(original_images):
        # The slice below would silently return fewer images than requested.
        print("Warning: '{}' has only {} original images; the test set will be "
              "smaller than the requested {}.".format(category, len(original_images), num_test))
    test_images = original_images[:num_test]
    _copy_images(test_images, category_path, os.path.join(output_dir, "test", category))

    # Everything not sent to test (leftover originals + all augmented images)
    # is pooled for the train/val split.
    # NOTE(review): if an augmented file was generated from an original that
    # went to test, it still ends up in train/val here (possible data leakage).
    # Filtering those out needs the augmentation naming scheme - TODO confirm.
    remaining_images = original_images[num_test:] + augmented_images

    # Shuffle again before final split
    split_rng.shuffle(remaining_images)

    # First train_ratio of the pool goes to train, the remainder to val
    num_train = int(train_ratio * len(remaining_images))

    # Copy (the source files are left in place) into the respective folders
    _copy_images(remaining_images[:num_train], category_path, os.path.join(output_dir, "train", category))
    _copy_images(remaining_images[num_train:], category_path, os.path.join(output_dir, "val", category))

print("Dataset split completed successfully!")


Dataset split completed successfully!


In [5]:
import os
import shutil
import random

# Define paths
# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\C", "\L" and "\F" are invalid escape sequences (Python warns about them),
# and a folder name starting with e.g. "t" or "n" would silently turn into a
# tab or newline character.
dataset_dir = r"D:\Capstone Project\Leaf_coffe_dataset"
output_dir = r"D:\Capstone Project\Final coffee leaf_Dataset"  # root of the train/val/test tree
# Class names; each one is used as a sub-folder name under every split folder
categories = ["Cerscospora", "Healthy", "Miner", "Phoma", "Leaf rust"]

# Guarantee that the train/val/test folder of every category is present;
# folders that already exist are accepted as they are.
splits = ["train", "val", "test"]
for split in splits:
    for category in categories:
        os.makedirs(
            os.path.join(output_dir, split, category),
            exist_ok=True,
        )

# Per-class distribution this cell expects to find on its first run.
# Only used for the sanity note below; the number of files to move is derived
# from the real folder contents.
CURRENT_TRAIN = 21600
CURRENT_VAL = 5400
CURRENT_TEST = 3000  # Already correct

# Desired distribution per class
DESIRED_TRAIN = 24000
DESIRED_VAL = 3000
DESIRED_TEST = 3000  # No change

# Only image files take part in the rebalancing, so stray files in the folders
# (e.g. thumbnail caches) are neither counted nor moved.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# Seeded generator: the same folder contents always yield the same selection.
rebalance_rng = random.Random(42)

all_classes_on_target = True

# Process each category
for category in categories:
    train_path = os.path.join(output_dir, "train", category)
    val_path = os.path.join(output_dir, "val", category)

    # Get existing images (sorted, because os.listdir order is arbitrary)
    train_images = sorted(f for f in os.listdir(train_path) if f.lower().endswith(IMAGE_EXTENSIONS))
    val_images = sorted(f for f in os.listdir(val_path) if f.lower().endswith(IMAGE_EXTENSIONS))

    found = (len(train_images), len(val_images))
    if found not in ((CURRENT_TRAIN, CURRENT_VAL), (DESIRED_TRAIN, DESIRED_VAL)):
        print("Note: '{}' has {} train / {} val images, which is neither the expected "
              "starting point nor the target.".format(category, found[0], found[1]))

    # Move only what is actually missing in train and actually surplus in val.
    # For the expected 21,600 / 5,400 start this is 2,400 files; on a re-run it
    # is 0, so the cell no longer keeps draining the validation set.
    missing_in_train = DESIRED_TRAIN - len(train_images)
    surplus_in_val = len(val_images) - DESIRED_VAL
    num_to_move = max(0, min(missing_in_train, surplus_in_val))

    rebalance_rng.shuffle(val_images)
    move_to_train = val_images[:num_to_move]

    for img in move_to_train:
        shutil.move(os.path.join(val_path, img), os.path.join(train_path, img))

    # The images not selected above simply stay where they are; moving them
    # "to val" would mean moving each file onto its own path.
    final_train = len(train_images) + num_to_move
    final_val = len(val_images) - num_to_move
    if (final_train, final_val) != (DESIRED_TRAIN, DESIRED_VAL):
        all_classes_on_target = False
        print("Warning: '{}' ends with {} train / {} val images instead of {} / {}.".format(
            category, final_train, final_val, DESIRED_TRAIN, DESIRED_VAL))

# Report success only when every class really reached the desired counts
if all_classes_on_target:
    print("Dataset distribution corrected to exact 80/10/10!")
else:
    print("Dataset redistribution finished, but not every class reached the desired counts (see warnings above).")


Dataset distribution corrected to exact 80/10/10!
