In [2]:
import os
import shutil
import random

# Source folders (one sub-folder per class) and the root of the balanced copy.
# Raw strings keep the Windows backslashes literal.
FIELD_DATA_DIR = r"D:\CADT\CapstoneProjectI\ml__model\data\Rice Diseases\field background"
WHITE_DATA_DIR = r"D:\CADT\CapstoneProjectI\ml__model\data\Rice Diseases\white background"
OUTPUT_DATA_DIR = r"D:\CADT\CapstoneProjectI\ml__model\data\balance_data"

# Upper bound on images kept per class, for each background type.
FIELD_TARGET = 1700
WHITE_TARGET = 800

# Images currently available per class (field background).
field_counts = {
    "Bacterial Blight": 3428,
    "Brown Spot": 3338,
    "Healthy": 2860,
    "Hispa": 3526,
    "Leaf Blast": 2581,
    "Sheath Blight": 2723,
    "Tungro": 5842,
}

# Images currently available per class (white background).
white_counts = {
    "Bacterial Blight": 2619,
    "Brown Spot": 6087,
    "Healthy": 6415,
    "Hispa": 765,
    "Leaf Blast": 5833,
    "Sheath Blight": 809,
    "Tungro": 939,
}

# Build <output>/<background>/<class> for every class listed in field_counts.
# os.makedirs creates missing parents, so the output root and the
# per-background folders come into existence along the way.
for bg in ("field", "white"):
    for cls in field_counts:
        os.makedirs(os.path.join(OUTPUT_DATA_DIR, bg, cls), exist_ok=True)

# Function to balance and copy images
def balance_and_copy(source_dir, target_dir, class_name, current_count, target_count, seed=None):
    """Copy a random subset of one class's images into the balanced dataset.

    Args:
        source_dir: Directory that holds one sub-folder per class.
        target_dir: Output directory; images are written to
            ``target_dir/class_name`` (created if missing).
        class_name: Name of the class sub-folder to sample from.
        current_count: Expected number of images for the class. It is echoed
            in the summary line and also caps how many images are picked.
        target_count: Maximum number of images to copy.
        seed: Optional seed for a private RNG so the same files are picked on
            every run. When ``None`` the module-level ``random`` state is used.

    Returns:
        None. A warning is printed and nothing is copied when the class
        folder is missing or contains no images.
    """
    source_path = os.path.join(source_dir, class_name)
    if not os.path.isdir(source_path):
        print(f"Warning: {source_path} does not exist. Skipping.")
        return

    # Compare extensions case-insensitively so files such as "IMG_01.JPG" are
    # not dropped, and skip anything that is not a regular file. Sorting makes
    # the candidate order independent of the filesystem's listing order, which
    # is what makes a seeded selection repeatable.
    image_extensions = ('.jpg', '.png', '.jpeg')
    image_files = sorted(
        f for f in os.listdir(source_path)
        if f.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(source_path, f))
    )
    if not image_files:
        print(f"Warning: No images found in {source_path}. Skipping.")
        return

    # Never ask for more than what is on disk, and never for a negative amount
    # (random.sample raises ValueError in both cases).
    num_to_pick = max(0, min(current_count, target_count, len(image_files)))
    sampler = random if seed is None else random.Random(seed)
    selected_images = sampler.sample(image_files, num_to_pick)

    # Do not rely on the caller having created the class folder beforehand.
    target_path = os.path.join(target_dir, class_name)
    os.makedirs(target_path, exist_ok=True)
    for img in selected_images:
        shutil.copy(os.path.join(source_path, img), os.path.join(target_path, img))

    # normpath drops a trailing separator so the label is never empty.
    source_label = os.path.basename(os.path.normpath(source_dir))
    print(f"{class_name} ({source_label}): Copied {len(selected_images)} images (from {current_count})")

# Walk the classes in field_counts order. For each class the field-background
# images are sampled first, then the white-background images, provided the
# class also has an entry in white_counts.
for cls in field_counts:
    for src_root, bg_folder, counts, cap in (
        (FIELD_DATA_DIR, "field", field_counts, FIELD_TARGET),
        (WHITE_DATA_DIR, "white", white_counts, WHITE_TARGET),
    ):
        if cls not in counts:
            continue
        balance_and_copy(
            source_dir=src_root,
            target_dir=os.path.join(OUTPUT_DATA_DIR, bg_folder),
            class_name=cls,
            current_count=counts[cls],
            target_count=cap,
        )

print("\nDataset balancing complete! Check the directory at:", OUTPUT_DATA_DIR)

Bacterial Blight (field background): Copied 1700 images (from 3428)
Bacterial Blight (white background): Copied 800 images (from 2619)
Brown Spot (field background): Copied 1700 images (from 3338)
Brown Spot (white background): Copied 800 images (from 6087)
Healthy (field background): Copied 1700 images (from 2860)
Healthy (white background): Copied 800 images (from 6415)
Hispa (field background): Copied 1700 images (from 3526)
Hispa (white background): Copied 765 images (from 765)
Leaf Blast (field background): Copied 1700 images (from 2581)
Leaf Blast (white background): Copied 800 images (from 5833)
Sheath Blight (field background): Copied 1700 images (from 2723)
Sheath Blight (white background): Copied 800 images (from 809)
Tungro (field background): Copied 1700 images (from 5842)
Tungro (white background): Copied 800 images (from 939)

Dataset balancing complete! Check the directory at: D:\CADT\CapstoneProjectI\ml__model\data\balance_data
