The data for the *burn*, *rot*, and *spot* classes comes from <a href="https://universe.roboflow.com/hehe-ngdal/orchid-leaf-spots-blujx">Roboflow</a>, while the data for the *chlorosis* and *powdery mildew* classes comes from <a href="https://www.kaggle.com/datasets/aibuzz/apple-leaf-disease-powdery-mildew">Kaggle</a>.

In [31]:
import os

# Paths to the data used for fine-tuning
labels_dir = "Dataset/train/labels"
imgs_dir = "Dataset/train/images"

# View the file names of the first 10 label files.
# Filter on the extension first, then slice: folding both tests into a single
# condition such as `fn.endswith(".txt") * i < 10` is evaluated as
# `(fn.endswith(".txt") * i) < 10`, which lets non-label entries through.
# os.listdir() returns entries in arbitrary order, so sort for a stable preview.
label_preview = sorted(fn for fn in os.listdir(labels_dir) if fn.endswith(".txt"))[:10]
for fn in label_preview:
    print(fn)

-_100_jpeg.rf.2f3091f5b4dfeaca9c096eb98d450482.txt
-_100_jpeg.rf.34152c110e8bc75e21c90b7eeaf6aa37.txt
-_100_jpeg.rf.37083ef3b826602cbf42fafbf2ac4c73.txt
023d6156-d789-4f46-ac5b-f764d6624f99_jpg.rf.15b750aff286f23aa9c9e8dd5b64bc48.txt
023d6156-d789-4f46-ac5b-f764d6624f99_jpg.rf.adeeef29fbf1f8cad9222e88d75f6d1f.txt
023d6156-d789-4f46-ac5b-f764d6624f99_jpg.rf.da4d32ccafb0b0812648e0a3110ad520.txt
100_jpeg.rf.5a7f76eeef6a04ac0d419e4d3d65b1ec.txt
100_jpeg.rf.60d1ba1fe865964c30aa7cd9c42cdd27.txt
100_jpeg.rf.85a9c76be79ccce93481c1d15c65105e.txt
101_jpg.rf.44b1ee63a37cfe04ec550c9b69df247c.txt


In [32]:
# Load the label and image file names.
# os.listdir() gives no ordering guarantee, so sort both lists to make
# everything derived from them (e.g. "first N per class") reproducible.
label_fns = sorted(fn for fn in os.listdir(labels_dir) if fn.endswith(".txt"))
img_fns = sorted(fn for fn in os.listdir(imgs_dir) if fn.endswith(".jpg"))

print(f"Number of label files: {len(label_fns)}")
# Count the images from img_fns (not label_fns) so a mismatch is visible
print(f"Number of image files: {len(img_fns)}")

Number of label files: 1551
Number of image files: 1551


In [33]:
# Inspect the first label file: show each annotation line as a list of
# whitespace-separated fields, followed by its leading field (the class ID)
first_label = os.path.join(labels_dir, label_fns[0])

with open(first_label) as label_file:
    lines = list(label_file)

for raw_line in lines:
    # split() with no arguments already discards surrounding whitespace
    content = raw_line.split()
    print(f"Content of the first label file: {content}")
    print(f"Class ID is the first element in each line: {content[0]}")

Content of the first label file: ['0', '0.746875', '0.69140625', '0.11015625', '0.0734375']
Class ID is the first element in each line: 0


In [34]:
# shutil supplies the file-copy helper used when filling the class folders
import shutil

# Destination folder for each image class
output_burn = "Dataset/burn"  # class 0
output_rot  = "Dataset/rot"   # class 1
output_spot = "Dataset/spot"  # class 2

# Class ID (kept as a string) -> destination folder
class_map = dict(zip(('0', '1', '2'), (output_burn, output_rot, output_spot)))

# Make sure every destination folder exists; existing folders are left as-is
for class_dir in class_map.values():
    os.makedirs(class_dir, exist_ok=True)

In [35]:
# Per class ID, the (image_path, image_name) pairs whose label file
# contains annotations of that single class only
single_class_images = {cid: [] for cid in ('0', '1', '2')}

for label_fn in label_fns:
    label_path = os.path.join(labels_dir, label_fn)

    # Keep only the non-blank lines, stripped of surrounding whitespace
    with open(label_path, 'r') as f:
        lines = [text for text in (raw.strip() for raw in f) if text]

    # An empty label file has nothing to classify
    if not lines:
        continue

    # The first field of every annotation line is its class ID
    class_ids = [entry.split()[0] for entry in lines]

    # Ignore files that mix several classes
    if len(set(class_ids)) != 1:
        continue

    # Ignore classes that have no destination folder
    class_id = class_ids[0]
    if class_id not in class_map:
        continue

    # Record the pair only when the matching .jpg is actually on disk
    image_name = os.path.splitext(label_fn)[0] + '.jpg'
    image_path = os.path.join(imgs_dir, image_name)
    if os.path.exists(image_path):
        single_class_images[class_id].append((image_path, image_name))

In [None]:
# Limit to 100 images per class (manual cleaning could reduce them)
MAX_IMAGES_PER_CLASS = 100

limited_images = {}

for class_id, entries in single_class_images.items():
    # Slicing never raises: a class with fewer entries simply keeps them all
    limited_images[class_id] = entries[:MAX_IMAGES_PER_CLASS]
    # Report how many images were actually kept for this class
    print(f"{class_id}: Selected {len(limited_images[class_id])} images")

0: Selected 100 images
1: Selected 100 images
2: Selected 100 images


In [37]:
# Copy every selected image into the folder mapped to its class;
# the file keeps its original name inside the destination folder
for cid, selected in limited_images.items():
    target_dir = class_map[cid]
    for src_path, file_name in selected:
        shutil.copy(src_path, os.path.join(target_dir, file_name))