In [1]:
import os
import shutil
import random
import pandas as pd

In [3]:
# Dataset locations and split ratio used by the cells below
source_dir = "C:/Users/DELL/Desktop/Computer_Vision_Projects/cotopo_plant_dx/data/raw"  # raw images, one sub-folder per class
target_dir = "C:/Users/DELL/Desktop/Computer_Vision_Projects/cotopo_plant_dx/data/processed"  # split output, holds train/ and test/
train_split = 0.8  # fraction of each class that goes to train/; the remainder goes to test/

# Create both split roots up front; exist_ok makes re-running this cell harmless.
for split_name in ("train", "test"):
    os.makedirs(os.path.join(target_dir, split_name), exist_ok=True)

In [4]:
# Split every class folder under `source_dir` into train/ and test/ copies
# under `target_dir`, and collect one metadata row per copied image.
# Reads `source_dir`, `target_dir` and `train_split` set earlier in the notebook.
# Produces `df` with columns image_path, crop, disease, full_label, label_id.
#
# NOTE: files already present in `target_dir` from an earlier run are never
# removed here; clear the folder first if a previous run used a different split.
split_seed = 42  # fixed seed so re-running reproduces exactly the same split
records = []

# os.listdir returns entries in arbitrary order; sort so every run is identical.
for class_name in sorted(os.listdir(source_dir)):
    class_path = os.path.join(source_dir, class_name)
    if not os.path.isdir(class_path):
        continue

    # Folder names are expected to look like "<crop>___<disease>".
    # partition() never raises, so a name containing only single underscores
    # (e.g. "Tomato_healthy") falls back to "Unknown" instead of crashing on
    # a failed two-value unpack.
    crop, separator, disease = class_name.partition("___")
    if not separator:
        crop, disease = class_name, "Unknown"

    # Keep regular files only: shutil.copy cannot copy a nested directory.
    images = sorted(
        entry
        for entry in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, entry))
    )
    # Seed per class so adding or removing another class does not reshuffle
    # this one.
    random.Random(f"{split_seed}:{class_name}").shuffle(images)

    # int() truncates, so very small classes may end up with no train images.
    split_idx = int(len(images) * train_split)
    partitions = (
        ("train", images[:split_idx]),
        ("test", images[split_idx:]),
    )

    for split_name, split_imgs in partitions:
        split_dir = os.path.join(target_dir, split_name, class_name)
        os.makedirs(split_dir, exist_ok=True)
        for img in split_imgs:
            shutil.copy(os.path.join(class_path, img), os.path.join(split_dir, img))
            # image_path is stored relative to target_dir with forward slashes.
            records.append([f"{split_name}/{class_name}/{img}", crop, disease, class_name])

# Convert to DataFrame
df = pd.DataFrame(records, columns=["image_path", "crop", "disease", "full_label"])

# Assign numeric IDs: one integer code per distinct full_label
df["label_id"] = df["full_label"].astype("category").cat.codes

In [None]:
# Write the label table into the root of the processed dataset
labels_csv = os.path.join(target_dir, "labels.csv")
df.to_csv(labels_csv, index=False)

# Report completion, one message per line
for message in ("Train/Test split done", "labels.csv saved with numeric label_id"):
    print(message)
