In [1]:
import os
import random
import shutil
from glob import glob

# --- CONFIGURATION ---
# Dataset root; it is expected to hold an 'images' and a 'labels' sub-folder.
DATA_ROOT = 'data set/helmet-dataset'
# Fraction of the valid pairs assigned to the training set (the rest is validation).
SPLIT_RATIO = 0.8
# Image extensions probed, in this order, when looking for a label's image.
IMAGE_EXTENSIONS = [
    '.jpg',
    '.jpeg',
    '.png',
    '.webp',
    '.bmp',
]
# ---------------------

# Root folders for images and labels.
IMAGES_DIR, LABELS_DIR = (
    os.path.join(DATA_ROOT, kind) for kind in ('images', 'labels')
)
# Per-split sub-folders, derived from the two roots above.
TRAIN_IMG_DIR, VAL_IMG_DIR = (
    os.path.join(IMAGES_DIR, split) for split in ('train', 'val')
)
TRAIN_LBL_DIR, VAL_LBL_DIR = (
    os.path.join(LABELS_DIR, split) for split in ('train', 'val')
)

# --- 0. Clean Up (Resetting Data) ---
def cleanup(images_dir=None, labels_dir=None):
    """Undo a previous split by moving everything back to the root folders.

    Every entry found in the ``train`` and ``val`` sub-folders of the images
    and labels directories is moved into its parent (root) directory. A
    sub-folder that ends up empty is removed.

    An entry whose name already exists in the root directory is never
    overwritten: it stays in the split folder and a warning with the number
    of such entries is printed.

    Args:
        images_dir: Root images directory. Defaults to ``IMAGES_DIR``.
        labels_dir: Root labels directory. Defaults to ``LABELS_DIR``.
    """
    print("--- Starting Cleanup: Resetting all files to root directories... ---")

    images_dir = IMAGES_DIR if images_dir is None else images_dir
    labels_dir = LABELS_DIR if labels_dir is None else labels_dir

    # Each split folder is paired with its own root explicitly. Deciding the
    # destination from a "'labels' in path" substring test misroutes images
    # as soon as the dataset path itself happens to contain "labels".
    for root_dest in (images_dir, labels_dir):
        for split in ('train', 'val'):
            folder = os.path.join(root_dest, split)
            if not os.path.isdir(folder):
                continue

            left_behind = 0
            for f_name in os.listdir(folder):
                src_path = os.path.join(folder, f_name)
                dst_path = os.path.join(root_dest, f_name)

                if os.path.exists(dst_path):
                    # Never overwrite a file that is already in the root.
                    left_behind += 1
                    continue
                shutil.move(src_path, dst_path)

            if left_behind:
                print(f"Warning: {left_behind} item(s) left in {folder} because "
                      f"the same name already exists in {root_dest}.")
                continue

            try:
                os.rmdir(folder)
            except OSError:
                # Best effort: the folder may have gained content meanwhile
                # or may not be removable; leaving it in place is harmless.
                pass
    print("Cleanup complete. Files should be back in the root 'images' and 'labels' folders.")


# --- 1. Get List of Valid Pairs ---
def get_valid_pairs(images_dir=None, labels_dir=None, extensions=None):
    """Return the ``(basename, image_extension)`` pairs that can be split.

    A pair is valid when the labels directory holds a non-empty
    ``<basename>.txt`` file and the images directory holds a file named
    ``<basename><ext>`` whose extension matches one of ``extensions``.

    Image extensions are compared case-insensitively, and the extension is
    returned exactly as it is spelled on disk (e.g. ``'.JPG'``), so that
    ``basename + ext`` is always the real file name. When several images
    share a basename, the one whose extension comes first in ``extensions``
    wins.

    Args:
        images_dir: Directory to search for images. Defaults to ``IMAGES_DIR``.
        labels_dir: Directory to search for labels. Defaults to ``LABELS_DIR``.
        extensions: Accepted image extensions (with leading dot), in order
            of preference. Defaults to ``IMAGE_EXTENSIONS``.

    Returns:
        A list of ``(basename, ext)`` tuples sorted by basename. The list is
        empty (and an error is printed) when the labels directory is missing
        or contains no non-empty ``.txt`` file.
    """
    images_dir = IMAGES_DIR if images_dir is None else images_dir
    labels_dir = LABELS_DIR if labels_dir is None else labels_dir
    extensions = IMAGE_EXTENSIONS if extensions is None else extensions

    # Only select non-empty .txt files. The '.txt' suffix is matched exactly
    # because the label is later addressed as basename + '.txt'.
    label_basenames = []
    if os.path.isdir(labels_dir):
        with os.scandir(labels_dir) as entries:
            for entry in entries:
                if (entry.name.endswith('.txt') and entry.is_file()
                        and entry.stat().st_size > 0):
                    label_basenames.append(os.path.splitext(entry.name)[0])
    # Directory listing order is arbitrary; sort for a reproducible result.
    label_basenames.sort()

    if not label_basenames:
        print(f"Error: No valid (non-empty) .txt label files found in {labels_dir}.")
        return []

    # Lower-cased extension -> preference rank (first occurrence wins).
    priority = {}
    for rank, ext in enumerate(extensions):
        priority.setdefault(ext.lower(), rank)

    # Index the images directory once instead of probing the filesystem for
    # every (label, extension) combination.
    images_by_stem = {}
    if os.path.isdir(images_dir):
        with os.scandir(images_dir) as entries:
            for entry in entries:
                stem, ext = os.path.splitext(entry.name)
                rank = priority.get(ext.lower())
                if rank is None or not entry.is_file():
                    continue
                candidate = (rank, ext)
                # Keep the most preferred extension; ties on rank are broken
                # by the extension spelling so the choice is deterministic.
                if stem not in images_by_stem or candidate < images_by_stem[stem]:
                    images_by_stem[stem] = candidate

    paired_files = [
        (basename, images_by_stem[basename][1])
        for basename in label_basenames
        if basename in images_by_stem
    ]

    unmatched = len(label_basenames) - len(paired_files)
    if unmatched:
        print(f"Warning: {unmatched} label file(s) have no matching image "
              f"in {images_dir} and were skipped.")

    return paired_files

# --- 2. Move Files ---
def move_files(pair_list, target_split, images_dir=None, labels_dir=None):
    """Move image/label pairs from the root folders into a split sub-folder.

    For every ``(basename, ext)`` in ``pair_list`` the image
    ``<basename><ext>`` and the label ``<basename>.txt`` are moved into the
    ``target_split`` sub-folder of the images and labels directories. Both
    sub-folders are created if they do not exist yet.

    A pair is moved as a unit: if the label cannot be moved, the image that
    was just moved is put back before the error is re-raised, so no image
    ends up in a split folder without its label.

    Args:
        pair_list: Iterable of ``(basename, ext)`` tuples.
        target_split: Name of the split sub-folder, e.g. ``'train'``.
        images_dir: Root images directory. Defaults to ``IMAGES_DIR``.
        labels_dir: Root labels directory. Defaults to ``LABELS_DIR``.

    Raises:
        OSError: If a file of a pair cannot be moved (including a missing
            source file).
    """
    images_dir = IMAGES_DIR if images_dir is None else images_dir
    labels_dir = LABELS_DIR if labels_dir is None else labels_dir

    # Ensure the destination folders exist
    img_split_dir = os.path.join(images_dir, target_split)
    lbl_split_dir = os.path.join(labels_dir, target_split)
    os.makedirs(img_split_dir, exist_ok=True)
    os.makedirs(lbl_split_dir, exist_ok=True)

    for basename, ext in pair_list:
        img_name = basename + ext
        lbl_name = basename + '.txt'

        img_src = os.path.join(images_dir, img_name)
        lbl_src = os.path.join(labels_dir, lbl_name)
        img_dst = os.path.join(img_split_dir, img_name)
        lbl_dst = os.path.join(lbl_split_dir, lbl_name)

        shutil.move(img_src, img_dst)
        try:
            shutil.move(lbl_src, lbl_dst)
        except OSError:
            # Undo the image move so the pair is not left half-moved.
            shutil.move(img_dst, img_src)
            raise

# --- MAIN EXECUTION ---
# Resets any earlier split, collects the valid image/label pairs, shuffles
# them and moves the first SPLIT_RATIO share to 'train' and the rest to 'val'.
if __name__ == '__main__':
    # Start from a flat layout so a re-run re-splits the whole dataset.
    cleanup()

    paired_files = get_valid_pairs()

    if not paired_files:
        print("\nSplit failed. Please verify conversion and XML class names.")
        # `exit()` is a helper injected by the `site` module for interactive
        # sessions: it may be missing in scripts and shuts down a notebook
        # kernel. Raise SystemExit directly, with a non-zero status because
        # the split did not happen.
        raise SystemExit(1)

    # Shuffle in place so the train/val assignment is random.
    random.shuffle(paired_files)

    # int() truncates, so any remainder goes to the validation set.
    train_count = int(len(paired_files) * SPLIT_RATIO)
    train_pairs = paired_files[:train_count]
    val_pairs = paired_files[train_count:]

    print("\n--- Starting Split ---")
    print(f"Total valid image/label pairs: {len(paired_files)}")
    print(f"Train Set Size: {len(train_pairs)}")
    print(f"Validation Set Size: {len(val_pairs)}")

    move_files(train_pairs, 'train')
    move_files(val_pairs, 'val')

    print("\nData splitting complete. Your files are ready for Colab!")

--- Starting Cleanup: Resetting all files to root directories... ---
Cleanup complete. Files should be back in the root 'images' and 'labels' folders.

--- Starting Split ---
Total valid image/label pairs: 761
Train Set Size: 608
Validation Set Size: 153

Data splitting complete. Your files are ready for Colab!
