## Split into train and validation datasets

In [4]:
# Root folder of the dataset; the train/ and val/ sub-folders are created here.
datasetFolderPath = r"C:\Users\faraboli\Desktop\BubbleID\BubbleIDGit\ProjetBubbleID\training\DATASETS\dataset_all_png"
# Folder holding the images and their LabelMe annotations that will be split.
inputDir = r"C:\Users\faraboli\Desktop\BubbleID\BubbleIDGit\ProjetBubbleID\training\DATASETS\dataset_all_png\all"
import shutil
import os, random
from tqdm import tqdm

def split_labelme_dataset(
    input_dir,
    output_dir,
    train_ratio=0.8,
    move_files=False,
    seed=None,
):
    """
    Split a LabelMe dataset (images + .json annotations) into train/val subsets.

    Images are discovered in ``input_dir`` by extension (.jpg, .png, .jpeg,
    case-insensitive), shuffled, and the first ``train_ratio`` share goes to
    ``output_dir/train`` while the rest goes to ``output_dir/val``. For each
    image, the annotation with the same base name and a ``.json`` extension is
    transferred alongside it when it exists.

    Note: existing contents of ``train/`` and ``val/`` are never cleared, so
    re-running with a different shuffle can leave the same image in both
    subsets.

    Args:
        input_dir (str): folder containing the images and their LabelMe .json files
        output_dir (str): root folder in which train/ and val/ are created
        train_ratio (float): share of images assigned to train (0.8 = 80%),
            must lie in [0, 1]
        move_files (bool): True = move the files, False = copy them
        seed (int | None): when given, the shuffle uses a dedicated RNG seeded
            with this value so the split is reproducible; when None, the
            global ``random`` state is used

    Raises:
        ValueError: if ``train_ratio`` is outside [0, 1]
        FileNotFoundError: if ``input_dir`` does not exist
    """
    # A ratio outside [0, 1] would otherwise yield a silently wrong split
    # (negative slice index, or an empty validation set).
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    # List the input first so a bad path fails before any folder is created.
    # Sorting makes the pre-shuffle order independent of the filesystem, which
    # is required for a given seed to always produce the same split.
    image_exts = ('.jpg', '.png', '.jpeg')
    images = sorted(
        f for f in os.listdir(input_dir)
        if f.lower().endswith(image_exts)
        and os.path.isfile(os.path.join(input_dir, f))
    )

    if seed is None:
        random.shuffle(images)
    else:
        random.Random(seed).shuffle(images)

    os.makedirs(os.path.join(output_dir, "train"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "val"), exist_ok=True)

    n_train = int(len(images) * train_ratio)
    train_imgs = images[:n_train]
    val_imgs = images[n_train:]

    # Same call signature for both modes, so pick the operation once.
    transfer = shutil.move if move_files else shutil.copy2

    def copy_files(img_list, subset_name):
        """Transfer images and their annotations; return how many lack a .json."""
        subset_dir = os.path.join(output_dir, subset_name)
        missing_json = 0
        for img_name in tqdm(img_list, desc=f"{subset_name}"):
            src_img = os.path.join(input_dir, img_name)
            src_json = os.path.splitext(src_img)[0] + ".json"

            transfer(src_img, os.path.join(subset_dir, img_name))

            # The annotation is optional: transfer it only when present.
            if os.path.exists(src_json):
                transfer(
                    src_json,
                    os.path.join(subset_dir, os.path.basename(src_json)),
                )
            else:
                missing_json += 1
        return missing_json

    n_missing = copy_files(train_imgs, "train") + copy_files(val_imgs, "val")

    print("Split terminé :")
    print(f" - Train : {len(train_imgs)} images")
    print(f" - Val   : {len(val_imgs)} images")
    print(f" - Dossiers créés dans : {output_dir}")
    if n_missing:
        # Surface unannotated images instead of skipping them silently.
        print(f" - Attention : {n_missing} image(s) sans annotation .json")



# Run an 80/20 split on the configured folders, copying (not moving) the files.
split_labelme_dataset(inputDir, datasetFolderPath, train_ratio=0.8, move_files=False)


train: 100%|██████████| 380/380 [00:07<00:00, 49.92it/s]
val: 100%|██████████| 96/96 [00:01<00:00, 48.07it/s]

Split terminé :
 - Train : 380 images
 - Val   : 96 images
 - Dossiers créés dans : C:\Users\faraboli\Desktop\BubbleID\BubbleIDGit\ProjetBubbleID\training\DATASETS\dataset_all_png



