In [1]:
import os
import random
import shutil

In [2]:
def split_dataset(image_dir, label_dir, output_dir, train_ratio=0.8, seed=None):
    """
    Splits the dataset into training and validation sets.

    Every ``.jpg`` file (extension matched case-insensitively) found directly
    in ``image_dir`` is shuffled, divided according to ``train_ratio`` and
    copied, together with the ``.txt`` file of the same stem from
    ``label_dir``, into ``images/train``, ``images/val``, ``labels/train``
    and ``labels/val`` underneath ``output_dir``.

    Args:
        image_dir: Directory containing images.
        label_dir: Directory containing YOLO format label files.
        output_dir: Directory to save the split dataset.
        train_ratio: Ratio of images to use for training (default is 0.8).
            Must lie within [0, 1].
        seed: Optional seed for the shuffle. When given, the same input
            directory always produces the same split. When None (default),
            the module-level ``random`` state is used, as before.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
        FileNotFoundError: If ``image_dir`` does not exist, or if any image
            has no matching label file. The label check happens before
            anything is written, so a failure never leaves a partial split.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    # os.listdir order is arbitrary; sorting first makes a seeded shuffle
    # reproducible across machines and file systems.
    images = sorted(f for f in os.listdir(image_dir) if f.lower().endswith('.jpg'))

    # Fail fast, before creating or copying anything, if labels are missing.
    missing = [
        name for name in images
        if not os.path.isfile(os.path.join(label_dir, _label_name(name)))
    ]
    if missing:
        preview = ', '.join(missing[:5])
        raise FileNotFoundError(
            f"{len(missing)} image(s) have no label file in {label_dir!r}: {preview}"
        )

    # A private generator keeps a seeded run from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(images)

    train_count = int(len(images) * train_ratio)
    subsets = {
        'train': images[:train_count],
        'val': images[train_count:],
    }

    for subset, names in subsets.items():
        image_dest = os.path.join(output_dir, 'images', subset)
        label_dest = os.path.join(output_dir, 'labels', subset)
        # makedirs creates output_dir itself as a parent when needed.
        os.makedirs(image_dest, exist_ok=True)
        os.makedirs(label_dest, exist_ok=True)
        _copy_pairs(names, image_dir, label_dir, image_dest, label_dest)


def _label_name(image_name):
    """Return the label file name for an image: same stem, ``.txt`` extension."""
    # splitext only touches the final extension, unlike str.replace which
    # would also rewrite a '.jpg' occurring in the middle of the name.
    return os.path.splitext(image_name)[0] + '.txt'


def _copy_pairs(image_names, image_dir, label_dir, image_dest, label_dest):
    """Copy each image and its label file into the given destination directories."""
    for image_name in image_names:
        shutil.copy(os.path.join(image_dir, image_name), image_dest)
        shutil.copy(os.path.join(label_dir, _label_name(image_name)), label_dest)




In [3]:
# Build the train/val split: each image in `image_dir` is copied into the
# output tree together with its label .txt file from `label_dir`, ready for
# model training.
image_dir, label_dir, output_dir = 'data/', 'yolo_labels/', 'dataset'
split_dataset(image_dir=image_dir, label_dir=label_dir, output_dir=output_dir)