# split_data.ipynb

This notebook splits the raw data in a folder containing images (\*.jpg) and labels (\*.txt) into training, validation and test sets, arranged in the folder structure expected by YOLO (see https://docs.ultralytics.com/datasets/detect/#ultralytics-yolo-format).

Note that images which do not have annotated objects are not included in the dataset. In other words, only images (\*.jpg) with a corresponding labels file (\*.txt) are included.

In [2]:
import os
import glob
import random
import shutil

In [6]:
# Source folder holding the raw images (*.jpg) together with their labels (*.txt)
RAWDATA = '../rawdata'

# Root folder of the dataset that the cells below populate
DATASET = '../datasets/Guam07v1'

In [7]:
# Create the destination folders: images/ and labels/, each with train, valid
# and test subfolders.
# exist_ok=True keeps this cell re-runnable; without it os.makedirs raises
# FileExistsError as soon as the folders exist from an earlier run.
for split_name in ('train', 'valid', 'test'):
    for content_dir in ('images', 'labels'):
        os.makedirs(f'{DATASET}/{content_dir}/{split_name}', exist_ok=True)

In [9]:
# Collect label files in sorted order. glob returns matches in arbitrary,
# filesystem-dependent order, so without sorting the seeded shuffle below
# would not yield the same split on every machine.
all_labels = sorted(glob.glob(f'{RAWDATA}/*.txt'))

# Keep only labels that have a matching image next to them. A stray text file
# in RAWDATA (e.g. a class-name list) would otherwise abort the copy loop
# part-way through with FileNotFoundError and distort the split sizes.
# os.path.splitext swaps only the final extension, whereas
# str.replace('.txt', '.jpg') would rewrite every '.txt' found in the path.
label_list = []
skipped_labels = []
for label_path in all_labels:
    if os.path.isfile(os.path.splitext(label_path)[0] + '.jpg'):
        label_list.append(label_path)
    else:
        skipped_labels.append(label_path)

if skipped_labels:
    print(f'WARNING: skipped {len(skipped_labels)} label file(s) without a matching image:')
    for label_path in skipped_labels:
        print(f'  {label_path}')

# Determine the number of images for each set (70% / 15% / 15%).
# The test set receives whatever remains after train and valid, so the three
# sizes always add up to the total number of label/image pairs.
train_size = int(len(label_list) * 0.7)
val_size = int(len(label_list) * 0.15)
test_size = len(label_list) - train_size - val_size

# Shuffle the label list with a fixed seed so the split is reproducible
random.seed(42)
random.shuffle(label_list)

# Copy each label and its image into the folder of the set it falls in
for i, label_source in enumerate(label_list):
    if i < train_size:
        dest_folder = 'train'
    elif i < train_size + val_size:
        dest_folder = 'valid'
    else:
        dest_folder = 'test'

    image_source = os.path.splitext(label_source)[0] + '.jpg'
    label_dest = f'{DATASET}/labels/{dest_folder}/{os.path.basename(label_source)}'
    image_dest = f'{DATASET}/images/{dest_folder}/{os.path.basename(image_source)}'

    shutil.copy(label_source, label_dest)
    shutil.copy(image_source, image_dest)

print('FINISHED')

FINISHED
