# Dataset Splitter

Use this to randomly partition your pre-labelled source images into Training, Validation and Test folders.

1. Ensure your images are in the 'SOURCE' folder as explained below.
2. Use the configurable variables below to choose the ratios used to split your data and to limit how many images are selected from the larger classes.
3. Run the script and use the exported directories in your AutoML application of choice.

In [None]:
# Variables

# All images should be in the source directory
# (<current working directory>/MAIN/SOURCE with the names below),
# split into sub-directories based on their class

image_dir_name = "MAIN"
source_dir_name = "SOURCE"
train_dir_name = "TRAIN"
validation_dir_name = "VALIDATION"
test_dir_name = "TEST"
# Percentage of each class's selected images that goes to the validation
# and test sets; whatever remains becomes the training set
validation_percent = 0
test_percent = 10

# Set how many times greater the larger dataset sample size
# should be compared to the smallest data set
# (a falsy value such as 0 or None disables the cap)
set_multiple_limit = 1.5 

In [None]:
import os, errno
import sys
import math
import random
from shutil import copyfile

In [None]:
# A helper function for creating directories that don't exist

def checkAndCreateDirectories(directories):
    """Create every path in *directories* that does not exist yet.

    Missing parent directories are created as well. Paths that already
    exist are left untouched.

    Args:
        directories: Iterable of directory paths to ensure.

    Raises:
        OSError: If a directory cannot be created for any reason other
            than it already existing.
    """
    for path in directories:
        if os.path.exists(path):
            continue
        try:
            os.makedirs(path)
        except OSError as err:
            # Something else may have created the directory between the
            # existence check and makedirs; that outcome is fine.
            if err.errno != errno.EEXIST:
                raise

In [None]:
# Build the absolute paths of the main image directory and its sub-directories

cwd = os.getcwd()
image_dir = os.path.join(cwd, image_dir_name)

# The source, train, validation and test directories all sit directly
# under the main image directory
source_dir, train_dir, validation_dir, test_dir = [
    os.path.join(image_dir, sub_dir_name)
    for sub_dir_name in (
        source_dir_name,
        train_dir_name,
        validation_dir_name,
        test_dir_name,
    )
]

In [None]:
# Create the main image directory and each of its sub-directories if missing

checkAndCreateDirectories([
    image_dir,
    source_dir,
    train_dir,
    validation_dir,
    test_dir,
])

In [None]:
# Calculate number of images needed from each class
#
# 1. Count the usable images in every class sub-directory of the source dir.
# 2. Optionally cap the larger classes at `set_multiple_limit` times the size
#    of the smallest non-empty class.
# 3. Split each class's (possibly capped) total into train / validation /
#    test sample sizes according to the configured percentages.

if validation_percent < 0 or test_percent < 0 or validation_percent + test_percent > 100:
    raise ValueError(
        "validation_percent and test_percent must be non-negative "
        "and sum to at most 100"
    )

class_index = 0
num_images_by_class = {}
upper_limit = 0
classLabels = []

for className in os.listdir(source_dir):
    classDir = os.path.join(source_dir, className)
    if not os.path.isdir(classDir) or className.startswith("."):
        # Skip stray files and hidden directories
        continue

    classLabels.append(className)

    # Count regular files only. Hidden files (e.g. ".DS_Store") are
    # recognised by the *image* name, not the class name, so that they
    # do not inflate the class total.
    num_images_by_class[className] = sum(
        1
        for imageName in os.listdir(classDir)
        if not imageName.startswith(".")
        and os.path.isfile(os.path.join(classDir, imageName))
    )

# Cap upper limit or set multiple limit
if set_multiple_limit:
    # upper_limit is the size of the smallest class that actually has images.
    # Empty classes are ignored here; otherwise a single empty class would
    # cap every other class at zero images.
    upper_limit = min(
        (count for count in num_images_by_class.values() if count > 0),
        default=0,
    )
    target_multiple = int(round(upper_limit * set_multiple_limit))

    for className, count in num_images_by_class.items():
        if count > upper_limit and count > target_multiple:
            num_images_by_class[className] = target_multiple

num_train_images_by_class = {}
num_validation_images_by_class = {}
num_test_images_by_class = {}
for className, total in num_images_by_class.items():

    validation_sample_num = round((validation_percent * 0.01) * total)
    # Both shares are rounded independently, so together they could exceed
    # the class total (e.g. 50% / 50% of 3 images rounds to 2 + 2); clamp the
    # test share so the training size never goes negative.
    test_sample_num = min(
        round((test_percent * 0.01) * total),
        total - validation_sample_num,
    )
    train_sample_num = total - validation_sample_num - test_sample_num

    num_train_images_by_class[className] = train_sample_num
    num_validation_images_by_class[className] = validation_sample_num
    num_test_images_by_class[className] = test_sample_num

    print(f"{className}\nTotal images: {total}")
    print(f"Training sample size: {train_sample_num}")
    print(f"Validation sample size: {validation_sample_num}")
    print(f"Test sample size: {test_sample_num}\n")
    
    

In [None]:
# Copy images to their corresponding directories
#
# For every class, the image list is shuffled and then consumed in order:
# the first images go to the training directory, the next ones to the
# validation directory, then the test directory. Images beyond the class's
# total quota are left out entirely.
#
# NOTE: files already present in the output directories are not removed, so
# re-running this cell with a fresh shuffle can leave the same image in more
# than one split. Clear the output directories before re-running.

numImagesCopiedTotal = 0
for className in os.listdir(source_dir):
    classDir = os.path.join(source_dir, className)
    if not os.path.isdir(classDir) or className.startswith("."):
        # Skip stray files and hidden directories
        continue

    classIndex = 0
    numTrain = 0
    numValid = 0
    numTest = 0

    # Cumulative positions in the shuffled list at which each split ends
    trainThreshold = num_train_images_by_class[className]
    validationThreshold = trainThreshold + num_validation_images_by_class[className]
    testThreshold = validationThreshold + num_test_images_by_class[className]

    # get list of images and randomise their order
    imageList = os.listdir(classDir)
    random.shuffle(imageList)

    for imageName in imageList:
        imageDir = os.path.join(classDir, imageName)

        # Ignore sub-directories and hidden files such as ".DS_Store"
        if imageName.startswith(".") or not os.path.isfile(imageDir):
            continue

        if classIndex < trainThreshold:
            # copy to train directory
            destinationDir = os.path.join(train_dir, className)
            numTrain += 1
        elif classIndex < validationThreshold:
            # copy to validation directory
            destinationDir = os.path.join(validation_dir, className)
            numValid += 1
        elif classIndex < testThreshold:
            # copy to test directory
            destinationDir = os.path.join(test_dir, className)
            numTest += 1
        else:
            # Quota for this class is used up. Without this branch the
            # destination from the previous iteration would be reused and
            # the surplus image would overwrite an already copied file.
            break

        destinationFile = os.path.join(destinationDir, imageName)
        checkAndCreateDirectories([destinationDir])
        copyfile(imageDir, destinationFile)
        classIndex += 1
        numImagesCopiedTotal += 1

    print(f"{className} processed.")

print(f"Finished! A total of {numImagesCopiedTotal} images copied.")