# Build Dataset Structure
Copy all images from the different source dataset folders into a single dataset structure with train, test and validation subfolders, split according to a given ratio.

In [None]:
# Source Datasets -------------------------------------------------------------
# Expected input layout: one folder per source dataset, each containing one
# subfolder per class with that class's images inside.
# The bare string below is an illustration only; it is never assigned or read.
'''
[d1]
    [rock]
        image1, image2, ..., imageN
    [paper]
        image1, image2, ..., imageN
    [scissors]
        image1, image2, ..., imageN

[d2]
    [rock]
        image1, image2, ..., imageN
    [paper]
        image1, image2, ..., imageN
    [scissors]
        image1, image2, ..., imageN
'''

# Target Structure ------------------------------------------------------------
# Output layout: one folder per split (train / test / validation), each
# containing one subfolder per class that gathers the images taken from every
# source dataset.
# The bare string below is an illustration only; it is never assigned or read.
'''
[train]
    [rock]
        image1, image2, ..., imageN
    [paper]
        image1, image2, ..., imageN
    [scissors]
        image1, image2, ..., imageN

[test]
    [rock]
        image1, image2, ..., imageN
    [paper]
        image1, image2, ..., imageN
    [scissors]
        image1, image2, ..., imageN

[validation]
    [rock]
        image1, image2, ..., imageN
    [paper]
        image1, image2, ..., imageN
    [scissors]
        image1, image2, ..., imageN
'''

In [None]:
import os
import shutil
import random

# Configuration ---------------------------------------------------------------
root_path = 'dataset'
dataset_paths = ['d1', 'd2', 'd3']
classes = ['rock', 'paper', 'scissors']
steps = ['train', 'test', 'validation']
split_ratio = [0.6, 0.2, 0.2]  # relative split ratio to dispatch train, test and validation images
img_ext = ['jpg', 'jpeg', 'png']


def list_images(source_dir, extensions):
    """Return the sorted names of the image files directly inside *source_dir*.

    A file is kept when its extension (compared case-insensitively, without
    the leading dot) is one of *extensions*. Sub-directories are ignored, and
    the result is sorted so that a seeded split is reproducible regardless of
    the order in which the operating system lists the folder.

    Raises:
        OSError: if *source_dir* cannot be listed.
    """
    allowed = {e.lower().lstrip('.') for e in extensions}
    return sorted(
        name for name in os.listdir(source_dir)
        if os.path.splitext(name)[1][1:].lower() in allowed
        and os.path.isfile(os.path.join(source_dir, name))
    )


def split_images(img_list, ratios, rng=random):
    """Randomly partition *img_list* into disjoint lists, one per ratio.

    Every part except the last receives ``int(len(img_list) * ratio)`` items;
    the last part receives everything that is left, so no image is dropped
    because of rounding.

    Args:
        img_list: the items to split (left unmodified).
        ratios: relative share of each part, e.g. ``[0.6, 0.2, 0.2]``.
        rng: object providing ``shuffle`` (the ``random`` module by default,
            or a ``random.Random`` instance for a reproducible split).

    Returns:
        A list of lists, in the same order as *ratios*.
    """
    shuffled = list(img_list)
    rng.shuffle(shuffled)
    total = len(shuffled)

    parts = []
    start = 0
    for ratio in ratios[:-1]:
        end = start + int(total * ratio)
        parts.append(shuffled[start:end])
        start = end
    parts.append(shuffled[start:])  # remainder goes to the last part
    return parts


def build_dataset(root, datasets, class_names, ratios,
                  step_names=('train', 'test', 'validation'),
                  extensions=('jpg', 'jpeg', 'png'), seed=None):
    """Copy the images of every source dataset into ``root/<step>/<class>``.

    For each ``root/<dataset>/<class>`` folder the images are randomly split
    according to *ratios* and copied (the sources are left untouched) to the
    matching step folder as ``<class>_<dataset>_<step>_<id><ext>``. The id is
    a per-step counter shared by all datasets and classes, so files coming
    from different sources never overwrite each other.

    Args:
        root: folder holding the source datasets; the step folders are
            created inside it.
        datasets: names of the source dataset folders.
        class_names: names of the class subfolders.
        ratios: relative share of each step, in the order of *step_names*.
        step_names: names of the target split folders.
        extensions: accepted image extensions (without the dot).
        seed: optional seed for a reproducible split; when ``None`` the
            global ``random`` module state is used.

    Returns:
        The number of images copied.

    Raises:
        ValueError: if *step_names* and *ratios* differ in length.
        OSError: if a folder cannot be created/listed or a copy fails.
    """
    if len(step_names) != len(ratios):
        raise ValueError('step_names and ratios must have the same length')

    rng = random if seed is None else random.Random(seed)

    # build structure; exist_ok lets the cell be re-run on an existing tree
    for step in step_names:
        for c in class_names:
            os.makedirs(os.path.join(root, step, c), exist_ok=True)

    # per-step ids used in the new image names, to avoid overwriting
    next_id = dict.fromkeys(step_names, 0)
    copied = 0

    for d in datasets:
        for c in class_names:
            source_dir = os.path.join(root, d, c)
            if not os.path.isdir(source_dir):
                # a missing source must not abort the datasets that do exist
                print(f'Skipping missing source directory: {source_dir}')
                continue

            # one random, disjoint list of images per step
            parts = split_images(list_images(source_dir, extensions), ratios, rng)

            for step, images in zip(step_names, parts):
                target_dir = os.path.join(root, step, c)
                for img in images:
                    source_filepath = os.path.join(source_dir, img)
                    ext = os.path.splitext(img)[1]  # keeps the dot and original case
                    target_filepath = os.path.join(target_dir, f'{c}_{d}_{step}_{next_id[step]}{ext}')
                    shutil.copy(source_filepath, target_filepath)
                    print(f'{source_filepath} -> {target_filepath}')
                    next_id[step] += 1
                    copied += 1

    return copied


# The guard is true inside a notebook kernel, so the cell still runs as before;
# it only prevents the copy from starting when this code is imported.
if __name__ == '__main__':
    img_count = 0
    try:
        img_count = build_dataset(root_path, dataset_paths, classes, split_ratio, steps, img_ext)
    except OSError as e:
        print('Error creating dataset structure: ', e)
    else:
        # only report success when the whole run completed without error
        print(f'Done! {img_count} copied successfully.')
