# Build Dataset Structure
Copies all images from the different source dataset folders into a single dataset structure with _train_, _test_ and _validation_ subfolders, split according to a given ratio (80/10/10 by default). See the example below.  

Image naming convention:  ```{shape}_{dataset}_{set}_{id}.{extension}```

In [None]:
# Source Datasets -------------------------------------------------------------
'''
[d1]
    [rock]
        name1.ext, name2.ext, ...
    [paper]
        name1.ext, name2.ext, ...
    [scissors]
        name1.ext, name2.ext, ...

[d2]
    [rock]
        name1.ext, name2.ext, ...
    [paper]
        name1.ext, name2.ext, ...
    [scissors]
        name1.ext, name2.ext, ...
'''

# Target Structure ------------------------------------------------------------
'''

[train]
    [rock]
        rock_d1_train_1.ext, rock_d2_train_2.ext, ...
    [paper]
        paper_d2_train_1.ext, paper_d1_train_2.ext, ...
    [scissors]
        scissors_d1_train_1.ext, scissors_d2_train_2.ext, ...

[test]
    [rock]
        rock_d1_test_1.ext, rock_d1_test_2.ext, ...
    [paper]
        paper_d2_test_1.ext, paper_d2_test_2.ext, ...
    [scissors]
        scissors_d1_test_1.ext, scissors_d2_test_2.ext, ...

[validation]
    [rock]
        rock_d2_validation_1.ext, rock_d1_validation_2.ext, ...
    [paper]
        paper_d2_validation_1.ext, paper_d2_validation_2.ext, ...
    [scissors]
        scissors_d1_validation_1.ext, scissors_d1_validation_2.ext, ...
'''

In [None]:
import os
import shutil
import random

root_path = 'dataset'
dataset_paths = []  # dataset folders to use, all if empty
split_ratio = [0.8, 0.1, 0.1]  # train / test / validation ratios
classes = ['rock', 'paper', 'scissors']
steps = ['train', 'test', 'validation']
img_ext = ['jpg', 'jpeg', 'png']

# Running ids used in the new image names (to avoid overwriting) and the
# per-step totals. Ids start at 0 and grow by one per copied image, so after a
# successful run each id equals the matching image count.
train_id = 0
test_id = 0
validation_id = 0
train_img_count = 0
test_img_count = 0
validation_img_count = 0


def _discover_datasets(root, step_names):
    """Return the sub-directories of ``root`` that are source datasets.

    Folders named after a split step (e.g. ``train``) are skipped so that a
    previously generated split is never picked up as a source dataset.
    """
    return [
        d for d in os.listdir(root)
        if d not in step_names and os.path.isdir(os.path.join(root, d))
    ]


def _has_image_ext(filename, extensions):
    """Return True when ``filename`` has a real extension listed in ``extensions``.

    ``os.path.splitext`` is used so that extension-less names such as ``png``
    and dotfiles such as ``.png`` are not mistaken for images.
    """
    return os.path.splitext(filename)[1][1:].lower() in extensions


def _split_images(images, ratios, n_parts):
    """Randomly partition ``images`` into ``n_parts`` disjoint lists.

    Every part but the last receives ``int(len(images) * ratio)`` images; the
    last part receives whatever remains, so its ratio entry is not read.
    """
    # One shuffle followed by slicing yields disjoint random parts without the
    # repeated list-membership scans of a sample-then-filter approach.
    shuffled = random.sample(images, len(images))
    parts = []
    start = 0
    for ratio in ratios[:max(n_parts - 1, 0)]:
        end = start + int(len(shuffled) * ratio)
        parts.append(shuffled[start:end])
        start = end
    parts.append(shuffled[start:])
    return parts


def build_dataset_structure(root, datasets, ratios, class_names, step_names, extensions):
    """Copy the images of every dataset into ``root/<step>/<class>`` folders.

    Args:
        root: Folder holding the source datasets; the step folders are created
            inside it.
        datasets: Names of the dataset folders to use; all sub-directories of
            ``root`` (except step folders) when empty.
        ratios: Share of images given to each step, in ``step_names`` order.
            The last step always receives the remainder.
        class_names: Class sub-folders expected inside every dataset.
        step_names: Names of the target splits, e.g. train / test / validation.
        extensions: Lower-case file extensions (without dot) treated as images.

    Returns:
        A dict mapping each step name to the number of images copied into it.

    Raises:
        ValueError: If ``ratios`` has too few entries for ``step_names``.
        OSError: If a source folder is missing, a target folder already
            exists, or a copy fails.
    """
    if len(ratios) < len(step_names) - 1:
        raise ValueError('split ratio needs an entry for every step but the last')

    datasets = list(datasets) or _discover_datasets(root, step_names)

    # Build the target structure. An existing folder is deliberately an error:
    # copying a new random split over an old one could leave stale files behind.
    for step in step_names:
        for c in class_names:
            os.makedirs(os.path.join(root, step, c))

    # The per-step counter doubles as the id embedded in the target file name;
    # it keeps growing across datasets and classes so names never collide.
    counts = dict.fromkeys(step_names, 0)

    for d in datasets:
        for c in class_names:
            source_dir = os.path.join(root, d, c)
            images = [f for f in os.listdir(source_dir) if _has_image_ext(f, extensions)]
            parts = _split_images(images, ratios, len(step_names))

            for step, part in zip(step_names, parts):
                target_dir = os.path.join(root, step, c)
                for img in part:
                    source_filepath = os.path.join(source_dir, img)
                    ext = os.path.splitext(img)[1]  # keeps the leading dot
                    target_filepath = os.path.join(target_dir, f'{c}_{d}_{step}_{counts[step]}{ext}')
                    shutil.copy(source_filepath, target_filepath)
                    # single-line progress display, overwritten on each copy
                    to_print = f'{source_filepath} -> {target_filepath}'
                    print(f'{to_print:<200}', end='\r', flush=True)
                    counts[step] += 1

    return counts


try:
    # list all directories in root_path if dataset_paths is empty
    if not dataset_paths:
        dataset_paths = _discover_datasets(root_path, steps)

    print(f'Loading {len(dataset_paths)} datasets: ', dataset_paths)

    _counts = build_dataset_structure(root_path, dataset_paths, split_ratio, classes, steps, img_ext)
except OSError as e:
    print('\nError creating dataset structure: ', e)
else:
    # Only report success (and publish the totals) when nothing went wrong.
    train_id = train_img_count = _counts.get('train', 0)
    test_id = test_img_count = _counts.get('test', 0)
    validation_id = validation_img_count = _counts.get('validation', 0)

    print(f'\nImages copied successfully using split ratio {split_ratio}.')
    print(f'Train: {train_img_count} | Test: {test_img_count} | Validation: {validation_img_count} | Total: {train_img_count + test_img_count + validation_img_count}')
