In [None]:
import os
from pathlib import Path
import shutil
import numpy as np
from numpy import random as npr
import pandas as pd
from IPython.display import display

In [None]:
# Hyperparameters
# Root of the original dataset; each immediate sub-directory is treated as one
# model type (see `type_dirs` below).
data_path_str = '../../data/original_data_set'
# Root under which the split dataset is written.
new_path_str = '../../data/split_set'
# (split name, proportion) pairs.
splits = [
    ('train', .8),
    ('val', .1),
    ('test', .1)
]

# Seed NumPy's global RNG so every permutation drawn afterwards is repeatable.
npr.seed(523324)

# build paths
save_path = Path(new_path_str).expanduser()
data_path = Path(data_path_str).expanduser()

# `iterdir()` yields entries in arbitrary, filesystem-dependent order. The
# permutations are drawn type by type from one shared RNG stream, so the order
# of this list decides which permutation each type receives. Sorting pins that
# order, so the seeded split is the same on every machine and every re-run.
type_dirs = sorted(f for f in data_path.iterdir() if f.is_dir())

# determine cumulative splits and validate split sizes
total = 0.
splits_total = []  # one (split name, cumulative start, cumulative stop) per split

for split_name, prop in splits:
    splits_total.append((split_name, total, total + prop))
    total += prop

# Validate the running total rather than `splits_total[-1]`, so an empty
# `splits` list fails with this message instead of an IndexError.
assert abs(total - 1.) < 1e-10, 'Invalid split (does not sum to 1)'

# Float accumulation can leave the last bound a hair below 1 (e.g. .7 + .2 + .1
# accumulates to 0.9999999999999999). Truncating `n * stop` with such a bound
# would leave the final model of a type out of every split, so pin the last
# bound to exactly 1 now that the sum has been validated.
last_name, last_start, _ = splits_total[-1]
splits_total[-1] = (last_name, last_start, 1.)

# randomly split all models
# split name -> list of source model directories assigned to that split
split_mdirs = {split_name: [] for split_name, _ in splits}

def process_type(model_dirs, rand_idx, split_name):
    """
    Assign the models selected by `rand_idx` to the split `split_name`.

    Each index is resolved to the single entry of `model_dirs` whose string
    form ends in `_<index>`. That entry is appended to the module-level
    `split_mdirs[split_name]` list, following the order of `rand_idx`.

    Raises AssertionError unless exactly one entry matches an index.
    """
    for model_idx in rand_idx:
        tail = f'_{model_idx}'
        candidates = list(filter(lambda entry: str(entry).endswith(tail), model_dirs))
        n_found = len(candidates)

        assert n_found == 1, f'Invalid source model name format, {n_found} matches found'

        split_mdirs[split_name].append(candidates[0])
        
# iterate over each model type
for type_dir in type_dirs:
    # get all model dirs for type and randomly partition into splits
    model_dirs = [ f for f in type_dir.iterdir() if f.is_dir() ]
    rand_idx = npr.permutation(len(model_dirs))
    n_models = len(rand_idx)
    n_assigned = 0

    for split_name, start, stop in splits_total:
        # The cumulative bounds are float sums, so `n_models * bound` can land
        # just below the intended integer (e.g. 9.999999999999998 rather than
        # 10). Nudge by a tiny epsilon before truncating so such products still
        # floor to the intended index and no model is silently skipped.
        lo = int(n_models * start + 1e-9)
        hi = int(n_models * stop + 1e-9)
        chosen = rand_idx[lo:hi]
        process_type(model_dirs, chosen, split_name)
        n_assigned += len(chosen)

    # every model of this type must have landed in exactly one split
    assert n_assigned == n_models, (
        f'{type_dir}: assigned {n_assigned} of {n_models} models; '
        'check that the split bounds cover the whole range'
    )

# Randomly reorder models in each split, copy the model to the new directory, and
#    record the mapping
mapping = []    # (split name, destination index, source path) rows for the CSV
copy_jobs = []  # (source dir, destination dir) pairs, executed after validation

for split_name, model_dirs in split_mdirs.items():
    rand_idx = npr.permutation(len(model_dirs))

    for dst_idx, src_idx in enumerate(rand_idx):
        src_path = model_dirs[src_idx]
        dst_path = save_path / split_name / f'model_{dst_idx}'
        copy_jobs.append((src_path, dst_path))
        # record only the last two path components of the source directory
        mapping.append((split_name, dst_idx, os.path.join(*src_path.parts[-2:])))

# `shutil.copytree` raises FileExistsError only when it reaches a destination
# that already exists, so running over stale or partial output could copy some
# models before aborting. Check every destination before touching the disk.
existing = [dst for _, dst in copy_jobs if dst.exists()]
if existing:
    raise FileExistsError(
        f'{len(existing)} destination directories already exist under {save_path} '
        f'(first: {existing[0]}); remove the previous split before re-running'
    )

for src_path, dst_path in copy_jobs:
    shutil.copytree(src_path, dst_path)

# save the mapping to a CSV
df = pd.DataFrame(mapping, columns=['Split Name', 'Index', 'Source File'])
df.to_csv('mapping.csv', index=False)

In [None]:
# Initial Dataset Validation
# Reports, per split, the share of all mapping rows and the number of distinct
# source files, then checks that no source file occurs in more than one row.

rows_by_split = df.groupby('Split Name')

print('Proportion by Split:')
split_share = rows_by_split['Index'].count() / len(df)
display(split_share)

print('\nUnique Source Files by Split:')
sources_per_split = rows_by_split['Source File'].nunique()
display(sources_per_split)

# fails if any source path is listed more than once across all splits
has_repeated_source = df['Source File'].duplicated().any()
assert not has_repeated_source, 'Every source file is not unique'