In [1]:
# Execute the configuration notebook inside this kernel. Its printed summary
# (below) lists what it brings into scope: the pathlib / pandas (as pd) / shutil
# imports, the path and split constants, and the read/write, *_get_folder_name,
# *_can_include, and read_process_write helpers used by the later cells.
%run ./configure.ipynb


IMPORTS:
import pathlib
import pandas as pd
import shutil


EXPOSED CONSTANTS:
DATA_FILE_PATH:			data.xlsx
IMAGES_FOLDER_PATH:		images
ALLOWED_IMAGE_EXTENSIONS:	{'.PNG', '.png'}
SPLIT:				0.1
SPLITS_FOLDER_PATH:		splits
TEST_SUB_FOLDERS:		['good', 'bad']
VAL_SUB_FOLDERS:		['good', 'bad']
TRAIN_SUB_FOLDERS:		[]


EXPOSED FUNCTIONS:
read_data()
write_data(df)
test_get_folder_name(row)
val_get_folder_name(row)
train_get_folder_name(row)
test_can_include(row)
val_can_include(row)
train_can_include(row)
read_process_write(read_path, write_path)



In [2]:
import sklearn.utils
import math
import itertools

In [3]:
# Rebuild the splits directory tree from scratch.
#
# Any splits directory left over from an earlier run is removed first, so the
# tree created below only ever holds the results of the current run.
if SPLITS_FOLDER_PATH.is_dir():
    print('Deleting current splits directory.')
    shutil.rmtree(SPLITS_FOLDER_PATH)
print(f'Creating directory `{SPLITS_FOLDER_PATH.as_posix()}`')
SPLITS_FOLDER_PATH.mkdir()

# One directory per split, directly under the splits root (test, val, train).
test_dir_path = pathlib.Path(SPLITS_FOLDER_PATH, 'test')
val_dir_path = pathlib.Path(SPLITS_FOLDER_PATH, 'val')
train_dir_path = pathlib.Path(SPLITS_FOLDER_PATH, 'train')
dir_paths = [test_dir_path, val_dir_path, train_dir_path]
for split_dir in dir_paths:
    print(f'Creating directory `{split_dir.as_posix()}`')
    split_dir.mkdir()

# Configured subdirectories inside each split, visited in train, val, test order.
# A split whose sub-folder list is empty simply gets no subdirectories.
sub_folders_by_split = [
    (train_dir_path, TRAIN_SUB_FOLDERS),
    (val_dir_path, VAL_SUB_FOLDERS),
    (test_dir_path, TEST_SUB_FOLDERS),
]
for parent_dir, sub_folders in sub_folders_by_split:
    for sub_folder in sub_folders:
        sub_dir_path = pathlib.Path(parent_dir, sub_folder)
        print(f'Creating directory `{sub_dir_path.as_posix()}`')
        sub_dir_path.mkdir()

Deleting current splits directory.
Creating directory `splits`
Creating directory `splits/test`
Creating directory `splits/val`
Creating directory `splits/train`
Creating directory `splits/val/good`
Creating directory `splits/val/bad`
Creating directory `splits/test/good`
Creating directory `splits/test/bad`


In [4]:
# Load the labelled image table through the helper exposed by the configuration
# notebook, then preview its first five rows.
# NOTE(review): the preview shows an `Unnamed: 0` column and the same file name
# (Capture01.PNG) on two rows — confirm that repeated rows are intended, since
# each row is assigned to a split independently.
df = read_data()
df.head(5)

Unnamed: 0,id,file,label,bacteria level
0,1,Capture01.PNG,anomalous,low
1,2,Capture02.PNG,anomalous,low
2,3,Capture03.PNG,normal,none
3,4,Capture04.PNG,normal,none
4,1,Capture01.PNG,anomalous,low


In [5]:
# Assign every row of `df` to the train, val, or test bucket.
#
# Each row gets a "signature": a (train, val, test) tuple of booleans built from
# the three *_can_include predicates. Rows that share a signature are shuffled
# with a fixed seed and divided among the splits that signature allows.
#
# Rows are identified by their 0-based position in `df`, not by their index
# label, because the copy step selects them with `df.iloc`. With a default
# 0..n-1 index the two coincide; with any other index only positions are safe.
signatures = [
    (position, (train_can_include(row), val_can_include(row), test_can_include(row)))
    for position, (_, row) in enumerate(df.iterrows())
]


def indices_for(signature):
    """Return the row positions whose (train, val, test) signature equals `signature`."""
    return [position for position, sig in signatures if sig == signature]


train_indices = indices_for((True, False, False))
val_indices = indices_for((False, True, False))
test_indices = indices_for((False, False, True))
train_val_indices = indices_for((True, True, False))
train_test_indices = indices_for((True, False, True))
val_test_indices = indices_for((False, True, True))
train_val_test_indices = indices_for((True, True, True))
# Rows whose signature is (False, False, False) match none of the groups above
# and are therefore left out of every split.
train = []
val = []
test = []
# Divide the indices into the train, val, and test buckets
for fractions, buckets, indices in [
    ([1], [train], train_indices),
    ([1], [val], val_indices),
    ([1], [test], test_indices),
    ([1 - SPLIT, SPLIT], [train, val], train_val_indices),
    ([1 - SPLIT, SPLIT], [train, test], train_test_indices),
    ([0.5, 0.5], [val, test], val_test_indices),
    ([1 - 2*SPLIT, SPLIT, SPLIT], [train, val, test], train_val_test_indices)]:
    # Round each share down to a whole number of rows.
    amounts = [math.floor(len(indices) * f) for f in fractions]
    # A negative amount would be interpreted as a from-the-end slice below and
    # silently scramble the allocation, so reject it outright.
    if min(amounts) < 0:
        raise ValueError(
            f'SPLIT={SPLIT!r} produces a negative share for fractions {fractions}.')
    # Rows lost to rounding go to the last bucket so that none are dropped.
    if sum(amounts) < len(indices):
        amounts[-1] += len(indices) - sum(amounts)
    # Fixed seed: the same rows land in the same buckets on every run.
    shuffled = sklearn.utils.shuffle(indices, random_state=42)
    allocation = []
    for amount in amounts:
        # Take the next `amount` rows off the front of the shuffled list.
        allocation.append(shuffled[:amount])
        shuffled = shuffled[amount:]
    for allocated, bucket in zip(allocation, buckets):
        bucket.extend(allocated)
# Preview the first ten row positions placed in each bucket.
train[:10], val[:10], test[:10]

([2, 19, 14, 3, 21, 11, 26, 17, 7, 10],
 [0, 1, 9, 13, 16],
 [4, 8, 5, 20, 25, 12])

In [6]:
# Write every selected image into the directory of the split it was assigned to.
# Each entry pairs a bucket of row positions with that split's directory and the
# helper that names the subfolder for a given row.
split_plan = [
    (train, train_dir_path, train_get_folder_name),
    (val, val_dir_path, val_get_folder_name),
    (test, test_dir_path, test_get_folder_name),
]
for indices, dir_path, get_sub_dir in split_plan:
    # The buckets are looked up positionally.
    selected_rows = df.iloc[indices]
    for _, row in selected_rows.iterrows():
        file_name = row['file']
        source_image_path = pathlib.Path(IMAGES_FOLDER_PATH, file_name)
        # The output keeps the original file name.
        # NOTE(review): rows that share a file name and subfolder within one
        # split target the same output path — confirm this is acceptable.
        target_image_path = pathlib.Path(dir_path, get_sub_dir(row), file_name)
        read_process_write(source_image_path, target_image_path)
print('Done.')

Done.
