In [None]:
import numpy as np
import os
import joblib
import math
from PIL import Image

### Create numpy arrays from raw images

In [None]:
# Build the full input/label arrays from the raw image folders.
#
# Each dataset folder is expected to hold (full image, segmented image) file pairs
# that end up adjacent once the directory listing is sorted. Every image is
# converted to 8-bit grayscale and resized to `img_size`; the segmented image is
# turned into a binary mask. The result is dumped to 'datasets/all.pkl'.
datasets = [
    'datasets/abide_imgs',
    'datasets/oasis_imgs'
]

img_size = (128, 128)  # passed to PIL's resize(), i.e. (width, height)
number_of_images_total = 0

# Per-image arrays are collected in lists and stacked once at the end.
# Growing an array with np.append inside the loop copies everything accumulated
# so far on every iteration, which is quadratic in the number of images.
full_images, segmented_masks = [], []

for dataset in datasets:
    print("Reading data for dataset {}".format(dataset))
    total_images_for_dataset = 0
    dataset_folder = os.path.join('.', dataset)
    dataset_files = sorted(os.listdir(dataset_folder))
    dataset_size = len(dataset_files)

    # Files are consumed two at a time; fail early with a clear message instead of
    # an IndexError on the very last pair after all the other images were processed.
    # NOTE(review): any stray file in the folder (hidden/system files) would also
    # shift the pairing - confirm the folders contain image pairs only.
    if dataset_size % 2 != 0:
        raise ValueError(
            "Dataset folder {} holds {} files; expected an even number "
            "(full/segmented pairs)".format(dataset_folder, dataset_size)
        )

    for i in range(0, dataset_size, 2):
        number_of_images_total += 1
        total_images_for_dataset += 1
        full_image = dataset_files[i]
        segmented_image = dataset_files[i+1]
        if 'abide' in dataset_folder:
            # For folders whose path contains 'abide' the pair order is reversed.
            full_image, segmented_image = segmented_image, full_image

        # create np array image of full image (context manager closes the file handle)
        with Image.open(os.path.join(dataset_folder, full_image), 'r') as tiff_image:
            full_images.append(np.array(tiff_image.convert('L').resize(img_size)))

        # create np array image of segmented image
        with Image.open(os.path.join(dataset_folder, segmented_image), 'r') as tiff_image:
            segmented_pixels = np.array(tiff_image.convert('L').resize(img_size))
        # Binary mask: pixels equal to 255 become 0, every other value becomes 1.
        segmented_masks.append((segmented_pixels != 255).astype(np.float64))

        if number_of_images_total % 1000 == 0:
            print("{} / {} processed!".format(total_images_for_dataset, dataset_size // 2))
    print("Dataset {} finished!".format(dataset))

if full_images:
    # Stack to (n_images, rows, cols) and add a trailing channel axis. Adding the
    # axis (rather than reshaping with *img_size) keeps the pixel layout correct
    # even when img_size is not square. float64 matches the dtype previously
    # produced by appending to an empty float array.
    Xall = np.stack(full_images).astype(np.float64)[..., np.newaxis]
    Yall = np.stack(segmented_masks).astype(np.float64)[..., np.newaxis]
else:
    # No images found: keep the 4-D layout with zero samples.
    Xall = np.empty((0, img_size[1], img_size[0], 1))
    Yall = np.empty((0, img_size[1], img_size[0], 1))

print("Generated dataset shapes. input: {} ; output: {}".format(Xall.shape, Yall.shape))

joblib.dump((Xall, Yall), 'datasets/all.pkl')

### Separate train, val and test data and save them on disk

In [None]:
# Split the full dataset into train / validation / test partitions and save them.
#
# The samples are shuffled with a FIXED seed before slicing, so re-running this cell
# on the same 'datasets/all.pkl' reproduces exactly the same test set. Without a seed
# every re-run would overwrite 'datasets/test.pkl' with a different split and
# previously trained models could end up being evaluated on their training images.
# NOTE: joblib.load unpickles the file - only load files produced by this notebook.
Xall, Yall = joblib.load('datasets/all.pkl')
print(Xall.shape)
print(Yall.shape)

if Xall.shape[0] != Yall.shape[0]:
    raise ValueError(
        "Input/label count mismatch: {} vs {}".format(Xall.shape[0], Yall.shape[0])
    )

training_percentage = 0.7
validation_percentage = 0.1
split_seed = 42  # change only if a new, different test set is really wanted

# Row boundaries: [0, training_set_index) is train,
# [training_set_index, validation_set_index) is validation, the rest is test.
training_set_index = math.floor(Xall.shape[0]*training_percentage)
validation_set_index = math.floor(Xall.shape[0]*validation_percentage) + training_set_index

# shuffling before training-validation-test slicing
ids = np.arange(Xall.shape[0])
# RandomState is used because its output stream for a given seed is kept stable
# across NumPy versions, which is what a fixed test set needs.
np.random.RandomState(split_seed).shuffle(ids) # shuffle images to avoid bias in training
Xall, Yall = Xall[ids], Yall[ids]

print(Xall.shape)
print(Yall.shape)

Xte, yte = Xall[validation_set_index:,:], Yall[validation_set_index:] # X and y for testing
# test set is saved on disk. It should NOT be modified. All model evaluations MUST target the same test set.
test_config = {
    'test_percentage': 1 - training_percentage - validation_percentage,
    'split_seed': split_seed
}
joblib.dump((Xte, yte, test_config), 'datasets/test.pkl')

X_remaining, y_remaining = Xall[:validation_set_index,:], Yall[:validation_set_index] # X and y for training and validation
# train and val sets are saved together on disk, along with the boundary indices needed
# to separate them again. They can be loaded later and be shuffled, cross validated, etc.
config = {
            'train_percentage': training_percentage,
            'training_set_index': training_set_index,
            'val_percentage': validation_percentage,
            'validation_set_index': validation_set_index,
            'split_seed': split_seed
         }
joblib.dump((X_remaining, y_remaining, config), 'datasets/train-and-val.pkl')

### Data augmentation

In [None]:
# Restore the combined train+validation arrays together with the split
# boundaries that were stored next to them.
X_remaining, Y_remaining, remaining_dataset_desc = joblib.load('datasets/train-and-val.pkl')
training_set_index, validation_set_index = (
    remaining_dataset_desc['training_set_index'],
    remaining_dataset_desc['validation_set_index'],
)

# Training partition: rows before training_set_index.
Xtr = X_remaining[:training_set_index, :]
ytr = Y_remaining[:training_set_index]

# Validation partition: rows from training_set_index up to validation_set_index.
Xva = X_remaining[training_set_index:validation_set_index, :]
yva = Y_remaining[training_set_index:validation_set_index]

In [None]:
# Augment the training set with translated and rotated copies of every image/mask
# pair, append them after the original samples and save the result to disk.
#
# NOTE: this cell reassigns Xtr / ytr. Re-run the cell that loads
# 'datasets/train-and-val.pkl' before running it again, otherwise the already
# augmented set gets augmented a second time.
augmented_train_dataset_save_file = 'datasets/train-augmented-{}.pkl'

# NOTE(review): range(-1, 1, 1) yields only [-1, 0], so shifts/angles are not
# symmetric around zero. If [-1, 0, 1] was intended use range(-1, 2) - confirm.
txtyrange = range(-1, 1, 1) # translation range for x and y directions
loat = [ (tx, ty) for tx in txtyrange for ty in txtyrange ] # list of accepted translations
loaa = list(range(-1, 1, 1)) # list of accepted rotation angles (degrees)
foia = len(loat) * len(loaa) # factor of image augmentation
print(foia)

total_imgs = Xtr.shape[0]
increment = 0

print(total_imgs*foia)

# Augmented samples are collected in lists and concatenated once at the end.
# Calling np.append(...).reshape(...) per sample copies the entire dataset on
# every iteration, which is quadratic in the number of generated images.
augmented_inputs, augmented_outputs = [], []

for i in range(total_imgs):
    x = Xtr[i]
    y = ytr[i]
    # Drop the trailing channel axis so PIL receives a 2-D image.
    input_array = x.reshape(x.shape[0], x.shape[1])
    output_array = y.reshape(y.shape[0], y.shape[1])

    for (tx, ty) in loat:
        # The same translation is applied to the image and to its mask.
        translated_input = Image.fromarray(input_array).transform(
            (input_array.shape[1], input_array.shape[0]), Image.AFFINE, (1, 0, tx, 0, 1, ty)) # translated full image
        translated_output = Image.fromarray(output_array).transform(
            (output_array.shape[1], output_array.shape[0]), Image.AFFINE, (1, 0, tx, 0, 1, ty)) # translated segmented image

        for a in loaa:
            increment += 1

            if increment % 1000 == 0:
                print("Processed {}/{}".format(increment, total_imgs*foia))

            # Always rotate the *translated* image. Reassigning the image inside
            # this loop would compound rotations across angles (the second angle
            # would be applied on top of the first).
            rotated_input = translated_input.rotate(a, resample=Image.BICUBIC)
            augmented_inputs.append(np.array(rotated_input)) # array with pixel values

            # Masks hold class labels, so use nearest-neighbour resampling:
            # bicubic interpolation would create values that are not valid labels.
            rotated_output = translated_output.rotate(a, resample=Image.NEAREST)
            augmented_outputs.append(np.array(rotated_output)) # array with pixel values

if augmented_inputs:
    # Restore the trailing channel axis and the dtype of the existing arrays, then
    # append the augmented samples after the original ones (same order as generated).
    Xtr = np.concatenate(
        [Xtr, np.stack(augmented_inputs).reshape(-1, *Xtr.shape[1:]).astype(Xtr.dtype)])
    ytr = np.concatenate(
        [ytr, np.stack(augmented_outputs).reshape(-1, *ytr.shape[1:]).astype(ytr.dtype)])

# The number in the file name is the count of generated samples (originals excluded).
joblib.dump((Xtr, ytr), augmented_train_dataset_save_file.format(total_imgs*foia))