### Import packages

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets

import os
from random import shuffle
from PIL import Image
import h5py

### Parameters

In [9]:
MEDICAL_DATA = '/pio/lscratch/1/i279076/Medical/LGG-segmentation/'
TEST_SLICE = 0.3

MEDICAL_DATA_DEST = '/pio/lscratch/1/i279076/Medical/medical_v2.h5'

### Prepare basic dataset

In [17]:
patients = []

for patient in os.listdir(MEDICAL_DATA):
    patient_path = os.path.join(MEDICAL_DATA, patient)
    
    if os.path.isdir(patient_path):
        patients.append(patient_path)

shuffle(patients)
test_slice = int(len(patients) * TEST_SLICE)

test_patients, train_patients = patients[:test_slice], patients[test_slice:]

test_patients = [(patient, 'test') for patient in test_patients]
train_patients = [(patient, 'train') for patient in train_patients]

train_amount, test_amount = len(train_patients), len(test_patients)
all_amount = train_amount + test_amount
print('train: {:.2}% ({}), test: {:.2}% ({})'.format(train_amount / all_amount, train_amount, test_amount / all_amount, test_amount))

train: 0.7% (77), test: 0.3% (33)


In [18]:
patients_with_dest = test_patients + train_patients
files_with_dest = []

for i, (patient, dest) in enumerate(patients_with_dest):
    for filename in os.listdir(patient):
        if filename.endswith('_mask.tif'):
            mask_path = os.path.join(patient, filename)
            files_with_dest.append((mask_path, dest, i))

In [19]:
train_files = list(filter(lambda file: file[1] == 'train', files_with_dest))
test_files = list(filter(lambda file: file[1] == 'test', files_with_dest))

train_amount, test_amount = len(train_files), len(test_files)
all_amount = train_amount + test_amount

print('train: {:.2}% ({}), test: {:.2}% ({})'.format(train_amount / all_amount, train_amount, test_amount / all_amount, test_amount))

train: 0.7% (2767), test: 0.3% (1162)


In [20]:
train_data, test_data = [], []

for file, dest, i in files_with_dest:
    mask = np.array(Image.open(file))[:, :, None]
    
    image_path = file.replace('_mask', '')
    image = np.array(Image.open(image_path))
    
    destination = train_data if dest == 'train' else test_data
    destination.append((image, mask, i))

In [27]:
train_images, train_masks, train_patients = zip(*train_data)
test_images, test_masks, test_patients = zip(*test_data)

train_images, train_masks = np.array(train_images) / 255.0, np.array(train_masks) // 255
test_images, test_masks = np.array(test_images) / 255.0, np.array(test_masks) // 255

print('Train images shape: ', train_images.shape, 'masks shape: ', train_masks.shape)
print('Test images shape: ', test_images.shape, 'masks shape: ', test_masks.shape)

Train images shape:  (2767, 256, 256, 3) masks shape:  (2767, 256, 256, 1)
Test images shape:  (1162, 256, 256, 3) masks shape:  (1162, 256, 256, 1)


In [28]:
with h5py.File(MEDICAL_DATA_DEST, 'a') as f:
    f.create_dataset('train/images', data=train_images, compression="gzip")
    f.create_dataset('train/masks', data=train_masks, compression="gzip")
    f.create_dataset('train/patients', data=train_patients, compression="gzip")
    
    f.create_dataset('test/images', data=test_images, compression="gzip")
    f.create_dataset('test/masks', data=test_masks, compression="gzip")
    f.create_dataset('test/patients', data=test_patients, compression="gzip")