In [1]:
from random import shuffle
import glob

# --- Configuration ---
shuffle_data = True  # shuffle (path, label) pairs together before splitting
hdf5_path = 'chest_xray.hdf5'  # address to where you want to save the hdf5 file
normal_images_path = 'images/normal/*.jpeg'
pneumonia_images_path = 'images/pneumonia/*.jpeg'
train_fraction = 0.8  # share of the samples used for training; the rest is the test set

# Collect the image paths of each class. glob returns matches in arbitrary
# order, so sort them to get a stable starting order before shuffling.
normal_addrs = sorted(glob.glob(normal_images_path))
pneumonia_addrs = sorted(glob.glob(pneumonia_images_path))

addrs = normal_addrs + pneumonia_addrs

# 0 = normal, 1 = pneumonia.
# The label is taken from the list each path came from, not from a substring
# test on the path: a pneumonia file (or a parent directory) whose name
# happens to contain "normal" would otherwise be labelled 0.
labels = [0] * len(normal_addrs) + [1] * len(pneumonia_addrs)

if not addrs:
    print('Warning: no images matched {0!r} or {1!r}'.format(
        normal_images_path, pneumonia_images_path))

# Shuffle paths and labels together so that they stay aligned.
# Skipped for an empty dataset because zip(*[]) cannot be unpacked into two names.
if shuffle_data and addrs:
    paired = list(zip(addrs, labels))
    shuffle(paired)
    addrs, labels = zip(*paired)

# Divide the data into `train_fraction` train and the remainder test
# (80% / 20% with the default setting); there is no validation split.
split_index = int(train_fraction * len(addrs))

train_addrs = addrs[:split_index]
train_labels = labels[:split_index]

test_addrs = addrs[split_index:]
test_labels = labels[split_index:]

In [2]:
import numpy as np
import h5py

data_order = 'tf'  # 'th' for Theano (channels first), 'tf' for Tensorflow (channels last)

# Spatial size and channel count of the stored images.
img_height, img_width, img_channels = 144, 144, 3

# check the order of data and choose the proper data shape to save images
if data_order == 'th':
    sample_shape = (img_channels, img_height, img_width)
elif data_order == 'tf':
    sample_shape = (img_height, img_width, img_channels)
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        "data_order must be 'th' or 'tf', got {0!r}".format(data_order))

train_shape = (len(train_addrs),) + sample_shape
test_shape = (len(test_addrs),) + sample_shape

# open a hdf5 file and create the datasets
# NOTE: the file stays open across cells; the cell that writes the images is
# responsible for closing it.
hdf5_file = h5py.File(hdf5_path, mode='w')

# Pixel values are 8-bit unsigned (0..255). A signed int8 dataset cannot
# represent values above 127, so the images are stored as uint8.
hdf5_file.create_dataset("train_img", train_shape, np.uint8)
hdf5_file.create_dataset("test_img", test_shape, np.uint8)

# Per-pixel mean of the training images (no leading sample axis).
hdf5_file.create_dataset("train_mean", train_shape[1:], np.float32)

# Labels are 0/1, so int8 is sufficient.
hdf5_file.create_dataset("train_labels", (len(train_addrs),), np.int8)
hdf5_file["train_labels"][...] = train_labels
hdf5_file.create_dataset("test_labels", (len(test_addrs),), np.int8)
hdf5_file["test_labels"][...] = test_labels

In [3]:
import cv2

def _load_image(addr):
    """Read one image file and return it as a (144, 144) RGB array.

    The image is resized with bicubic interpolation and converted from
    OpenCV's BGR channel order to RGB. When ``data_order`` is ``'th'`` the
    channel axis is moved to the front.

    Raises:
        IOError: if OpenCV cannot read the file (``cv2.imread`` returns None
            instead of raising, which would otherwise surface as an opaque
            error inside ``cv2.resize``).
    """
    img = cv2.imread(addr)
    if img is None:
        raise IOError('Could not read image: {0}'.format(addr))
    img = cv2.resize(img, (144, 144), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # add any image pre-processing here

    # if the data order is Theano, axis orders should change
    if data_order == 'th':
        img = np.rollaxis(img, 2)
    return img


def _store_split(addrs, dataset_name, tag, mean_acc=None):
    """Load every image in ``addrs`` and write it to ``hdf5_file[dataset_name]``.

    Args:
        addrs: sequence of image file paths; image ``i`` is written to row ``i``.
        dataset_name: name of the destination dataset in ``hdf5_file``.
        tag: prefix used in the progress message (e.g. ``'Train'``).
        mean_acc: optional float array updated in place with the running
            mean of the images in this split.
    """
    dataset = hdf5_file[dataset_name]
    total = len(addrs)
    for i, addr in enumerate(addrs):
        # print how many images are saved every 1000 images
        if i and i % 1000 == 0:
            print('{0} data: {1}/{2}'.format(tag, i, total))

        img = _load_image(addr)
        dataset[i, ...] = img

        if mean_acc is not None:
            # Accumulate img / N so the sum over the split is the mean.
            mean_acc += img / float(total)


# a numpy array to save the mean of the training images
mean = np.zeros(train_shape[1:], np.float32)

try:
    _store_split(train_addrs, "train_img", 'Train', mean_acc=mean)
    _store_split(test_addrs, "test_img", 'Test')

    # save the mean
    hdf5_file["train_mean"][...] = mean
finally:
    # Always release the file, even if an image fails to load; otherwise the
    # handle stays open in the kernel and the file cannot be recreated.
    hdf5_file.close()

Train data: 1000/4684
Train data: 2000/4684
Train data: 3000/4684
Train data: 4000/4684
Test data: 1000/1172


import h5py
import numpy as np

hdf5_path = 'chest_xray.hdf5'
subtract_mean = False  # set True to subtract the stored training mean from each batch

# open the hdf5 file (read-only); it is closed after the batch loop
hdf5_file = h5py.File(hdf5_path, "r")

# Load the training mean.
# "train_mean" is stored with the shape of a single image (no sample axis),
# so the whole dataset must be read. Indexing it with [0, ...] would return
# only its first row and silently broadcast the wrong values over each image.
mm = None
if subtract_mean:
    mm = hdf5_file["train_mean"][...]
    mm = mm[np.newaxis, ...]  # add a leading batch axis for broadcasting

# Total number of samples
data_num = hdf5_file["train_img"].shape[0]

from random import shuffle
from math import ceil
import matplotlib.pyplot as plt

batch_size = 100
nb_class = 2
max_batches = 5  # number of batches to preview before stopping

# create list of batches to shuffle the data
batches_list = list(range(int(ceil(float(data_num) / batch_size))))
shuffle(batches_list)

try:
    # loop over batches
    for n, i in enumerate(batches_list):
        if n >= max_batches:  # stop once `max_batches` batches were shown
            break

        i_s = i * batch_size  # index of the first image in this batch
        i_e = min((i + 1) * batch_size, data_num)  # one past the last image in this batch

        # read batch images and remove training mean
        images = hdf5_file["train_img"][i_s:i_e, ...]
        preview = images[0]  # keep the raw pixels of the first image for display
        if subtract_mean:
            # The stored images have an integer dtype, so an in-place `-=`
            # with the float mean cannot be cast back; convert to float first.
            images = images.astype(np.float32) - mm

        # read labels and convert to one hot encoding
        # The last batch may hold fewer than `batch_size` samples, so size the
        # one-hot array from the labels actually read.
        labels = hdf5_file["train_labels"][i_s:i_e]
        n_in_batch = labels.shape[0]
        labels_one_hot = np.zeros((n_in_batch, nb_class))
        labels_one_hot[np.arange(n_in_batch), labels] = 1

        print(n+1, '/', len(batches_list))

        print(labels[0], labels_one_hot[0, :])
        plt.imshow(preview)
        plt.show()
finally:
    # Close the file even if a batch fails so the handle is not leaked.
    hdf5_file.close()