In [None]:
'''
Why do we need an HDF5 file? It is an efficient way to handle image data while building a model.
It helps reduce out-of-memory errors, because images can be read from the file in slices instead of all at once.

This notebook explains:
A. How to generate an HDF5 file from images.
B. How to extract data from an HDF5 file.


'''

In [1]:
import glob
import random
from random import shuffle

import cv2
import h5py
import numpy as np

In [2]:
shuffle_data = True  # when True, the image paths are put in random order before the train/test split
train_path = '/dataset_1/train/*/*.png'  # glob pattern that matches the source PNG images
hdf5_path = 'Audio_Classification_Train_HDF5_0423_v1_.hdf5'  # where the generated .hdf5 file is written

In [4]:
# First part - collect the image paths and derive a binary label for each one.
# glob.glob returns matches in arbitrary, filesystem-dependent order, so the
# paths are sorted to give every run the same starting sequence.
addrs = sorted(glob.glob(train_path))
if not addrs:
    # Fail early: continuing would only produce an empty HDF5 file later on.
    raise FileNotFoundError('No images matched pattern: {}'.format(train_path))

# Label 0 when the path contains the substring 'good', otherwise 1.
# NOTE(review): the substring is matched against the whole path, so a file or
# parent directory whose name merely contains 'good' is also labelled 0 --
# confirm this matches the dataset's directory layout.
labels = [0 if 'good' in addr else 1 for addr in addrs]

print(len(labels))

11203


In [5]:
# Skip shuffling when there is nothing to shuffle: zip(*c) on an empty list
# cannot be unpacked into two names.
if shuffle_data and addrs:
    # Fixed seed so the shuffled order -- and the train/test split taken from
    # it -- is the same on every run that starts from the same input order.
    # Re-running only this cell shuffles the already-shuffled sequence again.
    SHUFFLE_SEED = 42

    c = list(zip(addrs, labels))  # pair each path with its label so they move together
    # A dedicated Random instance keeps the global random state untouched.
    random.Random(SHUFFLE_SEED).shuffle(c)

    # zip(*c) transposes the list of (path, label) pairs back into two tuples:
    # "addrs" holds the shuffled paths and "labels" the matching shuffled labels.
    (addrs, labels) = zip(*c)

In [7]:
# Second Part - Split between train and test data

# Position of the first test sample: the leading 99.9% of the samples form the
# training set, everything after that position forms the test set.
split_at = int(0.999 * len(addrs))

train_addrs, test_addrs = addrs[:split_at], addrs[split_at:]
train_labels, test_labels = labels[:split_at], labels[split_at:]

# Report the size of each subset, one per line.
print(len(train_addrs), len(test_addrs), sep='\n')

11191
12


In [8]:
# 3rd Part - Create the HDF5 file and the datasets that will hold images and labels.

# Each stored image is a 224 x 224 array with 3 channels; the leading axis of
# every image dataset is the number of samples in that subset.
per_image_dims = (224, 224, 3)
train_shape = (len(train_addrs),) + per_image_dims
test_shape = (len(test_addrs),) + per_image_dims

# Open the output file for writing; the handle "f" stays open after this cell.
f = h5py.File(hdf5_path, mode='w')

# Image datasets are only allocated here, as unsigned 8-bit integers
# (pixel values 0-255); no image data is written in this cell.
f.create_dataset("train_img", shape=train_shape, dtype=np.uint8)
f.create_dataset("test_img", shape=test_shape, dtype=np.uint8)

# Datasets are addressed by name on the file object, much like dictionary keys.
# The label datasets are allocated and then filled in full straight away.
for label_key, label_values in (("train_labels", train_labels),
                                ("test_labels", test_labels)):
    label_ds = f.create_dataset(label_key, shape=(len(label_values),), dtype=np.uint8)
    label_ds[...] = label_values

In [9]:
######################## 4th part: write the images into HDF5 object #########################

def _write_images(h5_file, dataset_name, paths, progress_label):
    """Load, resize and store every image in ``paths`` into ``h5_file[dataset_name]``.

    Each image is read with OpenCV, resized to 224x224 using bicubic
    interpolation, converted from BGR to RGB and written to the row whose
    index equals the image's position in ``paths``.

    Images that cannot be read or processed are skipped and reported; their
    rows are never written, so the dataset keeps whatever it held at creation
    for those rows.

    Args:
        h5_file: open, writable h5py file object.
        dataset_name: name of the pre-allocated image dataset in ``h5_file``.
        paths: sequence of image file paths.
        progress_label: prefix used in progress messages (e.g. 'Train').

    Returns:
        List of ``(index, path, exception)`` tuples, one per skipped image.
    """
    failed = []
    total = len(paths)
    target = h5_file[dataset_name]
    for i, addr in enumerate(paths):
        # Progress message every 1000 images (not for the very first one).
        if i % 1000 == 0 and i > 0:
            print('{} data: {}/{}'.format(progress_label, i, total))
        try:
            img = cv2.imread(addr)
            if img is None:
                # cv2.imread reports an unreadable file by returning None
                # instead of raising, so turn it into an explicit error.
                raise IOError('cv2.imread could not read the file')
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)  # resize to (224, 224)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # cv2 loads images as BGR, convert to RGB
            target[i, ...] = img
        except Exception as e:
            # Best effort: keep going, but record which file failed and why.
            failed.append((i, addr, e))
            print('Error in: ', i, addr, repr(e))
    return failed


try:
    failed_train = _write_images(f, "train_img", train_addrs, 'Train')
    failed_test = _write_images(f, "test_img", test_addrs, 'Test')
finally:
    # Always release the file handle, even if a loop is interrupted, so the
    # file is closed and can be reopened afterwards.
    f.close()

if failed_train or failed_test:
    # Skipped rows still have an entry in the label datasets.
    print('Skipped images - train: {}, test: {}'.format(len(failed_train), len(failed_test)))

Train data: 1000/11191
Train data: 2000/11191
Train data: 3000/11191
Train data: 4000/11191
Train data: 5000/11191
Train data: 6000/11191
Train data: 7000/11191
Train data: 8000/11191
Train data: 9000/11191
Train data: 10000/11191
Train data: 11000/11191


In [16]:
# 5th part - Reopen the HDF5 file read-only and pull the training labels out of it

dataset = h5py.File(hdf5_path, "r")

# Copy the stored labels into memory as a NumPy array, then reshape to one row
# per sample; the -1 lets NumPy infer the second axis.
labels_in_memory = np.array(dataset["train_labels"])
train_labels = labels_in_memory.reshape((labels_in_memory.shape[0], -1))

print(len(train_labels))

11191


In [17]:
# Read the first stored training image and divide by 255 to get floating-point values.
first_stored = dataset['train_img'][0]
img = first_stored / 255.
# Last expression of the cell: displays the shape of the image.
img.shape

(224, 224, 3)