In [1]:
import h5py
import os

class HDF5DatasetWriter:
    """Buffered writer that stores a data array and integer labels in an HDF5 file.

    Rows are accumulated in an in-memory buffer and written to disk in
    slices once the buffer holds at least ``bufSize`` rows, or when
    ``flush()`` / ``close()`` is called.

    The writer can be used as a context manager, in which case ``close()``
    is called automatically when the ``with`` block exits::

        with HDF5DatasetWriter((N, 25088), "features.hdf5", dataKey="features") as writer:
            writer.add(rows, labels)

    Parameters
    ----------
    dims : sequence of int
        Shape of the data dataset; think of it as the ``.shape`` of a NumPy
        array. ``dims[0]`` is the total number of rows and is also used as
        the length of the ``"labels"`` dataset. Examples: ``(70000, 784)``
        for flattened 28 x 28 MNIST images, ``(60000, 32, 32, 3)`` for raw
        CIFAR-10 images, or ``(N, 25088)`` for flattened 512 x 7 x 7
        feature maps of ``N`` images.
    outputPath : str
        Path of the HDF5 file to create. It must not exist yet.
    dataKey : str, optional
        Name of the dataset that holds the rows. Defaults to ``"images"``;
        use e.g. ``"features"`` when storing extracted feature vectors.
    bufSize : int, optional
        Number of buffered rows that triggers a flush to disk
        (default 1000).

    Raises
    ------
    ValueError
        If ``outputPath`` already exists.
    """

    def __init__(self, dims, outputPath, dataKey="images", bufSize=1000):
        # Refuse to overwrite an existing file. The path is formatted into
        # the message so str(exc) is a readable sentence rather than the
        # repr of a (message, path) tuple.
        if os.path.exists(outputPath):
            raise ValueError(
                "The supplied 'outputPath' already "
                "exists and cannot be overwritten. Manually delete "
                "the file before continuing. {}".format(outputPath))

        # Open the HDF5 file for writing and create two datasets: one for
        # the images/features and one for the class labels.
        self.db = h5py.File(outputPath, "w")
        try:
            self.data = self.db.create_dataset(dataKey, dims, dtype="float")
            self.labels = self.db.create_dataset("labels", (dims[0],), dtype="int")
        except Exception:
            # Do not leave a dangling open handle if dataset creation fails
            # (for example because of an invalid `dims`).
            self.db.close()
            raise

        # Store the buffer size, then initialize the buffer itself along
        # with the index of the next free row in the datasets.
        self.bufSize = bufSize
        self.buffer = {"data": [], "labels": []}
        self.idx = 0

    def __enter__(self):
        """Return the writer itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Flush pending rows and close the file; never suppresses exceptions."""
        self.close()
        return False

    def add(self, rows, labels):
        """Append ``rows`` and their ``labels`` to the buffer.

        The buffer is flushed to disk as soon as it holds at least
        ``bufSize`` rows.
        """
        self.buffer["data"].extend(rows)
        self.buffer["labels"].extend(labels)

        # check to see if the buffer needs to be flushed to disk
        if len(self.buffer["data"]) >= self.bufSize:
            self.flush()

    def flush(self):
        """Write the buffered rows and labels to disk, then reset the buffer.

        Calling this with an empty buffer is a no-op.
        """
        pending = len(self.buffer["data"])
        if pending == 0:
            # Nothing to write; skip the zero-row slice assignment, which
            # would try to store a mis-shaped empty list.
            return

        # rows [self.idx, end) are the next free slots in the datasets
        end = self.idx + pending
        self.data[self.idx:end] = self.buffer["data"]
        self.labels[self.idx:end] = self.buffer["labels"]
        self.idx = end

        # reset the buffer
        self.buffer = {"data": [], "labels": []}

    def storeClassLabels(self, classLabels):
        """Store the raw class label names in a separate ``"label_names"`` dataset.

        ``classLabels`` must be a sized sequence; one entry is written per
        class name using a variable-length bytes dtype.
        """
        dt = h5py.special_dtype(vlen=bytes)
        labelSet = self.db.create_dataset("label_names", (len(classLabels),), dtype=dt)
        labelSet[:] = classLabels

    def close(self):
        """Flush any rows still in the buffer and close the HDF5 file.

        The file is closed even when the final flush raises, so the handle
        is never leaked; the flush error still propagates to the caller.
        """
        try:
            self.flush()
        finally:
            self.db.close()

The constructor of `HDF5DatasetWriter` accepts four parameters, two of which are optional.
The `dims` parameter controls the shape of the data we will store in the dataset; think of `dims` as the `.shape` of a NumPy array. For example, to store the flattened raw pixel intensities of the MNIST dataset (28 × 28 = 784 values per image), we would use `dims=(70000, 784)`, since MNIST has 70,000 examples, each with a dimensionality of 784. To store the raw CIFAR-10 images, we would set `dims=(60000, 32, 32, 3)`, since CIFAR-10 contains 60,000 images in total, each represented by a 32 × 32 × 3 RGB image.

In [189]:
import h5py
import os
import numpy as np

In [209]:
# Configuration for the manual HDF5 buffering demo in the following cells.
dims = [3000,200]  # shape of the main dataset: 3000 rows x 200 columns
outputPath = "test/5.hdf5"  # relative path of the HDF5 file the demo writes and later re-reads
dataKey = "X"  # NOTE(review): not referenced by any cell visible here
buffSize = 1000  # intended buffer capacity; NOTE(review): the visible cells never compare the buffer length against it

In [210]:
# Make sure the parent directory of the output file exists; otherwise
# h5py.File cannot create the file on a fresh checkout.
os.makedirs(os.path.dirname(outputPath) or ".", exist_ok=True)

# Open the HDF5 file in "w" mode (creates it, truncating any existing file).
db = h5py.File(outputPath, "w")
# One float dataset for the feature rows and one int column for the targets,
# both with dims[0] rows.
predictor = db.create_dataset("predictor", dims, dtype=float)
response = db.create_dataset("response", (dims[0],1), dtype=int)
# In-memory staging area; rows accumulate here before being written to disk.
buffer = {"X" :[], "y":[]}

In [211]:
# input arrays to the buffer
# First batch: five rows of ones for X (5 x 200) and five matching targets (5 x 1).
# list.extend iterates over the first axis, so every row is appended as its own 1-D array.

buffer["X"].extend(np.ones((5,200)))
buffer["y"].extend(np.ones((5,1)))

In [212]:
# Second batch: ten rows filled with 2.0 for X (10 x 200) and ten matching targets (10 x 1),
# appended after the rows already in the buffer.
# NOTE(review): re-running this cell appends the batch again, so it is not idempotent.
buffer["X"].extend(np.ones((10,200))*2)
buffer["y"].extend(np.ones((10,1))*2)

In [213]:
# Number of rows currently staged in the buffer (5 + 10 after the two batches above).
len(buffer["X"])

15

In [214]:
# flush buffer into the hdf5
# The buffered rows are written to the first len(buffer[...]) rows of each dataset;
# the remaining rows of the datasets are left untouched.
# NOTE(review): the write always starts at row 0 and the buffer is not cleared afterwards,
# so repeating this step would overwrite the same rows instead of appending.

predictor[0:len(buffer["X"])] = buffer["X"]
response[0:len(buffer["y"])] = buffer["y"]

In [215]:
# Close the write handle before the file is reopened for reading.
db.close()

In [145]:
# Show the buffered targets as a flat list of floats. Derived from buffer["y"]
# (a list of length-1 arrays) so the cell works on a fresh kernel; the bare name
# `y` that was displayed here before is not defined by any cell in this notebook.
[float(label[0]) for label in buffer["y"]]

[1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]

In [216]:
# Reopen the file read-only to verify what was written.
# NOTE(review): no cell visible here closes `read`; call read.close() once inspection is done.
read = h5py.File(outputPath, 'r')

In [217]:
# List the names of the top-level datasets stored in the file.
list(read.keys())

['predictor', 'response']

In [219]:
# Handle to the "predictor" dataset (an HDF5 dataset object, not an in-memory array).
xe = read["predictor"]

In [220]:
# Handle to the "response" dataset that holds the targets.
ye = read["response"]

In [221]:
# Display the dataset's repr (name, shape and dtype) without reading its contents into the output.
xe

<HDF5 dataset "predictor": shape (3000, 200), type "<f8">

In [222]:
# The stored shape should match `dims` from the configuration cell.
xe.shape

(3000, 200)

In [223]:
# One target column with the same number of rows as the predictor dataset.
ye.shape

(3000, 1)

In [224]:
# NOTE(review): repeats the earlier `xe` display cell; this cell can be removed.
xe

<HDF5 dataset "predictor": shape (3000, 200), type "<f8">

In [233]:
# Read back row 0; it belongs to the first batch, so every value should be 1.0.
# NOTE(review): this prints all 200 values; a slice such as xe[0, :5] would keep the output short.
xe[0]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [232]:
# Row 6 belongs to the second batch (rows 5-14), so the stored target is 2;
# it comes back as an integer because the "response" dataset was created with dtype=int.
ye[6]

array([2])