In [1]:
import h5py
import numpy as np

## Some operations

`h5py.File` works like a dictionary

`Group` works like a dictionary

`dataset` works like a numpy array

### Create file and store data

In [2]:
# Example payload: a stack of k random (n, n) arrays
k, n = 10, 100
some_data = np.random.rand(k, n, n)

# Mode 'w' creates the file (an existing file of the same name is truncated)
with h5py.File('myh5.hdf5', 'w') as f:
    # Allocate the dataset first, then fill it through numpy-style slicing.
    # No dtype is given, so h5py's default (single-precision float) is used.
    dset = f.create_dataset('my_dataset', shape=some_data.shape)
    dset[...] = some_data

### Load file and read data

In [3]:
with h5py.File('myh5.hdf5', 'r') as f:
    # Look the dataset up once; data is only read when the handle is sliced
    stored = f['my_dataset']
    print(stored[:].shape)  # Read all the data
    print(stored[:5].shape)  # Read the first 5 of the (n, n) arrays
    print(stored[0, 1, 2])  # Read one specific number

(10, 100, 100)
(5, 100, 100)
0.27462065


### Group organization

Create groups in the following architecture:

```
f -
  |-Group1
    |-dataset1
  |-Group2
    |-dataset2
    |-Subgroup1
      |-dataset3
    |-Subgroup2
      |-
  |-Group3
    |-dataset4
```
“HDF” stands for “Hierarchical Data Format”. Every object in an HDF5 file has a name, and they’re arranged in a POSIX-style hierarchy with `/`-separators


In [4]:
# The context manager closes the file even if one of the calls below raises
with h5py.File('myh5.hdf5', 'w') as f:
    # Create a group
    grp1 = f.create_group('Group1')
    # Create a group and a subgroup
    grp2 = f.create_group('Group2')
    grp21 = grp2.create_group('Subgroup1')
    # Create a subgroup directly using the POSIX-style hierarchy
    grp22 = f.create_group('/Group2/Subgroup2')
    # Create a dataset through a group object
    dset1 = grp1.create_dataset("dataset1", (50,), dtype='f')
    # Create datasets using POSIX-style paths (absolute, or relative to a group)
    dset2 = f.create_dataset("/Group2/dataset2", (20,), dtype='i')
    dset3 = grp2.create_dataset("./Subgroup1/dataset3", (10,), dtype='f')
    # Create a dataset along with all of its (not yet existing) parent groups
    dset4 = f.create_dataset('/Group3/dataset4', (10,), dtype='f')

Access group

In [5]:
# The context manager closes the file even if one of the lookups below raises
with h5py.File('myh5.hdf5', 'r') as f:
    # The name (path) of the object
    print(f'{f.name = }')
    # Find all the keys
    print(f'{[key for key in f.keys()] = }')
    # Access directly through the POSIX-like path
    print(f'{f["/Group1/dataset1"] = }')
    # Access through a dictionary key
    grp2 = f['Group2']
    print(f'{grp2["dataset2"] = }')
    print(f'{grp2["dataset2"].name = }')
    # Access all objects in the group
    print(f'{[item.name for key, item in grp2.items()] = }')
    # Use the get method
    print(f'{grp2.get("Subgroup2") = }')

f.name = '/'
[key for key in f.keys()] = ['Group1', 'Group2', 'Group3']
f["/Group1/dataset1"] = <HDF5 dataset "dataset1": shape (50,), type "<f4">
grp2["dataset2"] = <HDF5 dataset "dataset2": shape (20,), type "<i4">
grp2["dataset2"].name = '/Group2/dataset2'
[item.name for key, item in grp2.items()] = ['/Group2/Subgroup1', '/Group2/Subgroup2', '/Group2/dataset2']
grp2.get("Subgroup2") = <HDF5 group "/Group2/Subgroup2" (0 members)>


Iterate through groups and apply a function to every object in them

In [6]:
def func1(name):
    """Callback for `visit`: print the path of each visited object.

    Returns None, so the traversal continues to the next object.
    """
    print(name)


def func2(name, obj):
    """Callback for `visititems`: print each visited path together with its object.

    Returns None, so the traversal continues to the next object.
    """
    print(f'{name}: {obj}')


# The context manager closes the file even if a callback raises
with h5py.File('myh5.hdf5', 'r') as f:
    print('===visit===')
    # Recursively walk everything below the file root, calling func1(name)
    f.visit(func1)
    print('===visit_item===')
    grp2 = f['Group2']
    # Same walk starting at Group2; the callback also receives the object,
    # and the names it gets are relative to Group2
    grp2.visititems(func2)

===visit===
Group1
Group1/dataset1
Group2
Group2/Subgroup1
Group2/Subgroup1/dataset3
Group2/Subgroup2
Group2/dataset2
Group3
Group3/dataset4
===visit_item===
Subgroup1: <HDF5 group "/Group2/Subgroup1" (1 members)>
Subgroup1/dataset3: <HDF5 dataset "dataset3": shape (10,), type "<f4">
Subgroup2: <HDF5 group "/Group2/Subgroup2" (0 members)>
dataset2: <HDF5 dataset "dataset2": shape (20,), type "<i4">


### Attribute

Attributes are useful for storing metadata alongside your dataset!

In [7]:
k, n = 10, 100
with h5py.File('myh5.hdf5', 'w') as f:
    # Metadata lives in the dictionary-like `.attrs` of a dataset ...
    dset = f.create_dataset('my_dataset', shape=(k, n, n))
    dset.attrs['resolution'] = (n, n)
    # The tuple is read back as a numpy array
    print(f'{dset.attrs["resolution"] = }')

    # ... and of a group, in exactly the same way
    grp1 = f.create_group('group1')
    grp1.attrs['location'] = 'Stanford'
    print(f'{grp1.attrs["location"] = }')

dset.attrs["resolution"] = array([100, 100])
grp1.attrs["location"] = 'Stanford'


### Chunked storage

By default, an HDF5 dataset is stored contiguously on disk in C order. You can enable chunked storage, which stores the data on disk in chunks of a specified shape; this also makes it possible to change the size of the dataset on the fly.

Note: when reading a chunked dataset, if any element in a chunk is accessed, the entire chunk is read from disk. Therefore, it is ideal to set the chunk to one unit of data (one image, one time step, etc.).

**Chunking has performance implications.**

In [8]:
with h5py.File('myh5.hdf5', 'w') as f:
    shape = (5, 100, 100)
    # Explicit chunk shape: each chunk holds one (100, 100) slice
    dset = f.create_dataset('chunked', shape, chunks=(1, 100, 100))
    # Automatic chunking: h5py picks the chunk shape
    dset = f.create_dataset('autochunk', shape, chunks=True)

### Resizable dataset

When creating a dataset, you can pass `maxshape` to make the dataset resizable.

In [9]:
some_data = np.random.rand(100, 100)
with h5py.File('myh5.hdf5', 'w') as f:
    initial_shape = (1, 100, 100)
    # Specific upper limit: the first axis may grow up to 20 entries
    max20 = f.create_dataset('max20', initial_shape, maxshape=(20, 100, 100))
    # No upper limit: None leaves the first axis unbounded
    unlimited = f.create_dataset('unlimited', initial_shape, maxshape=(None, 100, 100))

    # Resize operation: grow the first axis from 1 to 15 entries
    max20[0] = some_data
    print(f'Before resize: {max20.shape = }')
    max20.resize(15, axis=0)
    print(f'After resize: {max20.shape = }')
    # Index 14 only exists after the resize
    max20[14] = some_data
    

Before resize: max20.shape = (1, 100, 100)
After resize: max20.shape = (15, 100, 100)


### Optimize storage

You can further optimize storage using compression. Note that using compression may have a significant performance impact.

Compression is handled entirely by h5py: the data is compressed when it is written to the file and decompressed when it is read back.

In [10]:
import time

n = 2000
some_data = np.random.rand(n, n)


def timed_write(dset, data, label):
    """Fill `dset` with `data` and print `label` followed by the elapsed seconds.

    Uses time.perf_counter(), which is monotonic and high-resolution, rather
    than time.time(), whose wall-clock value can be coarse or jump.
    """
    start = time.perf_counter()
    dset[:] = data
    print(f'{label}{time.perf_counter()-start}')


with h5py.File('myh5.hdf5', 'w') as f:
    # GZIP compression: lossless, GOOD compression, MODERATE speed
    # compression option: 0-9
    # default compression level 4:
    gzip = f.create_dataset('gzip', (n, n), compression='gzip')
    timed_write(gzip, some_data, 'WRITE with gzip=4: \t\t\t')

    # max compression level:
    gzip_max = f.create_dataset('gzip_max', (n, n), compression='gzip', compression_opts=9)
    timed_write(gzip_max, some_data, 'WRITE with gzip=9: \t\t\t')

    # LZF compression: lossless, LOW to MODERATE compression, VERY FAST speed
    # no compression option
    lzf = f.create_dataset('lzf', (n, n), compression='lzf')
    timed_write(lzf, some_data, 'WRITE with lzf: \t\t\t\t')

    # write uncompressed:
    uncomp = f.create_dataset('uncomp', (n, n))
    timed_write(uncomp, some_data, 'WRITE with no compression: \t\t')

    # You may be able to further improve the compression ratio by shuffling
    # data within the chunk: lossless, with little speed penalty
    gzip_shuffle = f.create_dataset('gzip_shuffle', (n, n), compression='gzip', shuffle=True)
    lzf_shuffle = f.create_dataset('lzf_shuffle', (n, n), compression='lzf', shuffle=True)

    # Fletcher 32: adds a checksum to each chunk to detect data corruption;
    # reading a corrupted chunk fails with an error
    gzip_fl32 = f.create_dataset('gzip_fl32', (n, n), compression='gzip', fletcher32=True)
    lzf_fl32 = f.create_dataset('lzf_fl32', (n, n), compression='lzf', fletcher32=True)

WRITE with gzip=4: 			0.45760011672973633
WRITE with gzip=9: 			0.4559488296508789
WRITE with lzf: 				0.10457086563110352
WRITE with no compression: 		0.019588947296142578


In [12]:
# (dataset name, label printed in front of the elapsed time)
read_cases = [
    ('gzip', 'READ with gzip=4: \t\t\t\t'),
    ('gzip_max', 'READ with gzip=9: \t\t\t\t'),
    ('lzf', 'READ with lzf: \t\t\t\t\t'),
    ('uncomp', 'READ with no compression: \t\t'),
]

with h5py.File('myh5.hdf5', 'r') as f:
    for dset_name, label in read_cases:
        # perf_counter() is monotonic and high-resolution, unlike time.time()
        start = time.perf_counter()
        # Read the whole dataset; the result is discarded on purpose (not bound
        # to `_`, which would shadow IPython's last-output variable)
        f[dset_name][:]
        print(f'{label}{time.perf_counter()-start}')

READ with gzip=4: 				0.11252164840698242
READ with gzip=9: 				0.11008644104003906
READ with lzf: 					0.014460086822509766
READ with no compression: 		0.006185054779052734


## Some examples

The following code is not thoroughly tested; use it as a reference for how you could convert your dataset into an HDF5 file.

### x only

For unsupervised training, when you only have one type of data. The dataset is resized dynamically, growing as data is added.

In [None]:
# Placeholder: replace with the root directory of your data.
# Kept as a string so the cell parses and runs once the path is filled in.
ROOT = 'SOME/PATH/TO/DIR'

def loader(path):
    """Load one sample from a `.npy` file.

    Args:
        path: location of the file; converted with `str()` before loading.

    Returns:
        The numpy array stored in the file. It should fit the chunk size of
        the dataset it will be written to.
    """
    filename = str(path)
    sample = np.load(filename)
    return sample

from pathlib import Path

from tqdm import tqdm

h5_path = Path(ROOT)/'test.h5'
data_dir = Path(ROOT)/'data'

ext = 'npy'
init_shape = (1,12,128,128)
maxshape = (None,12,128,128) # None means no upper limit
chunks = (1,12,128,128) # Usually set the chunk to how much data you will load at a time (like 1 image)
dataset_name = 'image'

# glob() yields files in arbitrary order; sorting makes the row order in the
# HDF5 file deterministic between runs
paths = sorted(data_dir.glob(f'*.{ext}'))

with h5py.File(str(h5_path), 'w') as f:
    dset = f.create_dataset(dataset_name, init_shape, maxshape=maxshape, chunks=chunks)
    for i, path in enumerate(tqdm(paths)):
        x = loader(path)
        # Grow the first axis by one entry, then write the new sample into it
        dset.resize(i+1, axis=0)
        dset[i,...] = x

### (x,y)

For supervised tasks, when you have both data and labels.

In [None]:
from PIL import Image

# Placeholder: replace with the root directory of your data.
# Kept as a string so the cell parses and runs once the path is filled in.
ROOT = 'SOME/PATH/TO/DIR'

def im_loader(path):
    """Load an image file into a numpy array.

    Args:
        path: location of the image file, passed to `Image.open`.

    Returns:
        The image data as a numpy array.
    """
    # Image.open keeps the underlying file open; the context manager closes it
    # as soon as the pixel data has been copied into the array, so converting
    # many images does not accumulate open file handles.
    with Image.open(path) as im:
        return np.array(im)

def label_loader(label_file_path):  # example of a label loader
    """Lazily yield the labels stored one per line in a text file.

    Args:
        label_file_path: path of the label file.

    Yields:
        Each line of the file with leading/trailing whitespace removed.
    """
    with open(label_file_path) as label_file:
        yield from map(str.strip, label_file)

from pathlib import Path

from tqdm import tqdm

# Defined here so this cell does not depend on the previous example having run
h5_path = Path(ROOT)/'test.h5'
data_dir = Path(ROOT)/'image'
label_path = Path(ROOT)/'label.txt'

ext = 'jpg'
img_shape = (1,128,128,3)
img_maxshape = (None,128,128,3)
img_chunks = (1,128,128,3)

# Trailing commas matter: (1) is the int 1 and (None) is plain None, and with
# maxshape=None the label dataset would not be resizable
label_shape = (1,)
label_maxshape = (None,)

label_gen = label_loader(label_path)

# glob() yields files in arbitrary order; sorting makes the pairing with the
# lines of the label file deterministic.
# NOTE(review): assumes the label file lists labels in sorted-filename order — confirm
paths = sorted(data_dir.glob(f'*.{ext}'))

with h5py.File(str(h5_path), 'w') as f:
    im_dset = f.create_dataset('image', img_shape, maxshape=img_maxshape, chunks=img_chunks)
    # NOTE(review): no dtype is given, so labels are stored with h5py's default
    # float dtype and each label must be numeric text; pass an explicit dtype
    # for other kinds of label
    l_dset = f.create_dataset('label', label_shape, maxshape=label_maxshape)
    for i, path in enumerate(tqdm(paths)):
        # Use the image loader defined in this cell (not the .npy `loader`)
        x = im_loader(path)
        y = next(label_gen)
        # Grow both datasets by one entry, then write the new pair
        im_dset.resize(i+1, axis=0)
        im_dset[i,...] = x
        l_dset.resize(i+1, axis=0)
        l_dset[i] = y

### Dataloader

Example of a dataloader that reads from an HDF5 file

In [None]:
import torch


class H5Dataset(torch.utils.data.Dataset):
    """Sketch of a dataset that serves samples from the 'image' dataset of an HDF5 file.

    `self.data_path` (path of the HDF5 file) and `self.transforms` (callable
    applied to each sample) are expected to be set by the omitted methods.
    """
    # ... some other methods ...

    def __getitem__(self, index):
        """Read the sample at `index` from the file and return it transformed."""
        # The file is opened on every call instead of being kept on the instance.
        # NOTE(review): presumably so that no open handle is shared between
        # DataLoader worker processes — confirm
        with h5py.File(str(self.data_path), 'r') as h5_file:
            image_dset = h5_file['image']
            # Indexing copies the selected sample into a numpy array, so it
            # stays valid after the file is closed
            im_np = image_dset[index, ...]
        return self.transforms(im_np)