In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import h5py
import numpy as np
import open3d as o3d

from dotenv import load_dotenv, find_dotenv
from pathlib import Path

# Find .env by walking up directories until it's found.
# Fail loudly when it is missing: without the flag find_dotenv() returns '',
# Path('').parent is '.', and project_dir would silently become the current
# working directory, pointing every data path below at the wrong place.
dotenv_path = find_dotenv(raise_error_if_not_found=True)
project_dir = Path(dotenv_path).parent

# load up the entries as environment variables
load_dotenv(dotenv_path)

True

# Load image

In [None]:
# Root folder of the raw render results, relative to the project directory.
raw_data_dir = project_dir.joinpath('data/raw/render_results')

# Every entry directly under it whose name starts with "simulacao".
syms_dirs = [*raw_data_dir.glob('simulacao*')]

# Create HDF5

In [3]:
# Pick one simulation folder and one PNG inside it.
# A locally seeded generator plus sorted candidates make the pick reproducible
# on Restart & Run All; glob order is filesystem-dependent and the global
# np.random state depends on whatever ran before this cell.
rng = np.random.default_rng(42)
sym_dir = rng.choice(sorted(syms_dirs))
img_fpath = rng.choice(sorted(sym_dir.glob('*.png')))
img_fpath

PosixPath('/home/ctc_das/Desktop/part_counting/data/raw/render_results/simulacao245/simulacao245_0032.png')

In [4]:
# Load the sampled PNG through Open3D; the path is converted to str for the call.
img = o3d.io.read_image(str(img_fpath))
img

Image of size 512x512, with 4 channels.
Use numpy.asarray to access buffer data.

In [42]:
# Destination file for the single-image experiment.
h5_fpath = project_dir.joinpath('data/interim/single_render.hdf5')

# TODO: explore chunking (see https://docs.h5py.org/en/stable/high/file.html#chunk-cache
# and https://portal.hdfgroup.org/display/HDF5/Chunking+in+HDF5)
# it may be useful for keeping the hdf5 in HDD

# Mode "w" creates the file (truncating an existing one). The handle is kept
# open on purpose: later cells write a dataset through it and then close it.
h = h5py.File(str(h5_fpath), mode="w")
h

<HDF5 file "single_render.hdf5" (mode r+)>

In [43]:
# swap axis so that channels is the first axis
data = np.moveaxis(np.array(img)[:,:,1:3], -1, 0)
data.shape

(2, 512, 512)

In [44]:
# Derive the dataset name from the file name: strip the '.png' text first,
# then the 'simulacao' prefix text, leaving e.g. '<simulation>_<frame>'.
dset_name = img_fpath.name.replace('.png', '')
dset_name = dset_name.replace('simulacao', '')
dset_name

'245_0032'

In [45]:
# Store the array in the open HDF5 file under the name derived from the PNG file name.
h.create_dataset(dset_name, data=data)

<HDF5 dataset "245_0032": shape (2, 512, 512), type "|u1">

In [46]:
# Close the handle opened earlier, before the file on disk is inspected.
h.close()

In [47]:
# Size in bytes of the HDF5 file on disk.
h5_fpath.stat().st_size

526336

In [48]:
# Size in bytes of the source PNG, for comparison with the HDF5 file.
img_fpath.stat().st_size

392919

So the HDF5 file is larger than the PNG: it holds the raw 2×512×512 bytes (524,288) plus about 2 KB of header, which is expected because the PNG compression is dropped.

# Create HDF5 with multiple images

In [57]:
# Draw a reproducible sample of 1000 renders across all simulation folders.
# The candidates are sorted because glob order is filesystem-dependent, and a
# locally seeded generator is used so a re-run selects exactly the same files.
rng = np.random.default_rng(42)
all_png_fpaths = sorted(raw_data_dir.glob('*/*.png'))
img_fpaths = rng.choice(all_png_fpaths, size=1000, replace=False)

# Sanity check: sampling without replacement must yield 1000 distinct paths.
len(np.unique(img_fpaths))

1000

In [65]:
# Combined on-disk size, in bytes, of all sampled PNG files.
imgs_size = sum(png_fpath.stat().st_size for png_fpath in img_fpaths)
imgs_size

395976074

In [66]:
# Destination file holding one dataset per sampled render.
h5_fpath = project_dir.joinpath('data/interim/multiple_renders.hdf5')

# Mode "w" creates/truncates the file; the context manager closes it on exit.
with h5py.File(str(h5_fpath), mode="w") as h:
    for img_fpath in img_fpaths:
        img = o3d.io.read_image(str(img_fpath))

        # keep indices 1-2 of the channel axis and move channels to the front
        data = np.transpose(np.array(img)[:, :, 1:3], (2, 0, 1))

        # dataset name = file name without the '.png' and 'simulacao' text
        dset_name = img_fpath.name.replace('.png', '')
        dset_name = dset_name.replace('simulacao', '')

        h.create_dataset(dset_name, data=data)

# Size in bytes of the resulting file, measured after it has been closed.
h5_size = h5_fpath.stat().st_size
h5_size

524662016

In [69]:
# Relative size increase of the HDF5 file over the summed PNG sizes.
(h5_size - imgs_size) / imgs_size

0.3249841352788401

# Reading speed comparison

In [92]:
# Write the same renders again, this time to an absolute location outside the
# project tree, and remember every dataset name for the read benchmarks.
# NOTE(review): presumably /data sits on a different drive than the project —
# confirm, and consider making this path configurable.
h5_fpath = Path('/data/multiple_renders.hdf5')

all_dsets = []
with h5py.File(str(h5_fpath), mode="w") as h:
    for img_fpath in img_fpaths:
        img = o3d.io.read_image(str(img_fpath))

        # keep indices 1-2 of the channel axis and move channels to the front
        data = np.transpose(np.array(img)[:, :, 1:3], (2, 0, 1))

        # dataset name = file name without the '.png' and 'simulacao' text
        dset_name = img_fpath.name.replace('.png', '')
        dset_name = dset_name.replace('simulacao', '')
        all_dsets.append(dset_name)

        h.create_dataset(dset_name, data=data)

# Size in bytes of the resulting file, measured after it has been closed.
h5_size = h5_fpath.stat().st_size
h5_size

524662016

In [78]:
%%time

# Baseline: decode every sampled PNG into a NumPy array (no channel slicing)
# and discard the result; only the elapsed time matters here.
for img_fpath in img_fpaths:
    np.array(o3d.io.read_image(str(img_fpath)))

CPU times: user 5.33 s, sys: 79.5 ms, total: 5.41 s
Wall time: 5.41 s


In [101]:
%%time

# Open the HDF5 file once, read-only, and load every dataset fully into
# memory ([:] reads the whole dataset into a NumPy array).
with h5py.File(h5_fpath, 'r') as h:
    for dset_name in all_dsets:
        d = h[dset_name][:]

CPU times: user 124 ms, sys: 48.1 ms, total: 172 ms
Wall time: 171 ms


In [100]:
%%time

# Same reads, but the file is re-opened for every dataset so the open/close
# cost is part of the measurement.
# NOTE(review): presumably this mimics independent per-sample access — confirm.
for dset_name in all_dsets:
    with h5py.File(h5_fpath, 'r') as h:
        d = h[dset_name][:]

CPU times: user 272 ms, sys: 52.1 ms, total: 324 ms
Wall time: 324 ms


So reading from HDF5 is much faster than decoding the PNGs (roughly 0.2–0.3 s versus 5.4 s for the 1000 images), which is expected because the HDF5 data is stored uncompressed.

In [104]:
# Same export once more, this time with gzip compression on every dataset.
# NOTE(review): absolute path outside the project tree — consider making it
# configurable.
compressed_h5_fpath = Path('/data/multiple_renders_compressed.hdf5')

# Mode "w" creates/truncates the file; the context manager closes it on exit.
with h5py.File(str(compressed_h5_fpath), mode="w") as h:
    for img_fpath in img_fpaths:
        img = o3d.io.read_image(str(img_fpath))

        # keep indices 1-2 of the channel axis and move channels to the front
        data = np.transpose(np.array(img)[:, :, 1:3], (2, 0, 1))

        # dataset name = file name without the '.png' and 'simulacao' text
        dset_name = img_fpath.name.replace('.png', '')
        dset_name = dset_name.replace('simulacao', '')

        h.create_dataset(dset_name, data=data, compression='gzip')

# Size in bytes of the compressed file, measured after it has been closed.
compressed_h5_size = compressed_h5_fpath.stat().st_size
compressed_h5_size

281973280

In [103]:
%%time

# Read every dataset from the gzip-compressed file, re-opening the file for
# each read so the timing is comparable with the uncompressed re-open run.
for dset_name in all_dsets:
    with h5py.File(compressed_h5_fpath, 'r') as h:
        d = h[dset_name][:]

CPU times: user 2.65 s, sys: 51.8 ms, total: 2.71 s
Wall time: 2.71 s


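Even with gzip compression, which makes the file smaller than the PNGs combined, reading from HDF5 is still significantly faster (2.71 s versus 5.41 s). Note that the HDF5 file stores only two of the four image channels, so the comparison is not strictly like-for-like.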