In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import h5py
import numpy as np
import open3d as o3d

from dotenv import load_dotenv, find_dotenv
from pathlib import Path

# Locate the project's .env by walking up the directory tree; the folder that
# contains it is used as the project root for every path built below.
dotenv_path = find_dotenv()
if not dotenv_path:
    # find_dotenv() returns an empty string when no file is found, which would
    # silently turn project_dir into the current working directory.
    raise FileNotFoundError("Could not locate a .env file to anchor project_dir")
project_dir = Path(dotenv_path).parent

# Load the .env entries as environment variables.
load_dotenv(dotenv_path)

True

# Load image

In [None]:
# Root folder of the rendered simulations.
raw_data_dir = project_dir/'data/raw/render_results'

# glob() yields entries in filesystem-dependent order; sort them so the list
# (and anything sampled from it) is stable between runs and machines.
syms_dirs = sorted(raw_data_dir.glob('simulacao*'))

# Create HDF5

In [3]:
# Pick one simulation folder at random, then one of the PNG files inside it.
sym_dir = np.random.choice(syms_dirs)
png_candidates = list(sym_dir.glob('*.png'))
img_fpath = np.random.choice(png_candidates)
img_fpath

PosixPath('/home/ctc_das/Desktop/part_counting/data/raw/render_results/simulacao245/simulacao245_0032.png')

In [4]:
# Read the selected PNG with Open3D (the path is passed as a plain string).
img = o3d.io.read_image(str(img_fpath))
img

Image of size 512x512, with 4 channels.
Use numpy.asarray to access buffer data.

In [42]:
# Target file for the single-image experiment.
h5_fpath = project_dir / 'data' / 'interim' / 'single_render.hdf5'

# TODO: explore chunking (see https://docs.h5py.org/en/stable/high/file.html#chunk-cache
# and https://portal.hdfgroup.org/display/HDF5/Chunking+in+HDF5);
# it may be useful for keeping the hdf5 in HDD.
# The handle is left open here: the following cells write into it and then
# close it explicitly.
h = h5py.File(name=str(h5_fpath), mode="w")
h

<HDF5 file "single_render.hdf5" (mode r+)>

In [43]:
# Keep channels 1 and 2 of the image buffer and reorder the axes so that the
# channel axis comes first: (rows, cols, channels) -> (channels, rows, cols).
data = np.array(img)[:, :, 1:3].transpose(2, 0, 1)
data.shape

(2, 512, 512)

In [44]:
# Dataset key: the file name with every '.png' and 'simulacao' occurrence removed.
dset_name = img_fpath.name
for token in ('.png', 'simulacao'):
    dset_name = dset_name.replace(token, '')
dset_name

'245_0032'

In [45]:
# Store the array under the key derived from the file name; no chunking or
# compression options are passed here.
h.create_dataset(dset_name, data=data)

<HDF5 dataset "245_0032": shape (2, 512, 512), type "|u1">

In [46]:
# Close the handle opened earlier so the file is complete on disk before its
# size is inspected.
h.close()

In [47]:
# Size of the HDF5 file on disk, in bytes.
h5_fpath.stat().st_size

526336

In [48]:
# Size of the source PNG on disk, in bytes, for comparison.
img_fpath.stat().st_size

392919

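So HDF5 stores the raw array plus a small header (526336 bytes on disk vs. 2 × 512 × 512 = 524288 bytes of pixel data). The file is larger than the PNG, as expected, because the PNG compression is dropped.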

# Create HDF5 with multiple images

In [172]:
# Draw a reproducible sample of distinct renders from across all simulations.
N_SAMPLED_IMGS = 1010
SAMPLE_SEED = 0

# Sort first: glob() order is filesystem-dependent, so seeding alone would not
# make the sample stable between runs.
all_img_fpaths = sorted(raw_data_dir.glob('*/*.png'))

# replace=False guarantees distinct files; it raises ValueError if fewer than
# N_SAMPLED_IMGS images are available.
img_fpaths = np.random.default_rng(SAMPLE_SEED).choice(
    all_img_fpaths, N_SAMPLED_IMGS, replace=False
)

# Sanity check: number of distinct paths drawn (should equal N_SAMPLED_IMGS).
len(np.unique(img_fpaths))

1010

In [173]:
# Total on-disk size, in bytes, of the sampled PNG files.
imgs_size = sum(png_fpath.stat().st_size for png_fpath in img_fpaths)
imgs_size

399427682

In [174]:
h5_fpath = project_dir / 'data' / 'interim' / 'multiple_renders.hdf5'

# One dataset per sampled image, keyed by the file name stripped of '.png'
# and 'simulacao'.
with h5py.File(str(h5_fpath), "w") as h:
    for img_fpath in img_fpaths:
        img = o3d.io.read_image(str(img_fpath))
        # keep channels 1 and 2 and move the channel axis to the front
        data = np.array(img)[:, :, 1:3].transpose(2, 0, 1)

        dset_name = img_fpath.name
        for token in ('.png', 'simulacao'):
            dset_name = dset_name.replace(token, '')

        h.create_dataset(name=dset_name, data=data)

# Size of the resulting file on disk, in bytes.
h5_size = h5_fpath.stat().st_size
h5_size

529909264

In [175]:
(h5_size - imgs_size) / imgs_size

0.32667135474100667

# Reading speed comparison

In [176]:
h5_fpath = Path('/data/multiple_renders.hdf5')

# Write the same sampled images to this location and remember every dataset
# key, in insertion order, for the read benchmarks that follow.
all_dsets = []
with h5py.File(str(h5_fpath), "w") as h:
    for img_fpath in img_fpaths:
        img = o3d.io.read_image(str(img_fpath))
        # keep channels 1 and 2 and move the channel axis to the front
        data = np.array(img)[:, :, 1:3].transpose(2, 0, 1)

        dset_name = img_fpath.name
        for token in ('.png', 'simulacao'):
            dset_name = dset_name.replace(token, '')
        all_dsets.append(dset_name)

        h.create_dataset(name=dset_name, data=data)

# Size of the resulting file on disk, in bytes.
h5_size = h5_fpath.stat().st_size
h5_size

529909264

In [177]:
%%time

# Baseline: decode every sampled PNG into a numpy array (no channel slicing,
# so the full image is materialised). The result is discarded; only the
# elapsed time matters.
for img_fpath in img_fpaths:
    np.array(o3d.io.read_image(str(img_fpath)))

CPU times: user 5.38 s, sys: 83.4 ms, total: 5.47 s
Wall time: 5.47 s


In [178]:
%%time

# Open the HDF5 file once and read every dataset fully into memory
# ([:] forces the actual read).
with h5py.File(h5_fpath, 'r') as h:
    for dset_name in all_dsets:
        d = h[dset_name][:]

CPU times: user 100 ms, sys: 75.2 ms, total: 176 ms
Wall time: 175 ms


In [179]:
%%time

# Same reads as above, but the file is re-opened for every dataset so the
# per-access open/close cost is included in the measurement.
for dset_name in all_dsets:
    with h5py.File(h5_fpath, 'r') as h:
        d = h[dset_name][:]

CPU times: user 261 ms, sys: 83.3 ms, total: 345 ms
Wall time: 344 ms


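So reading from HDF5 is a lot faster than decoding the PNGs (roughly 0.2–0.3 s vs. 5.5 s for the same images), which is expected because the HDF5 data is stored uncompressed. Note that the HDF5 datasets hold only 2 of the 4 image channels, so less data is read as well.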

In [180]:
compressed_h5_fpath = Path('/data/multiple_renders_compressed.hdf5')

# Same per-image layout as the uncompressed file, but every dataset is
# written with gzip compression.
with h5py.File(str(compressed_h5_fpath), "w") as h:
    for img_fpath in img_fpaths:
        img = o3d.io.read_image(str(img_fpath))
        # keep channels 1 and 2 and move the channel axis to the front
        data = np.array(img)[:, :, 1:3].transpose(2, 0, 1)

        dset_name = img_fpath.name
        for token in ('.png', 'simulacao'):
            dset_name = dset_name.replace(token, '')

        h.create_dataset(name=dset_name, data=data, compression='gzip')

# Size of the compressed file on disk, in bytes.
compressed_h5_size = compressed_h5_fpath.stat().st_size
compressed_h5_size

284132486

In [181]:
%%time

# Per-dataset open + full read from the gzip-compressed file, so the timing
# includes decompression.
for dset_name in all_dsets:
    with h5py.File(compressed_h5_fpath, 'r') as h:
        d = h[dset_name][:]

CPU times: user 2.69 s, sys: 54.7 ms, total: 2.74 s
Wall time: 2.74 s


Yet even with gzip compression, which gives a smaller file than the PNGs (about 284 MB vs. 399 MB), reading from HDF5 is still about twice as fast (2.74 s vs. 5.47 s).

# Test multiple images in dataset

Store all renders in a single dataset, which can also be chunked and made resizable.

In [182]:
# Sample distinct simulation folders, reproducibly.
N_SAMPLED_SIMS = 10

# replace=False matters: sampling with replacement (the default) could draw
# the same simulation more than once. It raises ValueError if fewer than
# N_SAMPLED_SIMS folders exist. Sorting makes the seeded draw independent of
# the order in which the folders were listed.
syms_dirs_sample = np.random.default_rng(0).choice(
    sorted(syms_dirs), N_SAMPLED_SIMS, replace=False
)
len(syms_dirs_sample)

10

In [183]:
# Total on-disk size, in bytes, of every PNG in the sampled simulation folders.
imgs_size = sum(
    png_fpath.stat().st_size
    for sampled_dir in syms_dirs_sample
    for png_fpath in sampled_dir.glob('*.png')
)
imgs_size

399693722

In [261]:
h5_fpath = Path('/data/single_dataset.hdf5')

with h5py.File(str(h5_fpath), "w") as h:
    # A single 5-D dataset indexed as (simulation, render, channel, row, col),
    # chunked so that each chunk holds exactly one 2-channel render.
    dset = h.create_dataset(
        name='renders',
        shape=(len(syms_dirs_sample), 101, 2, 512, 512),
        dtype='uint8',
        chunks=(1, 1, 2, 512, 512),
    )
    for i, sym_dir in enumerate(syms_dirs_sample):
        for img_fpath in sym_dir.glob('*.png'):
            img = o3d.io.read_image(str(img_fpath))
            # keep channels 1 and 2 and move the channel axis to the front
            data = np.array(img)[:, :, 1:3].transpose(2, 0, 1)

            # Render index = the number after the last '_' in the file name.
            data_i = int(img_fpath.name.replace('.png', '').split('_')[-1])

            dset[i, data_i, ...] = data

# Size of the resulting file on disk, in bytes.
h5_size = h5_fpath.stat().st_size
h5_size

529628328

In [262]:
%%time

# Read every (simulation, render) entry of the 'renders' dataset, re-opening
# the file on each access.
for i in range(len(syms_dirs_sample)):
    for j in range(101):
        with h5py.File(h5_fpath, 'r') as h:
            d = h['renders']
            # indexing the dataset is what triggers the read from disk
            d[i,j]

CPU times: user 308 ms, sys: 59.1 ms, total: 368 ms
Wall time: 367 ms


In [264]:
%%time

h5_fpath = Path('/data/single_dataset_resizable.hdf5')

with h5py.File(str(h5_fpath), "w") as h:
    # Start with zero simulations; the first axis is unbounded (None in
    # maxshape) and is grown by one before each simulation is written.
    dset = h.create_dataset(
        name='renders',
        shape=(0, 101, 2, 512, 512),
        maxshape=(None, 101, 2, 512, 512),
        dtype='uint8',
        chunks=(1, 1, 2, 512, 512),
    )
    for i, sym_dir in enumerate(syms_dirs_sample):
        dset.resize(i + 1, axis=0)
        for img_fpath in sym_dir.glob('*.png'):
            img = o3d.io.read_image(str(img_fpath))
            # keep channels 1 and 2 and move the channel axis to the front
            data = np.array(img)[:, :, 1:3].transpose(2, 0, 1)

            # Render index = the number after the last '_' in the file name.
            data_i = int(img_fpath.name.replace('.png', '').split('_')[-1])

            dset[i, data_i, ...] = data

# Size of the resulting file on disk, in bytes.
h5_size = h5_fpath.stat().st_size
h5_size

CPU times: user 6.16 s, sys: 0 ns, total: 6.16 s
Wall time: 8.61 s


529628328

In [287]:
%%time

# Read every (simulation, render) entry, re-opening the file on each access.
# NOTE(review): this reads whatever `h5_fpath` currently points to; the
# variable is reused for several files and the execution counts are out of
# order, so confirm on a fresh top-to-bottom run which file is being timed.
for i in range(len(syms_dirs_sample)):
    for j in range(101):
        with h5py.File(h5_fpath, 'r') as h:
            d = h['renders']
            d[i,j]

CPU times: user 363 ms, sys: 0 ns, total: 363 ms
Wall time: 363 ms


In [286]:
%%time

# Same access pattern, but reading a file located under the project directory.
# NOTE(review): no visible cell writes this path (the resizable file is
# written to /data) — presumably it was copied there by hand to compare
# storage locations; TODO confirm and make the copy explicit.
for i in range(len(syms_dirs_sample)):
    for j in range(101):
        with h5py.File(project_dir/'data/interim/single_dataset_resizable.hdf5', 'r') as h:
            d = h['renders']
            d[i,j]

CPU times: user 375 ms, sys: 0 ns, total: 375 ms
Wall time: 374 ms


## With compression

In [266]:
h5_fpath = Path('/data/compressed_chunked_dataset.hdf5')

with h5py.File(str(h5_fpath), "w") as h:
    # Same 5-D layout and one-render-per-chunk chunking as the uncompressed
    # dataset, with gzip compression enabled.
    dset = h.create_dataset(
        name='renders',
        shape=(len(syms_dirs_sample), 101, 2, 512, 512),
        dtype='uint8',
        chunks=(1, 1, 2, 512, 512),
        compression='gzip',
    )
    for i, sym_dir in enumerate(syms_dirs_sample):
        for img_fpath in sym_dir.glob('*.png'):
            img = o3d.io.read_image(str(img_fpath))
            # keep channels 1 and 2 and move the channel axis to the front
            data = np.array(img)[:, :, 1:3].transpose(2, 0, 1)

            # Render index = the number after the last '_' in the file name.
            data_i = int(img_fpath.name.replace('.png', '').split('_')[-1])

            dset[i, data_i, ...] = data

# Size of the resulting file on disk, in bytes.
h5_size = h5_fpath.stat().st_size
h5_size

282874222

In [276]:
%%time

# Read every (simulation, render) entry, re-opening the file on each access.
# NOTE(review): relies on `h5_fpath` still pointing at the gzip-compressed
# file written in the preceding cell — confirm on a fresh top-to-bottom run.
for i in range(len(syms_dirs_sample)):
    for j in range(101):
        with h5py.File(h5_fpath, 'r') as h:
            d = h['renders']
            d[i,j]

CPU times: user 2.3 s, sys: 0 ns, total: 2.3 s
Wall time: 2.3 s


In [277]:
%%time

# Same access pattern, but reading a file located under the project directory.
# NOTE(review): no visible cell writes this path (the compressed file is
# written to /data) — presumably it was copied there by hand to compare
# storage locations; TODO confirm and make the copy explicit.
for i in range(len(syms_dirs_sample)):
    for j in range(101):
        with h5py.File(project_dir/'data/interim/compressed_chunked_dataset.hdf5', 'r') as h:
            d = h['renders']
            d[i,j]

CPU times: user 2.32 s, sys: 0 ns, total: 2.32 s
Wall time: 2.32 s
