In [50]:


## Notes from O'Reilly Python and HDF5 book


# note at end of this article on loading hdf5 into pytorch:
# https://blade6570.github.io/soumyatripathy/hdf5_blog.html


import numpy as np
import h5py
from timeit import timeit
import os
from pathlib import Path
from PIL import Image


In [None]:
temperature = np.random.random(1024)

In [31]:
# h5py uses C
# B-tree: tree where each branch connects to nodes with 2+ children
    # (instead of binary-tree where branch has exactly 2 children)
    
# 'groups' are stored in a B-tree, giving them the hierarchical format

# suggests just use h5py as pacakge does it (not optimising further) unless you reallllly need to (which
# I probably won't)

# context managers set up and teardown resources (such as file connections). Generally for 
# more computationally expensive resources

# driver types:
# Core driver, stores all data in memory
# f = h5py.File("name.hdf5", driver="core")   # to set context manager with core specified

# Family driver, splits data into chunks and one chunk in memory at a time
# f = h5py.File("family.hdf5", driver="family", memb_size=1024**3)  # 1gb at a time

# mpio driver: for accessing data from many cores at the same time (related to 'Parallel HDF5')

# 'locality': when reading off disk it's faster when the data is all stored as closely as possible

# 'chunking': specify n-dimensional shape that fits you access pattern

# chunks can be edited (compressed or decompressed) on their way out of retrival 
# dont need to do anything different for a chunked dataset except specify that it
# chunked originally

# says chunks over 1mb is bad idea as this is limit for it to use fast in-memory 'chunk cache'

# 'filter pipeline' = series operations performed on a chunk before it's written

# says rare for an application to spend most of its time compressing or decompressing data, 
# so try not to get carried away with speed testing







In [42]:
# set chunks=True to enable automatic chunking, where it decides chunk size for you
dse2t = f.create_dataset("Images2d", (100,480,640), 'f', chunks=True, compression='gzip')
print(dse2t.chunks)
print(dse2t.compression)

(7, 30, 80)
gzip


In [43]:
# set shuffle=True to enable shuffle prior to compression, saving space
dse2t = f.create_dataset("Images2d_2", (100,480,640), 'f', chunks=True, compression='gzip', shuffle=True)
print(dse2t.chunks)
print(dse2t.compression)
print(dse2t.shuffle)

# time to perform shuffle is negligible compared to compression, so cant see why you wouldnt do it

(7, 30, 80)
gzip
True


In [None]:
# LZF compression is a good alternative to gzip if working solely on python, and you dont mind
# larger file size in exchange for faster compression/decompression

# 

In [44]:
# FLETCHER32 = 32bit implementation of fletcher's checksum
print(dse2t.fletcher32)

# to enable fletcher32 checksum
# dset = myfile.create_dataset("Data2", (1000,), fletcher32=True)


False


In [None]:
# the BLOSC compressor used by the PyTables project is highly tuned for speed

In [34]:
# storing float64 data as float32 (converting) in hdf5
bigdata = np.ones((100,1000))
with h5py.File('big2.hdf5','w') as f2:
    f2.create_dataset('big', data=bigdata, dtype=np.float32)

f2 = h5py.File("big2.hdf5")
print(f2['big'].dtype)

f2.close()   # wouldnt expect to need this but file doesnt seem to close without it, even with 'with' above


float32


In [None]:
#dset4 = f["SubGroup/Dataset4"] # Right
#dset4 = f["SubGroup"]["Dataset4"]   # works but inefficient

In [20]:
# load australia photos into h5py
fpath = Path('Python and HDF5 notes.ipynb').absolute()
pathway = os.path.dirname(fpath)

file_up_one_level = os.path.split(pathway)[0]
os.chdir(file_up_one_level  + '/australia_photos')
print(os.listdir()[:5])


f = h5py.File("australia.hdf5", 'w-')   # w- makes new file but errors on attempt to overwrite

['P8020078.JPG', 'P8020087.JPG', 'P8110178.JPG', 'P7310037.JPG', 'P8100169.JPG']


In [46]:
# make subgroup to store images
f = h5py.File("australia.hdf5", 'a')
subgroup = f.create_group("aus_iages")

In [48]:
print(subgroup)

<HDF5 group "/aus_iages" (0 members)>

In [56]:
# now load the photos into hdf5
files = os.listdir()

with h5py.File('australia.hdf5','a') as f2:
    for file_path in files:
        img_np = np.array(Image.open(file_path))
        print(img_np.shape)
        obj_name_new = file_path.split(',', 1)[0]
        img_name = "/aus_iages/" + file_path
        f2.create_dataset(img_name, data=img_np, dtype=np.float32, shuffle=True,chunks=True,compression='gzip')

# this appears to be shockingly ineffienct to store vs jpeg (at least 6mb vs just under 1mb per image)

(2272, 1704, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1620, 2221, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(2272, 1704, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(2272, 1704, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1619, 2217, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1544, 2187, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1588, 2264, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(2272, 1704, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1557, 2200, 3)
(1608, 2235, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(1704, 2272, 3)
(2272, 1704, 3)


UnidentifiedImageError: cannot identify image file '.DS_Store'