In [5]:

# Notes from https://docs.h5py.org/en/stable/quick.html

# h5py lets you store huge amounts of numerical data, and easily manipulate that data from NumPy.  
# For example, you can slice into multi-terabyte datasets stored on disk, as if they were real NumPy arrays

# To save more space, HDF5 datasets can be compressed. Instructions here:
# https://www.christopherlovell.co.uk/blog/2016/04/27/h5py-intro.html

# hdf5 is optimised for very large datasets


import h5py
import os
import numpy as np

# Show the current working directory: the relative file name "mytestfile.hdf5"
# used in the cells below is resolved against it.
print(os.getcwd())


/Users/apple/Desktop/quant_projects/HDF5


In [15]:
# 'datasets' are like numpy arrays
# 'groups' are containers which hold 'datasets' (and other groups) and act like
# dictionaries: each member is looked up by its key (its name)

# keys of 'groups' follow a file-system-like path structure (e.g. '/subgroup/another_dataset')



In [8]:
# 'f' is an h5py File object; it can also be used as a context manager
# (`with h5py.File(...) as f:`), as done in later cells.
f = h5py.File("mytestfile.hdf5", "w")  # 'w' (write mode) creates a new hdf5 file; per the h5py docs it also
                        # truncates an existing file of that name, so 'a' (append) is often more appropriate



In [17]:
# The second argument of create_dataset is the dataset *shape* (a tuple), not
# its contents; the dataset is allocated empty with that shape and dtype.
# To store an existing numpy array, pass it via the `data=` keyword instead.
fake_data = (100, )   # shape of the new dataset: 1-D, 100 elements
dset = f.create_dataset("mydataset", shape=fake_data, dtype='i')

In [19]:
dset.name   # absolute path of this dataset within the hdf5 file's group hierarchy

'/mydataset'

In [20]:
f.name    # 'f' is itself a group: the root group at the top of the structure, named '/'

'/'

In [None]:
# NOTE(review): this cell was never executed (In [None]) and `f` is still open
# from the earlier 'w' call at this point; presumably it is only meant for
# resuming from a closed file -- confirm before including it in a Run All.
f = h5py.File('mytestfile.hdf5', 'a')  # opening file in append mode

In [21]:
# Create a subgroup explicitly, then create a dataset inside that subgroup.
grp = f.create_group("subgroup")
dset2 = grp.create_dataset("another_dataset", shape=(50,), dtype='f')
dset2.name

'/subgroup/another_dataset'

In [22]:
# Giving a path as the dataset name creates the intermediate group
# ('subgroup2') implicitly.
dset3 = f.create_dataset('subgroup2/dataset_three', shape=(10,), dtype='i')
dset3.name

'/subgroup2/dataset_three'

In [25]:
# View all top-level members of the file (non-recursive). Iterating a group
# yields the names of its direct children. A plain loop is used rather than a
# list comprehension so no throwaway list of None values is built and echoed.
for name in f:
    print(name)

mydataset
subgroup
subgroup2

In [26]:
# Walk the whole file recursively: visit() calls the given function with the
# name of every group and dataset below `f`.
f.visit(print)

mydataset
subgroup
subgroup/another_dataset
subgroup2
subgroup2/dataset_three


In [30]:
# The file (root group) exposes dict-style views of its direct members.
for view in (f.keys(), f.values(), f.items()):
    print(view)

<KeysViewHDF5 ['mydataset', 'subgroup', 'subgroup2']>
ValuesViewHDF5(<HDF5 file "mytestfile.hdf5" (mode r+)>)
ItemsViewHDF5(<HDF5 file "mytestfile.hdf5" (mode r+)>)


In [36]:
# Attributes (metadata) can be attached to an individual dataset through its
# `.attrs` mapping, which is used like a dict.
attributes = dset.attrs
attributes['temperature'] = 99.5
print(attributes.keys())          # view keys
print(attributes['temperature'])  # get val


<KeysViewHDF5 ['temperature']>
99.5


In [38]:
f.close()   # close the file: flushes any buffered changes to disk (before this, changes are not guaranteed to be on disk)

In [45]:
# Re-open the hdf5 file (append mode) and list everything stored in it.
f = h5py.File('mytestfile.hdf5', mode='a')
f.visit(print)

mydataset
subgroup
subgroup/another_dataset
subgroup2
subgroup2/dataset_three


In [53]:
# Add a numpy array as a new, resizable dataset.
data_to_write = np.random.random(size=(100,20))

# create_dataset fails if the name is already taken, so drop any dataset left
# over from a previous run; this keeps the cell re-runnable.
if "numpy_test2" in f:
    del f["numpy_test2"]

# maxshape=(None, None) makes both axes unlimited, which is what allows the
# dataset to be grown later with resize(). Without maxshape the dataset's
# shape is fixed at creation.
f.create_dataset("numpy_test2",  data=data_to_write, maxshape=(None,None))


<HDF5 dataset "numpy_test2": shape (100, 20), type "<f8">

In [54]:
# Read the entire dataset back into memory as a numpy array.
data_output = f['numpy_test2'][...]
print(data_output.shape, type(data_output), sep="\n")

(100, 20)
<class 'numpy.ndarray'>


In [61]:
# Append a numpy array to an existing h5py dataset: grow the dataset along
# axis 0, then write the new rows into the freshly added space.
new_array_to_append = np.random.random(size=(50,20))
target = f["numpy_test2"]
n_new_rows = new_array_to_append.shape[0]
print(target.shape)  # shape before

# add empty rows at the bottom
target.resize(target.shape[0] + n_new_rows, axis=0)

print(target.shape)  # shape after adding more rows
print(target[-2:])   # bottom 2 rows before they are populated

# fill the new rows with the incoming values
target[-n_new_rows:] = new_array_to_append
print(target[-2:])   # bottom 2 rows now they are populated


(400, 20)
(450, 20)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[6.62907245e-01 4.85871072e-01 9.05875034e-01 3.81596814e-01
  8.36464085e-01 9.94201460e-01 8.01879593e-01 8.59785088e-01
  6.09562413e-01 3.35170721e-02 1.84888052e-01 2.53130979e-01
  8.33279254e-03 4.75287368e-01 2.82348758e-01 7.66505240e-02
  5.78233168e-01 4.59446264e-01 3.54505164e-01 9.32939336e-01]
 [2.82860724e-01 4.17865141e-02 5.92793606e-01 6.10683886e-05
  2.11387466e-01 1.97272352e-01 3.19472313e-01 3.22475727e-01
  1.93800500e-02 2.90371766e-02 2.93951089e-01 8.77412052e-01
  6.32937861e-01 3.61012209e-01 6.23770265e-01 7.95583293e-01
  9.46753998e-01 1.00695129e-02 3.11110836e-01 5.29381822e-01]]


In [67]:
# Closing flushes everything done above to the .hdf5 file on disk; until the
# file is flushed or closed, the changes are not guaranteed to be on disk.
f.close()

In [78]:
# `with` is another way to read/write/delete/manipulate files: the file is
# closed automatically when the block exits, so no explicit close() is needed.
data_to_write = np.random.random(size=(4,4))

with h5py.File("mytestfile.hdf5", "a") as f:
    # Delete any existing dataset of this name so it can be re-created below.
    # The membership check avoids a KeyError the first time this cell runs,
    # when the dataset does not exist yet.
    if "numpy_f2_floats" in f:
        del f['numpy_f2_floats']
    f.create_dataset("numpy_f2_floats",  data=data_to_write, maxshape=(None,None), dtype='f2')
    print(f["numpy_f2_floats"][:])


[[0.4514  0.7266  0.565   0.1891 ]
 [0.10846 0.1633  0.05826 0.7046 ]
 [0.09766 0.0739  0.4036  0.0392 ]
 [0.404   0.911   0.98    0.628  ]]


In [80]:
# List every group and dataset in the file; the file is closed on exit.
with h5py.File("mytestfile.hdf5", mode="a") as f:
    f.visit(print)

mydataset
numpy_f2_floats
numpy_i2_integers
numpy_test
numpy_test2
subgroup
subgroup/another_dataset
subgroup2
subgroup2/dataset_three


In [81]:
# Demonstration: an h5py Dataset cannot be compared directly with a number;
# the data has to be read into a numpy array first (e.g. f[name][:] > 0.5).
# The expected TypeError is caught and printed so the notebook still runs
# top to bottom.
with h5py.File("mytestfile.hdf5", "a") as f:
    try:
        print(f['numpy_f2_floats'] > 0.5)
    except TypeError as err:
        print(f"TypeError: {err}")

TypeError: '>' not supported between instances of 'Dataset' and 'float'

In [84]:
with h5py.File("mytestfile.hdf5", "a") as f:
    # Remove a dataset left over from a previous run so it can be re-created.
    # The membership check avoids a KeyError on the first run, when
    # 'autochunk' does not exist yet.
    if "autochunk" in f:
        del f['autochunk']
    # chunks=True asks h5py to choose a chunk layout automatically
    dset = f.create_dataset("autochunk", (4, 4), chunks=True)
    print(f['autochunk'][:])

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [91]:
# Datasets support numpy-style boolean-mask selection for filtering.
# Docs warn this gets slow with masks over 1000 values in length (ref: https://docs.h5py.org/en/stable/high/dataset.html#reading-writing-data)
with h5py.File("mytestfile.hdf5", "a") as f:
    floats = f['numpy_f2_floats']
    mask = floats[:] > 0.5     # read into numpy, then compare element-wise
    print(floats[mask])        # select the matching elements from the dataset
    
    

[0.7266 0.565  0.7046 0.911  0.98   0.628 ]


In [108]:
# Select the rows whose first-column value exceeds a threshold.
with h5py.File("mytestfile.hdf5", "a") as f:
    data = f['numpy_f2_floats'][:]   # read the dataset from disk once
    row_mask = data[:, 0] > 0.4      # boolean mask over rows, from column 0
    vals = data[row_mask]            # keep only the matching rows
    print(vals)


[[0.4514 0.7266 0.565  0.1891]
 [0.404  0.911  0.98   0.628 ]]


In [None]:
# TODO: there should be a way to filter a giant dataset by the values in a
# particular column (this seems aligned with what hdf5 is optimised to do)


