### h5py package to create HDF5 file

Link: http://docs.h5py.org/en/latest/mpi.html

An HDF5 file is a container for two kinds of objects: 
- datasets, which are array-like collections of data, 
- groups, which are folder-like containers that hold datasets and other groups.

Groups work like dictionaries, and datasets work like NumPy arrays.

In [17]:
import h5py
import numpy as np
import pandas as pd

In [27]:
!rm mytestfile.hdf5

In [28]:
# Open a new HDF5 file for writing (mode "w" creates it, truncating an existing one)
f = h5py.File(name="mytestfile.hdf5", mode="w")

In [100]:
# path of the file on disk, and the HDF5 name of the file's root group
f.filename, f.name

('mytestfile.hdf5', '/')

#### dataset

In [29]:
# A file can hold both groups and datasets; start with datasets.
# 1) Build a dataset from an existing NumPy array: shape and dtype come from the array
data = np.random.logistic(size=100)
dsetdata = f.create_dataset(name="dsetdata", data=data)
print(dsetdata)

# 2) Allocate a dataset without data by giving its shape and dtype
#    ('i' is integer, 'f' is float, etc.)
dsetname = f.create_dataset(name="dsetname", shape=(100,), dtype='i')
print(dsetname)

<HDF5 dataset "dsetdata": shape (100,), type "<f8">
<HDF5 dataset "dsetname": shape (100,), type "<i4">


In [35]:
# a dataset exposes NumPy-like attributes: element type, shape and number of elements
dsetdata.dtype, dsetdata.shape, dsetdata.size

(dtype('float64'), (100,), 100)

In [99]:
# list the file's top-level members as (name, object) pairs
for member in f.items():
    print(member)

('dsetdata', <HDF5 dataset "dsetdata": shape (100,), type "<f8">)
('dsetname', <HDF5 dataset "dsetname": shape (100,), type "<i4">)


In [60]:
# Datasets support NumPy-style slicing for both writes and reads.
dsetname[0] = 5
# dsetname was created with dtype='i', so these float samples are stored as integers
dsetname[5:15] = np.random.uniform(low=0, high=5, size=10)
# strided read: every second element of positions 10..19
dsetname[10:20:2]

array([2, 1, 4, 0, 0], dtype=int32)

In [58]:
# Dataset.value was deprecated and then removed in h5py 3.0; indexing with an
# empty tuple, dset[()], reads the whole dataset into a NumPy array instead.
dsetname[()], dsetname.name, dsetname.fillvalue, dsetname.shuffle

(array([5, 0, 0, 0, 0, 4, 4, 4, 4, 0, 1, 3, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), '/dsetname', 0, False)

#### groups

In [106]:
# "HDF" stands for "Hierarchical Data Format": every object in the file has a
# name, and names form a POSIX-style hierarchy with "/" as the separator.
print(f.name, dsetname.name, dsetdata.name)

# a group is created from its parent (here the file's root group) ...
grp = f.create_group(name="subgroup")

# ... and a dataset created through the group variable lives underneath it
dset2 = grp.create_dataset(name="another_dataset", shape=(50,), dtype='f')
dset2.name

/ /dsetname /dsetdata


'/subgroup/another_dataset'

In [107]:
# when a full path is given, the missing intermediate groups are created on the fly
dset3 = f.create_dataset(name='subgroup2/dataset_three', shape=(10,), dtype='i')
dset3.name

'/subgroup2/dataset_three'

In [109]:
# objects can be fetched by their full path, dictionary-style
dataset_three = f['subgroup2/dataset_three']
dataset_three

<HDF5 dataset "dataset_three": shape (10,), type "<i4">

In [113]:
# membership tests accept a full path as well
"subgroup/another_dataset" in f

True

In [114]:
# Not the way to inspect the whole tree: iterating over a file (or group)
# yields only the names of its direct children.
for child_name in f.keys():
    print(child_name)

dsetdata
dsetname
subgroup
subgroup2


In [132]:
# To walk a whole file or group, use visit() or visititems(); both take a callable.
def printname(x):
    """Visitor callback: print the member name it receives on its own line.

    Returns None on purpose: per the h5py documentation, visit() stops
    walking as soon as the callback returns anything other than None.
    """
    print(x)
    return None


# walk everything below the file's root group
f.visit(printname)
print('')  # blank line between the two listings
# walk only what is below `grp`; the names printed are relative to that group
grp.visit(printname)

dsetdata
dsetname
subgroup
subgroup/another_dataset
subgroup2
subgroup2/dataset_three

another_dataset


#### metadata

In [87]:
# Metadata lives right next to the object (group or dataset) it describes, in
# its dictionary-like .attrs. The keys/values below are Italian:
# 'descrizione' = description, 'dati a caso' = random data, 'data' = date.
metadata = (
    ('descrizione', 'dati a caso'),
    ('data', '04/04/2014'),
    ('pippo', 150),
)
for attr_name, attr_value in metadata:
    dsetname.attrs[attr_name] = attr_value
'data' in dsetname.attrs

True

In [92]:
# dump every attribute as a (name, value) pair
for attribute in dsetname.attrs.items():
    print(attribute)

('descrizione', 'dati a caso')
('data', '04/04/2014')
('pippo', 150)


In [166]:
# done with mytestfile.hdf5: close the handle
f.close()

#### From pandas DataFrame to HDF5 and vice versa

In [239]:
!rm iris.hdf5

In [240]:
# pandas (imported as pd in the import cell at the top of the notebook) can
# write a DataFrame straight to HDF5.
df = pd.read_csv('iris.csv')
df.to_hdf(path_or_buf='iris.hdf5', key='iris_', mode='w', format='fixed')
# 'fixed' format: fast writing/reading, but neither appendable nor searchable
# 'table' format: written as a PyTables Table structure, which may perform worse
#                 but allows more flexible operations such as searching or
#                 selecting subsets of the data

In [231]:
# Reopen the pandas-written file with h5py to see how the columns were stored.
# The mode is passed explicitly: this cell only reads, and the default mode
# differs between h5py versions (read/write-or-create before 3.0, read-only since).
newf = h5py.File('iris.hdf5', 'r')

newf.visit(printname)

iris_
iris_/axis0
iris_/axis1
iris_/block0_items
iris_/block0_values
iris_/block1_items
iris_/block1_values
iris_/block2_items
iris_/block2_values


In [232]:
# Show the object stored under every top-level group.
# Assumes each top-level member is itself a group (it must provide .keys()).
for group_name in newf.keys():
    for member_name in newf[group_name].keys():
        print(newf['/'.join((group_name, member_name))])

<HDF5 dataset "axis0": shape (7,), type "|S12">
<HDF5 dataset "axis1": shape (150,), type "<i8">
<HDF5 dataset "block0_items": shape (4,), type "|S12">
<HDF5 dataset "block0_values": shape (150, 4), type "<f8">
<HDF5 dataset "block1_items": shape (2,), type "|S10">
<HDF5 dataset "block1_values": shape (150, 2), type "<i8">
<HDF5 dataset "block2_items": shape (1,), type "|S7">
<HDF5 dataset "block2_values": shape (1,), type "|O">


In [233]:
# Print the full path and the content of every dataset in the file.
# Dataset.value was removed in h5py 3.0; node[()] reads the whole dataset instead.
for group_name in newf.keys():
    for member_name in newf[group_name].keys():
        # look the object up once and reuse it for both prints
        node = newf[group_name + '/' + member_name]
        print(node.name)
        print(node[()], end='\n\n')

/iris_/axis0
[b'Unnamed: 0' b'sepal_length' b'sepal_width' b'petal_length'
 b'petal_width' b'species' b'species_id']

/iris_/axis1
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149]

/iris_/block0_items
[b'sepal_length' b'sepal_width' b'petal_length' b'petal_width']

/iris_/block0_values
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1

In [234]:
# release the h5py handle before pandas opens the same file again
newf.close()

In [235]:
# Read the HDF5 file back into a pandas DataFrame.
# The key is given explicitly: read_hdf can only infer it while the file holds a
# single pandas object, and a later cell stores a second one under 'data'.
df_hdf5 = pd.read_hdf('iris.hdf5', key='iris_')
df_hdf5.head()

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,0,5.1,3.5,1.4,0.2,setosa,1
1,1,4.9,3.0,1.4,0.2,setosa,1
2,2,4.7,3.2,1.3,0.2,setosa,1
3,3,4.6,3.1,1.5,0.2,setosa,1
4,4,5.0,3.6,1.4,0.2,setosa,1


In [245]:
# Store the frame under a second key, 'data', next to 'iris_'. Because the
# format is 'table', rows can be appended: writing the frame twice stacks the
# two copies. Note that re-running this cell appends two more copies each time.
# `key` is passed by keyword: recent pandas releases deprecate passing it positionally.
df.to_hdf('iris.hdf5', key='data', append=True, format='table')
df.to_hdf('iris.hdf5', key='data', append=True, format='table')

In [248]:
# to retrieve the data you must pass the key it was stored under (here 'data' or 'iris_')
df_hdf5 = pd.read_hdf(path_or_buf='iris.hdf5', key='data')
print(df_hdf5.shape[0])
df_hdf5

300


Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,0,5.1,3.5,1.4,0.2,setosa,1
1,1,4.9,3.0,1.4,0.2,setosa,1
2,2,4.7,3.2,1.3,0.2,setosa,1
3,3,4.6,3.1,1.5,0.2,setosa,1
4,4,5.0,3.6,1.4,0.2,setosa,1
5,5,5.4,3.9,1.7,0.4,setosa,1
6,6,4.6,3.4,1.4,0.3,setosa,1
7,7,5.0,3.4,1.5,0.2,setosa,1
8,8,4.4,2.9,1.4,0.2,setosa,1
9,9,4.9,3.1,1.5,0.1,setosa,1


### HDF5 also supports (see the h5py documentation linked at the top):
- parallelism
- chunked storage
- compression
- dimension scales