In [103]:
import h5py
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import numpy as np
import zarr
import numcodecs
import string
import pickle

from numpy.lib.recfunctions import unstructured_to_structured
from Orange.data import Table, Domain


### Example 1 - HDF5

Convert metas into a structured array and store them as such in HDF5 file format.

Prior to storing strings on a disk, we need to encode them. 
[From H5PY docs:](https://docs.h5py.org/en/stable/strings.html#strings)

> When writing data to an existing dataset or attribute, data passed as bytes are written without checking the encoding. Data passed as Python str objects are encoded as either ASCII or UTF-8, based on the HDF5 datatype.

This means that we have to be aware of the encoding when reading data from the disk.

In [109]:
# Borrow the domain of the bundled 'iris' table as the object to persist.
example_domain = Table('iris').domain

# Dense numeric payload of uniform random floats.
data = np.random.random((50000, 10000))

# One random 10-character string per row, drawn from printable ASCII plus a
# handful of non-ASCII letters, then encoded to bytes with ISO 8859-2.
alphabet = list(string.printable) + list('ČĆŽĐŠžćčđš')
codes = np.random.choice(alphabet, size=[50000, 10])
ecoded_string = np.char.encode([''.join(code) for code in codes], encoding='iso8859_2')

# Three meta columns placed side by side: two random integer columns and the
# encoded strings held as Python objects.
meta_columns = (
    np.random.randint(2, size=50000),
    np.random.randint(100, size=50000),
    np.array(ecoded_string, dtype=object),
)
metas = np.column_stack(meta_columns)

# Turn every row into a tuple so NumPy can assemble a structured array with
# one named, typed field per meta column.
metas_dtype = np.dtype([
    ('meta1', int),
    ('meta2', float),
    ('meta3', h5py.string_dtype()),
])
structured_metas = np.array(list(map(tuple, metas)), dtype=metas_dtype)

with h5py.File('example-1.h5', 'w') as f:
    f.create_dataset('X', data=data)
    f.create_dataset('metas', data=structured_metas)
    # The domain is pickled and wrapped in np.void before it is stored.
    f.create_dataset('domain', data=np.void(pickle.dumps(example_domain)))

In [86]:
# Read back everything the previous cell wrote to 'example-1.h5'.
# NOTE(review): the Dask objects created below are lazy wrappers around the
# open h5py datasets; computing X, M or M_df after this `with` block has
# closed the file will presumably fail -- confirm before relying on them later.
with h5py.File('example-1.h5', 'r') as f:
    X = da.from_array(f['X'])
    # When creating a Dask array from object data we have to specify the chunk
    # size ourselves; Dask can't perform auto-chunking in that case:
    # NotImplementedError: Can not use auto rechunking with object dtype. We are unable to estimate the size in bytes of object data
    M = da.from_array(f['metas'], chunks=(-1,))
    # We can also read this as a Dask data frame:
    M_df = dd.from_array(f['metas'])

    # The domain was stored as a pickled blob wrapped in np.void; convert the
    # value back to raw bytes and unpickle it so the round trip covers the
    # domain as well.
    # Only unpickle files you created yourself: pickle can run arbitrary code.
    domain = pickle.loads(f['domain'][()].tobytes())

    # encoding/decoding sanity check: the strings were written as ISO 8859-2
    # bytes, so they must be decoded with the same codec when read.
    a = [''.join(code) for code in codes]
    b = [x.decode('iso8859_2') for x in f['metas']['meta3'][()]]
    print(a == b)

# Uncomment to inspect the loaded objects:
# print(X, M, M_df, domain)

True


### Example 2 - Zarr

Zarr file format can be a drop-in replacement for HDF5 with a similar API. It supports arrays of type Object by default; you only need to specify how it will be encoded/decoded (see example below).

Zarr allows us to read the .metas as object arrays directly into Dask, so in theory, no significant changes will be needed to support this in Orange. By contrast, if we stick with HDF5, we will have to handle an alternative representation of metas, namely structured arrays.

Things that we should be aware of:

- Zarr (by default) reads/writes data into a directory and not a single file. [The storage alternative](https://zarr.readthedocs.io/en/stable/tutorial.html?highlight=chunk#storage-alternatives) that is interesting for us at this point is using a ZIP file (I don't know if there are noticeable performance differences between the two). In the example below, we zip the folder with ```7z a -tzip example-2.zarr.zip example-2.zarr/.``` and open the file as we would usually do.


- When using Zarr, we must store the domain as a dataset because internally, Zarr uses JSON to store array attributes, so attribute values must be JSON serializable.

In [174]:
# Same kind of synthetic data as in the HDF5 example, except that the strings
# are kept as Python str objects instead of being encoded to bytes first.
example_domain = Table('iris').domain
data = np.random.random((50000, 10000))
alphabet = list(string.printable) + list('ČĆŽĐŠžćčđš')
codes = np.random.choice(alphabet, size=[50000, 10])
meta_columns = (
    np.random.randint(2, size=50000),
    np.random.randint(100, size=50000),
    np.array([''.join(code) for code in codes], dtype=object),
)
metas = np.column_stack(meta_columns)

with zarr.open('example-2.zarr', 'w') as f:
    table = f.create_group('table')
    table.create_dataset('X', data=data)
    # The metas array needs an object codec; MsgPack is used here.
    table.create_dataset('metas', data=metas, object_codec=numcodecs.MsgPack())

    # Store each kind of domain variable in its own pickled dataset inside a
    # 'domain' group. Reading one of them back yields an array of variables.
    domain = f.create_group('domain')
    for part in ('attributes', 'class_vars', 'metas'):
        domain.create_dataset(part, data=getattr(example_domain, part), object_codec=numcodecs.Pickle())

    # Alternatively, we could store the whole pickled domain as one dataset:
    domain = f.create_dataset('domain_2', data=pickle.dumps(example_domain))
    # or hand the domain object itself to Zarr together with a Pickle codec:
    domain = f.create_dataset('domain_3', data=example_domain, object_codec=numcodecs.Pickle())


# or use zarr.zip
with zarr.open('example-2.zarr', 'r') as f:
    stored_table = f['table']
    X = da.from_array(stored_table['X'])
    # The chunk shape for the metas array is given explicitly.
    M = da.from_array(stored_table['metas'], chunks=(-1, 1))
    # The metas can also be loaded as a Dask data frame:
    M_df = dd.from_array(stored_table['metas'])

    # Rebuild the Domain from the three per-kind datasets.
    stored_domain = f['domain']
    domain = Domain(
        stored_domain['attributes'][()],
        stored_domain['class_vars'][()],
        stored_domain['metas'][()],
    )  # Domain

    # Only unpickle data you wrote yourself: pickle can run arbitrary code.
    domain_2 = pickle.loads(f['domain_2'][()])  # Domain
    domain_3 = f['domain_3'][()]  # This is an array of domain variables.