In [1]:
import os

import h5py
import dask.array as da
import numpy as np

# Breast cancer tissue microarray from Mayerich
# https://stim.ee.uh.edu/resources/data-sets/

# the file was converted from hdr to hdf5 for better reading with
# dask, see case-studies-scripts/spectra-breast-cancer-pca.ipynb

fn = "/slow/marko/stone/n/brc961-br1001.hdf5"  # 116 GB

# The file is not closed here: the dask array below is built on top of it
# and is still used by later cells.
f = h5py.File(fn, "r")
data = f["data"]
# chunk 100 x 100 along the first two axes; -1 keeps the last axis in one chunk
x = da.from_array(data, chunks=(100, 100, -1))

# work on a crop of the full dataset
x = x[:1000, :2000, :]
wns = np.array(f["wavelength"])

# nbytes uses the array's actual itemsize instead of assuming 4-byte values
print("data size:", x.nbytes/1e9, "GB")
print("x:", x)
print("wns:", wns)

data size: 13.008 GB
x: dask.array<getitem, shape=(1000, 2000, 1626), dtype=float32, chunksize=(100, 100, 1626), chunktype=numpy.ndarray>
wns: [ 750.  752.  754. ... 3996. 3998. 4000.]


In [2]:
import os
import pickle

import Orange
from Orange.data import Domain, Table, ContinuousVariable


def metatable_maplocs(x_locs, y_locs):
    """Build an Orange table that stores (x, y) map locations as metas.

    The table has no feature columns and no class variable; its two meta
    columns are named "map_x" and "map_y".

    Parameters:
        x_locs: array-like of x positions, one per row of the result.
        y_locs: array-like of y positions, one per row of the result.

    Returns:
        An Orange Table with one row per (x, y) pair.
    """
    # stack the two coordinate vectors, then transpose so each row is one pair
    coords = np.vstack((np.asarray(x_locs), np.asarray(y_locs))).T

    meta_vars = [ContinuousVariable.make("map_x"),
                 ContinuousVariable.make("map_y")]
    loc_domain = Domain([], None, metas=meta_vars)

    # X has zero columns; it only fixes the number of rows
    empty_X = np.zeros((len(coords), 0))
    return Table.from_numpy(loc_domain, X=empty_X,
                            metas=np.asarray(coords, dtype=object))


def spectra_from_image(X, features, x_locs, y_locs):
    """Flatten an image cube into one spectrum per row, with map locations.

    Parameters:
        X: array indexed as (row, column, feature); it is reshaped to
            (rows * columns, features).
        features: returned unchanged as the first element of the result.
        x_locs: array of x coordinates, indexed by column position.
        y_locs: array of y coordinates, indexed by row position.

    Returns:
        A tuple (features, spectra, meta_table), where meta_table is the
        table produced by metatable_maplocs for the flattened pixels.
    """
    n_rows, n_cols, n_feats = X.shape[0], X.shape[1], X.shape[2]

    # every pixel's spectrum becomes its own row
    spectra = X.reshape((n_rows * n_cols, n_feats))

    # per flattened pixel: the column index cycles fastest, the row index
    # is held for a whole image row
    col_idx = np.tile(np.arange(n_cols), n_rows)
    row_idx = np.repeat(np.arange(n_rows), n_cols)
    meta_table = metatable_maplocs(x_locs[col_idx], y_locs[row_idx])

    return features, spectra, meta_table

In [3]:
%%time

# 200 x 200 sub-region (indices 200..399 on the first two axes, all of the
# last axis) used for a small export
sx = x[200:400, 200:400, :]

def create_orange_table(filename, x, wns, xcoords, ycoords):
    """Write an image cube to an HDF5 file as X, Y, domain and metas datasets.

    Any existing file at `filename` is deleted first. The cube is flattened
    with spectra_from_image and stored as dataset "/X" through dask. The
    file is then reopened to add "Y", the pickled Orange domain and the
    pickled metas array.

    Parameters:
        filename: path of the HDF5 file to create.
        x: image cube passed to spectra_from_image.
        wns: feature values; each one becomes a ContinuousVariable named
            with "%f" formatting.
        xcoords: x coordinates passed to spectra_from_image.
        ycoords: y coordinates passed to spectra_from_image.
    """
    # start from a clean file
    if os.path.exists(filename):
        os.unlink(filename)

    features, spectra, meta = spectra_from_image(x, wns, xcoords, ycoords)

    da.to_hdf5(filename, "/X", spectra)

    attributes = []
    for wn in features:
        attributes.append(Orange.data.ContinuousVariable("%f" % wn))

    domain = Orange.data.Domain(attributes,
                                meta.domain.class_vars,
                                meta.domain.metas)

    # domain and metas are stored as pickled bytes wrapped in np.void
    with h5py.File(filename, 'r+') as out:
        out.create_dataset("Y", data=meta.Y)
        pickled_domain = pickle.dumps(domain)
        out.create_dataset("domain", data=np.void(pickled_domain))
        pickled_metas = pickle.dumps(meta.metas)
        out.create_dataset("metas", data=np.void(pickled_metas))

# export the small sub-region; both coordinate ranges match the 200..399 crop
create_orange_table(
    "/slow/marko/stone/n/brc961-br1001-orange-small.hdf5",
    sx,
    wns,
    np.arange(200, 400),
    np.arange(200, 400),
)

CPU times: user 469 ms, sys: 1.56 s, total: 2.03 s
Wall time: 11.4 s


In [4]:
%%time
# export the whole cropped array; x coordinates follow axis 1 and
# y coordinates follow axis 0
create_orange_table(
    "/slow/marko/stone/n/brc961-br1001-orange.hdf5",
    x,
    wns,
    np.arange(x.shape[1]),
    np.arange(x.shape[0]),
)

CPU times: user 39.9 s, sys: 1min 25s, total: 2min 5s
Wall time: 9min 57s
