In [1]:
import sys
from pathlib import Path
# Put the notebook's working directory at the front of the module search path
# so that modules sitting next to this notebook are found first on import.
cwd = Path.cwd()
sys.path = [str(cwd), *sys.path]

In [2]:
import zarr
import h5py as h5

In [3]:
# Open the "old" file read-only with h5py and display its group/dataset tree
# (the tree viewer is the cell's last expression, so it is rendered as output).
# NOTE(review): this handle is never closed in the visible cells — confirm that
# leaving it open while the same path is read again later is intended.
old_path = 'old.h5ad'
old_h5 = h5.File(old_path, 'r')
zarr.tree(old_h5)

In [4]:
# Open the "new" file read-only with h5py and display its group/dataset tree
# (the tree viewer is the cell's last expression, so it is rendered as output).
# NOTE(review): this handle is never closed in the visible cells — confirm that
# leaving it open while the same path is read again later is intended.
new_path = 'new.h5ad'
new_h5 = h5.File(new_path, 'r')
zarr.tree(new_h5)

In [5]:
from anndata import read_h5ad

In [6]:
from dataclasses import dataclass
from typing import Any

@dataclass
class Obj:
    """Attribute-style view over a mapping, with an optional fallback object.

    ``obj.name`` resolves, in order, to ``obj.dict[name]`` when the key is
    present, otherwise to ``getattr(obj.default, name)`` when a fallback
    object was supplied. A name found in neither place raises
    ``AttributeError``, so ``hasattr`` and three-argument ``getattr`` behave
    normally.
    """
    # Mapping whose keys are exposed as attributes.
    dict: Any
    # Optional object consulted for names that are missing from ``dict``.
    default: Any = None

    def __getattr__(self, item):
        """Resolve ``item`` from ``dict`` first, then from ``default``.

        Python only calls this hook after normal attribute lookup has failed.

        Raises:
            AttributeError: if ``item`` is neither a key of ``dict`` nor an
                attribute of ``default`` (or no ``default`` was given).
        """
        # Read the mapping without re-entering this hook: on an instance that
        # has not been initialised yet (e.g. while `copy` is rebuilding it),
        # a plain `self.dict` would call `__getattr__('dict')` recursively.
        try:
            mapping = object.__getattribute__(self, 'dict')
        except AttributeError:
            raise AttributeError(item) from None

        if item in mapping:
            return mapping[item]

        # Compare against None rather than testing truthiness, so a fallback
        # object that happens to be falsy (e.g. has length 0) is still used.
        if self.default is not None:
            return getattr(self.default, item)

        # Attribute access must fail with AttributeError, not KeyError.
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {item!r}"
        )

In [7]:
def load_ad(path):
    """Read the AnnData file at `path` and bundle it with its non-zero coordinates.

    The file is opened with `read_h5ad(path, backed='r', dask=True)`. Its `X`
    is computed, converted to COO form, and the (row, col) positions reported
    by `nonzero()` are collected into a list of pairs.

    Returns:
        An `Obj` whose mapping holds `ad`, `nnz`, `obs` and `var`, with the
        AnnData object passed as the `default` fallback.
    """
    adata = read_h5ad(path, backed='r', dask=True)
    row_idx, col_idx = adata.X.compute().tocoo().nonzero()
    # One (row, col) tuple per entry, in the order `nonzero()` reports them.
    pairs = [(r, c) for r, c in zip(row_idx, col_idx)]
    fields = {'ad': adata, 'nnz': pairs, 'obs': adata.obs, 'var': adata.var}
    return Obj(fields, default=adata)

In [8]:
# Load the "old" file through load_ad; the trailing `old.ad` expression makes
# the cell display the underlying AnnData object.
old = load_ad(old_path); old.ad

read_dataframe_legacy: <HDF5 dataset "obs": shape (100,), type "|V25"> (dask True)
read_dataframe_legacy: <HDF5 dataset "var": shape (200,), type "|V24"> (dask True)
Calling ctor: ['filename', 'filemode', 'obs', 'var', 'raw', 'dtype', 'dask']
_init_as_actual: X is None…
lazy compute X, isbacked…
Loading HDF5 tensor: old.h5ad:/X: <HDF5 sparse dataset: format 'csc', shape (100, 200), type '<f4'>
Opening old.h5ad (X): ((0, 1), (0, 1)) ((slice(0, 100, None), slice(0, 200, None)))


AnnData object with n_obs × n_vars = 100 × 200 backed at 'old.h5ad'
    obs: 'label', 'idx²', 'Prime'
    var: 'name', 'sqrt(idx)'

In [9]:
# Load the "new" file through load_ad; the trailing `new.ad` expression makes
# the cell display the underlying AnnData object.
new = load_ad(new_path); new.ad

read_dataframe (new): <HDF5 group "/obs" (4 members)> (dask True)
read_dataframe (new): <HDF5 group "/var" (3 members)> (dask True)
Calling ctor: ['filename', 'filemode', 'obs', 'var', 'raw', 'dtype', 'dask']
_init_as_actual: X is None…
lazy compute X, isbacked…
Loading HDF5 tensor: new.h5ad:/X: <HDF5 sparse dataset: format 'csc', shape (100, 200), type '<f4'>
Opening new.h5ad (X): ((0, 1), (0, 1)) ((slice(0, 100, None), slice(0, 200, None)))




AnnData object with n_obs × n_vars = 100 × 200 backed at 'new.h5ad'
    obs: 'label', 'idx²', 'Prime'
    var: 'name', 'sqrt(idx)'

In [10]:
# `X` is not one of the keys load_ad stores in the Obj mapping, so this lookup
# falls through to the wrapped AnnData object's `X`.
new.X

lazy compute X, isbacked…
Loading HDF5 tensor: new.h5ad:/X: <HDF5 sparse dataset: format 'csc', shape (100, 200), type '<f4'>


Unnamed: 0,Array,Chunk
Bytes,80.00 kB,80.00 kB
Shape,"(100, 200)","(100, 200)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 80.00 kB 80.00 kB Shape (100, 200) (100, 200) Count 3 Tasks 1 Chunks Type float32 numpy.ndarray",200  100,

Unnamed: 0,Array,Chunk
Bytes,80.00 kB,80.00 kB
Shape,"(100, 200)","(100, 200)"
Count,3 Tasks,1 Chunks
Type,float32,numpy.ndarray


In [11]:
# Compare the lists of (row, col) non-zero coordinates that load_ad collected
# from each file; True means both files report the same positions in the same order.
new.nnz == old.nnz

True

In [12]:
# Materialise the lazily-loaded `obs` table of the old file for display.
# NOTE(review): the rendered rows run 0, 1, 10, 11, 12, … here, unlike the new
# file's 0, 1, 2, 3, 4, … — presumably a string-sorted index; confirm.
old.obs.compute()

Unnamed: 0_level_0,label,idx²,Prime
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,row 0,0,True
1,row 1,1,True
10,row 10,100,False
11,row 11,121,True
12,row 12,144,False
...,...,...,...
95,row 95,9025,False
96,row 96,9216,False
97,row 97,9409,True
98,row 98,9604,False


In [13]:
# Materialise the lazily-loaded `obs` table of the new file for display,
# for side-by-side comparison with the old file's `obs`.
new.obs.compute()

Unnamed: 0,label,idx²,Prime
0,row 0,0,True
1,row 1,1,True
2,row 2,4,True
3,row 3,9,True
4,row 4,16,True
...,...,...,...
95,row 95,9025,False
96,row 96,9216,False
97,row 97,9409,True
98,row 98,9604,False
