## Found how to get partial_decompress=True from zarr_python library code, file "build/lib/zarr/tests/test_core.py" test function test_read_nitems_less_than_blocksize_from_multiple_chunks in line 2683

In [1]:
import zarr # Check version
zarr.__version__

'2.13.0a2.dev42+dirty'

Zarr Sharding pull request should show '0.0.0'

If it says e.g. 2.13.2, then you're running on the "proper" published version (without Sharding support as of 28-Sept-2022)

### 1. I wanna test partial_decompression by accessing a random index

In [25]:
import numpy as np


numpy_arr = np.empty((65536, 16384), dtype=np.int32) # ~3GB
original_shape = numpy_arr.shape

# 50k x 150k int32 is limit of M1 Pro 32GB
for i in range(numpy_arr.shape[0]):
#     numpy_arr[i][:] = list(range(original_shape[1]*i, original_shape[1]*(i+1)))
    numpy_arr[i][:] = np.random.randint(low=0, high=original_shape[0], size=original_shape[1])

In [3]:
print("Memory used: ", numpy_arr.nbytes / 1024 / 1024 / 1024, " GB")

Memory used:  4.0  GB


In [4]:
numpy_arr[0][0:10]

array([15383, 23676, 56778, 54049, 29433,  9470, 49877, 58731, 41759,
       12500], dtype=int32)

### Following code in zarr test_core.py:2653

In [5]:
# All these extra parameters are irrelevant to _partial_decompress being enabled. Path is enough
# fsstore = zarr.storage.FSStore('./fsstore', key_separator="/", auto_mkdir=True, mode='w', normalize_keys=False)

fsstore = zarr.storage.FSStore('./fsstore')

In [6]:
# chunk_size=(4096,4096)

zarr_index_list = zarr.array(numpy_arr, store=fsstore)#, chunk_store = fsstore_chunk, partial_decompress=True, chunks=chunk_size, dtype='i4')# compressor=weak_compressor, 

In [7]:
zarr_index_list.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(65536, 16384)"
Chunk shape,"(2048, 512)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.FSStore
No. bytes,4294967296 (4.0G)
No. bytes stored,2156249432 (2.0G)


In [8]:
print("_partial_decompress: ", zarr_index_list._partial_decompress)

_partial_decompress:  False


<font color="orange">As we can see, partial_decompress is False. Don't panic!</font>

In [26]:
# Flush to disk
zarr.save(fsstore, numpy_arr)

In [27]:
# Load from disk

# read_only doesn't affect partial_decompress eligibility
load_partial_decomp = zarr.Array(fsstore, read_only=True, partial_decompress=True)

In [28]:
load_partial_decomp._partial_decompress

True

#### <font color="orange">Solved! Partial_decompress=True</font>

In [29]:
load_partial_decomp.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(65536, 16384)"
Chunk shape,"(2048, 512)"
Order,C
Read-only,True
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.FSStore
No. bytes,4294967296 (4.0G)
No. bytes stored,2156249432 (2.0G)


In [14]:
del numpy_arr # Don't need this anymore - save 3GB

Sometimes chunks not even initialized? Sometimes 2 arrays are written to disk (i.e. a group). Not sure why

In [33]:
load_partial_decomp[0, 100:105]

array([36124, 38441, 30424, 59465, 21410], dtype=int32)

In [31]:
load_disk_normally = zarr.open(fsstore) # , partial_decompress=True) -- This won't work !!

In [32]:
load_disk_normally._partial_decompress

False

In [34]:
load_disk_normally[0, 100:105]

array([36124, 38441, 30424, 59465, 21410], dtype=int32)

In [36]:
# Do the two match?
(load_partial_decomp[0, 100:105] == load_disk_normally[0, 100:105]).all()

True

### Let's test random access of chunks by index

In [41]:
dir(load_partial_decomp)

['__array__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_append_nosync',
 '_attrs',
 '_cache_metadata',
 '_cdata_shape',
 '_chunk_delitem',
 '_chunk_delitems',
 '_chunk_getitem',
 '_chunk_getitems',
 '_chunk_key',
 '_chunk_setitem',
 '_chunk_setitem_nosync',
 '_chunk_setitems',
 '_chunk_store',
 '_chunks',
 '_compressor',
 '_decode_chunk',
 '_dimension_separator',
 '_dtype',
 '_encode_chunk',
 '_fill_value',
 '_filters',
 '_flush_metadata_nosync',
 '_get_basic_selection_nd',
 '_get_basic_selection_zd',
 '_get_selection',
 '_info_items_nosync',
 '_info_reporter',
 '_is_view

In [46]:
chunk_shape = (2048, 512)

In [None]:
rand_indices_list = np.random.randint(low=0, high=original_shape[0], size=50)

In [None]:
%%timeit
# Read randomly from first chunk, Using partial decompression

for index in rand_indices:
    load_partial_decomp[0, index]

In [53]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for index in rand_indices:
    load_disk_normally[0, index]

629 ms ± 8.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### <font color="orange">Let's access the chunks using 2D accesses now</font>

In [51]:
rand_indices_tuples = (np.random.randint(low=0, high=chunk_shape[0], size=50), np.random.randint(low=0, high=chunk_shape[1], size=50))

In [52]:
rand_indices_tuples

(array([ 976, 1070, 1339, 1454, 1365, 1397, 1104,  589,  217, 1249, 1880,
         826, 1754, 1320, 1930,  122, 1428,  417,    4,  514, 1970,   74,
         208,  267, 1929, 1986,  874, 1122,  410,  909, 1814,  911,  592,
         458,  709,  501,  288,   77, 1881, 1123, 1744, 1936,  184, 1067,
        1489,  508,  230, 1520, 1068,  503]),
 array([145, 420, 176, 277, 175, 251, 311, 146, 375, 352, 503, 172, 425,
        315,  30, 135, 389, 243, 462, 373, 199, 156, 345, 298, 269,  36,
        497, 352, 346, 322, 233, 161,  47, 159, 435, 412, 475, 213, 502,
        434, 339,  55, 380, 385, 489,  98,  19, 510, 156, 268]))

In [53]:
rand_indices_tuples[0][1]

1070

TODO set range(50) programmatically

In [58]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_partial_decomp[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

38.8 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [57]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_disk_normally[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

70.4 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## <font color="orange"> New experiment: access one point per chunk in the whole array</font>

In [69]:
load_partial_decomp.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(65536, 16384)"
Chunk shape,"(2048, 512)"
Order,C
Read-only,True
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.FSStore
No. bytes,4294967296 (4.0G)
No. bytes stored,2156249432 (2.0G)


In [75]:
32**2

1024

In [76]:
from itertools import product

In [81]:
rand_indices_whole_array = []

xes = []
ys = []

for i in range(32): # there's 32x32 chunks. TODO get programmatically
    # 2d point access - let's not just access first row of chunk
    
    # Random point across all chunks
#     x = np.random.randint(low=i*chunk_shape[0]+5, high=(i+1)*chunk_shape[0] + 5)
#     y = np.random.randint(low=i*chunk_shape[1]+5, high=(i+1)*chunk_shape[1]+5)
    
    xes.append(i*chunk_shape[0]+50)
    ys.append(i*chunk_shape[1]+50)
    
    # "Same" point across all chunks

#     rand_indices_whole_array.append((x,y))
    
    # 1D access - makes indexing complicated
#     rand_indices_whole_array.append(np.random.randint(low=i*chunk_shape[0], high=(i+1)*chunk_shape[1]))

rand_indices_whole_array = list(product(xes, ys))
rand_indices_whole_array

[(50, 50),
 (50, 562),
 (50, 1074),
 (50, 1586),
 (50, 2098),
 (50, 2610),
 (50, 3122),
 (50, 3634),
 (50, 4146),
 (50, 4658),
 (50, 5170),
 (50, 5682),
 (50, 6194),
 (50, 6706),
 (50, 7218),
 (50, 7730),
 (50, 8242),
 (50, 8754),
 (50, 9266),
 (50, 9778),
 (50, 10290),
 (50, 10802),
 (50, 11314),
 (50, 11826),
 (50, 12338),
 (50, 12850),
 (50, 13362),
 (50, 13874),
 (50, 14386),
 (50, 14898),
 (50, 15410),
 (50, 15922),
 (2098, 50),
 (2098, 562),
 (2098, 1074),
 (2098, 1586),
 (2098, 2098),
 (2098, 2610),
 (2098, 3122),
 (2098, 3634),
 (2098, 4146),
 (2098, 4658),
 (2098, 5170),
 (2098, 5682),
 (2098, 6194),
 (2098, 6706),
 (2098, 7218),
 (2098, 7730),
 (2098, 8242),
 (2098, 8754),
 (2098, 9266),
 (2098, 9778),
 (2098, 10290),
 (2098, 10802),
 (2098, 11314),
 (2098, 11826),
 (2098, 12338),
 (2098, 12850),
 (2098, 13362),
 (2098, 13874),
 (2098, 14386),
 (2098, 14898),
 (2098, 15410),
 (2098, 15922),
 (4146, 50),
 (4146, 562),
 (4146, 1074),
 (4146, 1586),
 (4146, 2098),
 (4146, 2610),

In [82]:
len(rand_indices_whole_array) # Should equal n_chunks

1024

In [83]:
%%timeit
# Read randomly from whole array, randomly 1 point from each chunk

for tup in rand_indices_whole_array:
    load_partial_decomp[tup[0], tup[1]]

898 ms ± 69.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [84]:
%%timeit
# Read randomly from whole array, randomly 1 point from each chunk
# TODO Don't know if my indexing is correct!!

for tup in rand_indices_whole_array:
    load_disk_normally[tup[0], tup[1]]

1.34 s ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
print("Speedup: ", 1.34 / .9)

Speedup:  1.488888888888889


### <font color="orange">Let's test advanced mask indexing</font>

In [86]:
# Create a 0-1 boolean mask with size same as Whole array
#    Don't know how to do just 1 chunk masking
bool_mask = np.random.randint(low=0, high=2, size=original_shape, dtype=bool)
bool_mask

array([[ True,  True, False, ...,  True,  True, False],
       [ True, False,  True, ...,  True,  True,  True],
       [ True, False, False, ...,  True,  True,  True],
       ...,
       [False,  True, False, ..., False, False,  True],
       [ True,  True,  True, ...,  True, False, False],
       [ True,  True,  True, ...,  True, False,  True]])

In [None]:
%%timeit
# Partial decompress

# _ to prevent printing
_ = load_partial_decomp.get_mask_selection(bool_mask)

In [None]:
%%timeit
# NO Partial decompress

# _ to prevent printing
_ = load_disk_normally.get_mask_selection(bool_mask)

#### <font color="red">Trying bigger chunks. partial_decompress Benefits should be bigger for bigger chunks</font>

In [17]:
!ls

[34mexample-zarr[m[m         [34mtutorials[m[m            [34mzarr_python[m[m
[34mexample.zarr[m[m         zarr-explore-1.ipynb


In [None]:
# # Check if reading from file == original array
# import numpy as np

# np.all(z[:] == z2[:])

In [18]:
# Blosc is one of the preconditions of partial_decompress
# https://zarr.readthedocs.io/en/stable/api/core.html
from numcodecs import Blosc

bloccc = Blosc(clevel=9)

In [19]:
z_compressed = zarr.array(zarr_index_list, compressor = bloccc)

In [20]:
z_compressed.compressor

Blosc(cname='lz4', clevel=9, shuffle=SHUFFLE, blocksize=0)

In [21]:
z_compressed

<zarr.core.Array (10000, 10000) int64>

In [22]:
zarr.save('./example.zarr', z_compressed)

In [15]:
z2



KeyboardInterrupt

