In [15]:
import zarr # Check version
zarr.__version__

'2.13.2'

Zarr Sharding pull request should show '0.0.0'

If it says e.g. 2.13.2, then you're running on the "proper" published version (without Sharding support as of 28-Sept-2022)

### 1. I wanna test partial_decompression by accessing a random index

In [51]:
import numpy as np


numpy_arr = np.empty((60000, 15000), dtype=np.uint32) # ~3GB
original_shape = numpy_arr.shape

# 50k x 150k int32 is limit of M1 Pro 32GB
#
for i in range(numpy_arr.shape[0]):
    numpy_arr[i][:] = list(range(original_shape[1]*i, original_shape[1]*(i+1)))

In [52]:
print("Memory used: ", numpy_arr.nbytes / 1024 / 1024 / 1024, " GB")

Memory used:  3.3527612686157227  GB


In [53]:
numpy_arr[0][0:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint32)

In [54]:
# Let's make chunks really large to test partial decompression

# So Let's use a weak compressor
from numcodecs import Blosc
weak_compressor = Blosc(cname='lz4', clevel=1, shuffle=Blosc.NOSHUFFLE)

chunk_size=(15000,15000)

# Reached max chunk size
zarr_index_list = zarr.array(numpy_arr, compressor=weak_compressor, chunks=chunk_size, partial_decompress=True, dtype='f8')
# z = zarr.zeros((10000,10000), chunks=(1000,1000), dtype='i4')

In [55]:
zarr_index_list.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(60000, 15000)"
Chunk shape,"(15000, 15000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=1, shuffle=NOSHUFFLE, blocksize=0)"
Store type,zarr.storage.KVStore
No. bytes,7200000000 (6.7G)
No. bytes stored,3630779699 (3.4G)


<font color="red">Not sure why it's ignoring partial_decompress. Maybe it just works on open()/read()</font>

In [77]:
del numpy_arr # Don't need this anymore - save 3GB

In [56]:
# example-zarr is actually a folder
# z1 = zarr.open('./example-zarr', mode='w', shape=original_shape, chunks=(10000,10000), dtype='f8', partial_decompress=True)
zarr.save('./example-zarr', zarr_index_list)
# open() doesn't actually write anything, hence it being so fast

In [60]:
# Read array from disk
load_disk_partial_decompress = zarr.open("./example-zarr", mode="r", partial_decompress=True)
load_disk_partial_decompress.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(60000, 15000)"
Chunk shape,"(15000, 15000)"
Order,C
Read-only,True
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,7200000000 (6.7G)
No. bytes stored,43240971 (41.2M)


<font color="orange"> Notice how compression changes, much higher Ratio. Chunks <10MB in size now</font>

Sometimes chunks not even initialized? Sometimes 2 arrays are written to disk (i.e. a group). Not sure why

In [61]:
# Read it properly, without partial_decompressing chunks
load_disk_normally = zarr.open("./example-zarr", mode="r", partial_decompress=False)
load_disk_normally.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(60000, 15000)"
Chunk shape,"(15000, 15000)"
Order,C
Read-only,True
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,7200000000 (6.7G)
No. bytes stored,43240971 (41.2M)


Both look identical so far

In [62]:
load_disk_partial_decompress[0, 100]

100.0

In [63]:
load_disk_normally[0, 100]

100.0

^^^ Should be 100

### Let's test random access of chunks by index

In [66]:
rand_indices = np.random.randint(low=0, high=chunk_size[0], size=50)

In [67]:
%%timeit
# Read randomly from first chunk, Using partial decompression

for index in rand_indices:
    load_disk_partial_decompress[0, index]

9.79 s ± 35.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [68]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for index in rand_indices:
    load_disk_normally[0, index]

9.79 s ± 70.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Let's test advanced mask indexing

In [73]:
# Create a 0-1 boolean mask with size same as Whole array
#    Don't know how to do just 1 chunk masking
bool_mask = np.random.randint(low=0, high=2, size=original_shape, dtype=bool)
bool_mask

array([[ True, False, False, ...,  True,  True, False],
       [ True,  True,  True, ...,  True, False,  True],
       [ True,  True, False, ...,  True,  True, False],
       ...,
       [False,  True,  True, ...,  True, False, False],
       [False, False,  True, ..., False, False, False],
       [ True, False, False, ..., False, False,  True]])

In [75]:
%%timeit
# Partial decompress

# _ to prevent printing
_ = load_disk_partial_decompress.get_mask_selection(bool_mask)

26.3 s ± 342 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [76]:
%%timeit
# NO Partial decompress

# _ to prevent printing
_ = load_disk_normally.get_mask_selection(bool_mask)

26.5 s ± 245 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### <font color="orange">Trying Compressors. Irrelevant</font>

In [17]:
!ls

[34mexample-zarr[m[m         [34mtutorials[m[m            [34mzarr_python[m[m
[34mexample.zarr[m[m         zarr-explore-1.ipynb


In [None]:
# # Check if reading from file == original array
# import numpy as np

# np.all(z[:] == z2[:])

In [18]:
# Blosc is one of the preconditions of partial_decompress
# https://zarr.readthedocs.io/en/stable/api/core.html
from numcodecs import Blosc

bloccc = Blosc(clevel=9)

In [19]:
z_compressed = zarr.array(zarr_index_list, compressor = bloccc)

In [20]:
z_compressed.compressor

Blosc(cname='lz4', clevel=9, shuffle=SHUFFLE, blocksize=0)

In [21]:
z_compressed

<zarr.core.Array (10000, 10000) int64>

In [22]:
zarr.save('./example.zarr', z_compressed)

In [15]:
z2



KeyboardInterrupt

