## To run this notebook correctly, make sure the main version of zarr is installed!! Not the Sharding Pull Request

In [1]:
import zarr # Check version
zarr.__version__

'2.13.0a2.dev42+dirty'

Zarr Sharding pull request should show '0.0.0'

If it says e.g. 2.13.2, then you're running on the "proper" published version (without Sharding support as of 28-Sept-2022)

### 1. I wanna test partial_decompression by accessing a random index

In [17]:
import numpy as np


numpy_arr = np.empty((65536, 16384), dtype=np.int32) # ~3GB
original_shape = numpy_arr.shape

# 50k x 150k int32 is limit of M1 Pro 32GB
#
for i in range(numpy_arr.shape[0]):
#     numpy_arr[i][:] = list(range(original_shape[1]*i, original_shape[1]*(i+1)))
    numpy_arr[i][:] = np.random.randint(low=0, high=original_shape[0], size=original_shape[1])

In [18]:
print("Memory used: ", numpy_arr.nbytes / 1024 / 1024 / 1024, " GB")

Memory used:  4.0  GB


In [19]:
numpy_arr[0][0:10]

array([62127,  5850, 35776, 28503, 52751, 47880,  8465, 34995,   920,
       10579], dtype=int32)

In [20]:
# Let's make chunks really large to test partial decompression

# So Let's use a weak compressor
from numcodecs import Blosc
weak_compressor = Blosc(cname='lz4', clevel=1, shuffle=Blosc.NOSHUFFLE)

chunk_size=(4096,4096)

# Reached max chunk size
zarr_index_list = zarr.array(numpy_arr, compressor=weak_compressor, chunks=chunk_size, partial_decompress=True, dtype='i4')
# z = zarr.zeros((10000,10000), chunks=(1000,1000), dtype='i4')

In [21]:
zarr_index_list.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(65536, 16384)"
Chunk shape,"(4096, 4096)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=1, shuffle=NOSHUFFLE, blocksize=0)"
Store type,zarr.storage.KVStore
No. bytes,4294967296 (4.0G)
No. bytes stored,4294968665 (4.0G)


<font color="red">Not sure why it's ignoring partial_decompress. Maybe it just works on open()/read()</font>

In [22]:
del numpy_arr # Don't need this anymore - save 3GB

In [8]:
# # example-zarr is actually a folder
# # z1 = zarr.open('./example-zarr', mode='w', shape=original_shape, chunks=(10000,10000), dtype='f8', partial_decompress=True)
# zarr.save('./example-zarr', zarr_index_list)
# # open() doesn't actually write anything, hence it being so fast

In [41]:
# USE FSSTORE IF YOU WANT PARTIAL_DECOMPRESS TO WORK!!

fsstore = zarr.storage.FSStore('./fsstore4-zarr')
z = zarr.create(store=fsstore, shape=original_shape, dtype='f8')

In [42]:
# zarr.save('./example-zarr', zarr_index_list)
zarr.save(fsstore, zarr_index_list)

In [44]:
# Read array from disk
load_disk_partial_decompress = zarr.open(fsstore, mode="r", partial_decompress=True)
load_disk_partial_decompress.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(65536, 16384)"
Chunk shape,"(4096, 4096)"
Order,C
Read-only,True
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.FSStore
No. bytes,4294967296 (4.0G)
No. bytes stored,2156234073 (2.0G)


<font color="orange"> Notice how compression changes, much higher Ratio. Chunks <10MB in size now</font>

Sometimes chunks not even initialized? Sometimes 2 arrays are written to disk (i.e. a group). Not sure why

In [30]:
# # Read it properly, without partial_decompressing chunks
# # load_disk_normally = zarr.open("./example-zarr", mode="r", partial_decompress=False)
# load_disk_normally = zarr.open(fsstore, mode="r", partial_decompress=False)
# load_disk_normally.info

In [45]:
# Read it properly, without partial_decompressing chunks
# load_disk_normally = zarr.open("./example-zarr", mode="r", partial_decompress=False)
load_disk_normally = zarr.open(fsstore, mode="r", partial_decompress=False)
load_disk_normally.info

0,1
Type,zarr.core.Array
Data type,int32
Shape,"(65536, 16384)"
Chunk shape,"(4096, 4096)"
Order,C
Read-only,True
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.FSStore
No. bytes,4294967296 (4.0G)
No. bytes stored,2156234073 (2.0G)


Both look identical so far

In [46]:
load_disk_partial_decompress[0, 100]

22847

In [47]:
load_disk_normally[0, 100]

22847

^^^ Should be 100

### Let's test random access of chunks by index

In [48]:
rand_indices = np.random.randint(low=0, high=original_shape[0], size=50)

In [52]:
%%timeit
# Read randomly from first chunk, Using partial decompression

for index in rand_indices:
    load_disk_partial_decompress[0, index]

627 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for index in rand_indices:
    load_disk_normally[0, index]

629 ms ± 8.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## <font color="orange"> New experiment: access one point per chunk in the whole array</font>

In [60]:
# TODO get this programmatically
nr_chunks = 64

In [63]:
rand_indices_whole_array = []

for i in range(nr_chunks):
    rand_indices_whole_array.append(np.random.randint(low=i*chunk_size[0], high=(i+1)*chunk_size[1]))

rand_indices_whole_array = np.array(rand_indices_whole_array)
rand_indices_whole_array

array([     0,   7334,  10148,  13658,  17305,  21667,  24862,  31072,
        35241,  38359,  44806,  46999,  52829,  55998,  57702,  63836,
        66686,  70183,  77228,  81751,  82734,  89051,  93620,  96894,
        99899, 103138, 107660, 111667, 117444, 121965, 125533, 127237,
       131702, 138435, 141282, 143504, 149455, 152804, 158614, 162882,
       164650, 169870, 173882, 178925, 181082, 186865, 189810, 196288,
       198489, 203189, 205845, 212636, 213285, 219258, 225189, 228728,
       232427, 236474, 240707, 241861, 246274, 252591, 256284, 259575])

In [65]:
rand_indices_whole = np.random.randint(low=0, high=chunk_size[0], size=nr_chunks)
rand_indices_whole

array([ 792, 2037, 3291, 2877, 3215,   21, 3450,  208, 1727,   29, 1506,
       1855,  174, 1496, 3751,  231,  362, 1179, 4048, 1935, 2327, 1489,
       2639, 1390, 2538, 1744, 3582, 2238, 3612, 3689, 1619, 3187,  420,
       1711, 3145, 2365, 3797, 1469, 1027, 3090, 1266, 3858,  862, 1570,
       2467, 1806, 3116,  802, 1626, 3525, 3440, 3368, 3229,  294, 2124,
       2115, 1536,  501, 3498,  404, 1021, 1128, 3117, 3878])

In [68]:
%%timeit
# Read randomly from whole array, randomly 1 point from each chunk
# TODO Don't know if my indexing is correct!!

for index in rand_indices_whole_array:
    load_disk_partial_decompress[int(index / original_shape[1]), index % original_shape[1]]

799 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Let's test advanced mask indexing

In [54]:
# Create a 0-1 boolean mask with size same as Whole array
#    Don't know how to do just 1 chunk masking
bool_mask = np.random.randint(low=0, high=2, size=original_shape, dtype=bool)
bool_mask

array([[ True, False, False, ...,  True, False,  True],
       [ True,  True,  True, ..., False, False,  True],
       [False,  True, False, ..., False,  True, False],
       ...,
       [ True,  True, False, ..., False,  True, False],
       [ True,  True, False, ..., False,  True, False],
       [ True,  True, False, ..., False, False, False]])

In [55]:
%%timeit
# Partial decompress

# _ to prevent printing
_ = load_disk_partial_decompress.get_mask_selection(bool_mask)

KeyboardInterrupt: 

In [None]:
%%timeit
# NO Partial decompress

# _ to prevent printing
_ = load_disk_normally.get_mask_selection(bool_mask)

#### <font color="orange">Trying Compressors. Irrelevant</font>

In [17]:
!ls

[34mexample-zarr[m[m         [34mtutorials[m[m            [34mzarr_python[m[m
[34mexample.zarr[m[m         zarr-explore-1.ipynb


In [None]:
# # Check if reading from file == original array
# import numpy as np

# np.all(z[:] == z2[:])

In [18]:
# Blosc is one of the preconditions of partial_decompress
# https://zarr.readthedocs.io/en/stable/api/core.html
from numcodecs import Blosc

bloccc = Blosc(clevel=9)

In [19]:
z_compressed = zarr.array(zarr_index_list, compressor = bloccc)

In [20]:
z_compressed.compressor

Blosc(cname='lz4', clevel=9, shuffle=SHUFFLE, blocksize=0)

In [21]:
z_compressed

<zarr.core.Array (10000, 10000) int64>

In [22]:
zarr.save('./example.zarr', z_compressed)

In [15]:
z2



KeyboardInterrupt

