In [15]:
import zarr # Check version
zarr.__version__

'2.13.2'

Zarr Sharding pull request should show '0.0.0'

If it says e.g. 2.13.2, then you're running on the "proper" published version (without Sharding support as of 28-Sept-2022)

### 1. I wanna test partial_decompression by accessing a random index

In [16]:
import numpy as np


numpy_arr = np.empty((66600, 15000), dtype=np.uint32) # ~3GB
original_shape = numpy_arr.shape

# 50k x 150k int32 is limit of M1 Pro 32GB
#
for i in range(numpy_arr.shape[0]):
    numpy_arr[i][:] = list(range(original_shape[1]*i, original_shape[1]*(i+1)))

In [17]:
print("Memory used: ", numpy_arr.nbytes / 1024 / 1024 / 1024, " GB")

Memory used:  3.721565008163452  GB


In [18]:
numpy_arr[0][0:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint32)

In [23]:
# Let's make chunks really large to test partial decompression

# So Let's use a weak compressor
from numcodecs import Blosc
weak_compressor = Blosc(cname='lz4', clevel=1, shuffle=Blosc.NOSHUFFLE)

chunk_size=(15000,15000)

# Reached max chunk size
zarr_index_list = zarr.array(numpy_arr, compressor=weak_compressor, chunks=chunk_size, partial_decompress=True, dtype='f8')
# z = zarr.zeros((10000,10000), chunks=(1000,1000), dtype='i4')

zarr_index_list.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(66600, 15000)"
Chunk shape,"(15000, 15000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=1, shuffle=NOSHUFFLE, blocksize=0)"
Store type,zarr.storage.KVStore
No. bytes,7992000000 (7.4G)
No. bytes stored,4035852693 (3.8G)


<font color="red">Not sure why it's ignoring partial_decompress. Maybe it just works on open()/read()</font>

In [20]:
# example-zarr is actually a folder
# z1 = zarr.open('./example-zarr', mode='w', shape=original_shape, chunks=(15000,15000), dtype='f8', partial_decompress=True)
zarr.save('./example-zarr', zarr_index_list)
# Needs both open and save() I Think
# open() clears previous, save() writes new

In [21]:
# # Read from array
zarr_read_disk = zarr.open("./example-zarr", mode="r")
zarr_read_disk.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(66600, 15000)"
Chunk shape,"(15000, 15000)"
Order,C
Read-only,True
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,7992000000 (7.4G)
No. bytes stored,51661593 (49.3M)


<font color="orange"> Notice how compression changes, much higher Ratio. Chunks <10MB in size now</font>

Sometimes chunks not even initialized?

In [22]:
zarr_read_disk[0, 100]

100.0

### Let's test random access of chunks

In [None]:
rand_indices = np.random.randint(low=0, high=15000, size=)

In [17]:
!ls

[34mexample-zarr[m[m         [34mtutorials[m[m            [34mzarr_python[m[m
[34mexample.zarr[m[m         zarr-explore-1.ipynb


In [None]:
# # Check if reading from file == original array
# import numpy as np

# np.all(z[:] == z2[:])

#### <font color="orange">Trying Compressors. Irrelevant</font>

In [18]:
# Blosc is one of the preconditions of partial_decompress
# https://zarr.readthedocs.io/en/stable/api/core.html
from numcodecs import Blosc

bloccc = Blosc(clevel=9)

In [19]:
z_compressed = zarr.array(zarr_index_list, compressor = bloccc)

In [20]:
z_compressed.compressor

Blosc(cname='lz4', clevel=9, shuffle=SHUFFLE, blocksize=0)

In [21]:
z_compressed

<zarr.core.Array (10000, 10000) int64>

In [22]:
zarr.save('./example.zarr', z_compressed)

In [15]:
z2



KeyboardInterrupt

