This notebook uses Zarr's partial-decompress feature. Zarr has also proposed sharding as another layer in its storage hierarchy; see the corresponding notebook.

## How to get `partial_decompress=True`: found in the zarr-python library code, file "build/lib/zarr/tests/test_core.py", test function `test_read_nitems_less_than_blocksize_from_multiple_chunks` (line 2683)

In [None]:
import zarr # Check version
zarr.__version__

Zarr Sharding pull request should show '0.0.0'

If it says e.g. 2.13.2, then you're running on the "proper" published version (without Sharding support as of 28-Sept-2022)

<font color="orange">partial_decompress works on current version 2.13, no need for Sharding</font>

### 1. I wanna test partial_decompression by accessing a random index

In [None]:
import numpy as np


# Fixed seed so the benchmark data (and the random indices drawn later with
# np.random.randint) are reproducible across kernel restarts.
np.random.seed(0)

# 65536 x 16384 int32 values = 4 GiB in memory.
# 50k x 150k int32 is limit of M1 Pro 32GB
original_shape = (65536, 16384)

# Draw the whole array in one vectorized call, directly as int32, instead of
# filling it row by row in a Python loop. Values lie in [0, original_shape[0]).
numpy_arr = np.random.randint(low=0, high=original_shape[0], size=original_shape, dtype=np.int32)

In [None]:
print("Memory used: ", numpy_arr.nbytes / 1024 / 1024 / 1024, " GB")

In [None]:
numpy_arr[0][0:10]

In [None]:
# Chunk shape for this store: 16384 x 4096 int32 values = 256 MiB per chunk.
# Defined locally because the notebook-wide `chunk_shape` is only assigned in a
# later cell, so referencing it here raised NameError on a fresh kernel.
chunk_shape_256mb = (16384, 4096)

fsstore = zarr.storage.FSStore('./fsstore256MBChunk')
# zarr.array() writes every chunk into the store while creating the array, so no
# separate "save" step is needed. overwrite=True lets the cell be re-run when the
# directory already holds an array.
# The former zarr.save(fsstore, numpy_arr) call was dropped: it re-created the
# array at the same location without `chunks`, replacing the 256 MiB chunking
# requested here with zarr's default chunking.
zarr_index_list = zarr.array(numpy_arr, store=fsstore, chunks=chunk_shape_256mb, overwrite=True)

### Following code in zarr test_core.py:2653

In [None]:
# All these extra parameters are irrelevant to _partial_decompress being enabled. Path is enough
# fsstore = zarr.storage.FSStore('./fsstore', key_separator="/", auto_mkdir=True, mode='w', normalize_keys=False)

fsstore = zarr.storage.FSStore('./fsstore4MB')

In [None]:
# chunk_size=(4096,4096)

# Chunking is left to zarr's automatic choice for this store.
# overwrite=True keeps the cell re-runnable when ./fsstore4MB already contains
# an array (without it zarr refuses to create over existing metadata).
# Options tried and left out: chunk_store = fsstore_chunk, partial_decompress=True, chunks=chunk_size, dtype='i4', compressor=weak_compressor
zarr_index_list = zarr.array(numpy_arr, store=fsstore, overwrite=True)

In [None]:
zarr_index_list.info

In [None]:
print("_partial_decompress: ", zarr_index_list._partial_decompress)

<font color="orange">As we can see, partial_decompress is False. Don't panic!</font>

In [None]:
# Flush to disk
# NOTE(review): zarr.array(..., store=fsstore) in the earlier cell presumably already
# wrote the data into this store, so this call appears to write the full array a
# second time rather than flush anything — confirm whether it is still needed.
zarr.save(fsstore, numpy_arr)

In [None]:
# Load from disk

# read_only doesn't affect partial_decompress eligibility
load_partial_decomp = zarr.Array(fsstore, read_only=True, partial_decompress=True)

In [None]:
load_partial_decomp._partial_decompress

#### <font color="orange">Solved! Partial_decompress=True</font>

In [None]:
load_partial_decomp.info

In [None]:
# del numpy_arr # Don't need this anymore - save 3GB

Sometimes the chunks are not even initialized? Sometimes two arrays are written to disk (i.e. a group). Not sure why.

In [None]:
load_partial_decomp[0, 100:105]

In [None]:
load_disk_normally = zarr.open(fsstore) # , partial_decompress=True) -- This won't work !!

In [None]:
load_disk_normally._partial_decompress

In [None]:
load_disk_normally[0, 100:105]

In [None]:
# Do the two match?
(load_partial_decomp[0, 100:105] == load_disk_normally[0, 100:105]).all()

### Let's test random access of chunks by index

In [None]:
# Read the chunk shape from the array metadata instead of hard-coding it, so it
# always matches the store opened above (expected to be (2048, 512) here).
chunk_shape = load_partial_decomp.chunks

In [None]:
# Column offsets inside the first chunk. The timing cells below read [0, index],
# i.e. they index along axis 1, so the bound has to be the chunk width
# chunk_shape[1]; bounding by chunk_shape[0] let the reads spill into
# neighbouring chunks.
rand_indices_list = np.random.randint(low=0, high=chunk_shape[1], size=50)

In [None]:
%%timeit
# Read randomly from first chunk, Using partial decompression

for index in rand_indices_list:
    load_partial_decomp[0, index]

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for index in rand_indices_list:
    load_disk_normally[0, index]

#### <font color="orange">Let's access the chunks using 2D accesses now</font>

In [None]:
rand_indices_tuples = (np.random.randint(low=0, high=chunk_shape[0], size=50), np.random.randint(low=0, high=chunk_shape[1], size=50))

In [None]:
rand_indices_tuples

In [None]:
rand_indices_tuples[0][1]

TODO set range(50) programmatically

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_partial_decomp[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_disk_normally[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

## <font color="orange"> New experiment: access one point per chunk in the whole array</font>

In [None]:
load_partial_decomp.info

In [None]:
32**2

In [None]:
from itertools import product

In [None]:
# One fixed sample point inside every chunk of the array.
# cdata_shape is the number of chunks along each axis (32 x 32 for this store),
# so the grid size no longer has to be hard-coded.
n_chunk_rows, n_chunk_cols = load_partial_decomp.cdata_shape

# 2D point access: offset by 50 in both directions so we do not just read the
# first row/column of each chunk. The "same" point is used in every chunk.
xes = [i * chunk_shape[0] + 50 for i in range(n_chunk_rows)]
ys = [j * chunk_shape[1] + 50 for j in range(n_chunk_cols)]

# Cartesian product: exactly one (row, col) pair per chunk.
rand_indices_whole_array = list(product(xes, ys))
rand_indices_whole_array[:10]

In [None]:
len(rand_indices_whole_array) # Should equal n_chunks

In [None]:
%%timeit
# Read randomly from whole array, randomly 1 point from each chunk

for tup in rand_indices_whole_array:
    load_partial_decomp[tup[0], tup[1]]

In [None]:
%%timeit
# Read randomly from whole array, randomly 1 point from each chunk
# TODO Don't know if my indexing is correct!!

for tup in rand_indices_whole_array:
    load_disk_normally[tup[0], tup[1]]

In [None]:
print("Speedup: ", 1.34 / .9)

### <font color="orange">Let's test advanced mask indexing</font>

In [None]:
# Create a 0-1 boolean mask with size same as Whole array
#    Don't know how to do just 1 chunk masking
bool_mask = np.random.randint(low=0, high=2, size=original_shape, dtype=bool)
bool_mask

In [None]:
%%timeit
# Partial decompress

# _ to prevent printing
_ = load_partial_decomp.get_mask_selection(bool_mask)

In [None]:
%%timeit
# NO Partial decompress

# _ to prevent printing
_ = load_disk_normally.get_mask_selection(bool_mask)

## <font color="red">Trying 16MB chunks. partial_decompress Benefits should be bigger for bigger chunks</font>

In [None]:
# 16 MiB chunks: 4096 x 1024 int32 values per chunk.
# The store, chunk shape and array were previously referred to by a mix of
# "8mb" and "16mb" names that never matched each other (NameError); everything
# now uses one consistent set of names in a single cell.
fsstore_16mb = zarr.storage.FSStore("./fsstore16MBChunk")
chunk_shape = (4096, 1024)
# zarr.array() writes all chunks to the store; overwrite=True keeps the cell
# re-runnable. No separate zarr.save() is needed afterwards.
zarr_index_list_16MB_Chunk = zarr.array(numpy_arr, store=fsstore_16mb, chunks=chunk_shape, overwrite=True)
load_partial_decomp_16mb = zarr.Array(fsstore_16mb, partial_decompress=True)
load_disk_normally_16mb = zarr.Array(fsstore_16mb, partial_decompress=False)

In [None]:
rand_indices_tuples = (np.random.randint(low=0, high=chunk_shape[0], size=50), np.random.randint(low=0, high=chunk_shape[1], size=50))

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_partial_decomp_16mb[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_disk_normally_16mb[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

## <font color="red">64 MB chunks</font>

In [None]:
# 64 MiB chunks: 8192 x 2048 int32 values per chunk.
fsstore = zarr.storage.FSStore("./fsstore64MBChunk")
chunk_shape = (8192,2048)
# zarr.array() writes every chunk to the store; overwrite=True keeps the cell
# re-runnable. The former zarr.save(fsstore, zarr_index_list) call is removed:
# it re-created the array on the very store it was being read from, which is
# at best a second full write and risks losing chunks.
zarr_index_list = zarr.array(numpy_arr, store=fsstore, chunks = chunk_shape, overwrite=True)
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)
# 50 random (row, col) positions inside the first chunk.
rand_indices_tuples = (np.random.randint(low=0, high=chunk_shape[0], size=50), np.random.randint(low=0, high=chunk_shape[1], size=50))

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_partial_decomp[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_disk_normally[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

## <font color="red">256MB Chunk</font>

In [None]:
# 256 MiB chunks: 16384 x 4096 int32 values per chunk.
fsstore = zarr.storage.FSStore("./fsstore256MBChunk")
chunk_shape = (16384,4096)
# zarr.array() writes every chunk to the store; overwrite=True keeps the cell
# re-runnable. The former zarr.save(fsstore, zarr_index_list) call is removed:
# it re-created the array on the very store it was being read from, which is
# at best a second full write and risks losing chunks.
zarr_index_list = zarr.array(numpy_arr, store=fsstore, chunks = chunk_shape, overwrite=True)
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)
# 50 random (row, col) positions inside the first chunk.
rand_indices_tuples = (np.random.randint(low=0, high=chunk_shape[0], size=50), np.random.randint(low=0, high=chunk_shape[1], size=50))

In [None]:
fsstore = zarr.storage.FSStore("./fsstore256MBChunk")

In [None]:
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)

In [None]:
load_partial_decomp._partial_decompress

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_partial_decomp[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_disk_normally[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

## <font color="red">1GB Chunk - write then read</font>

In [None]:
# 1 GiB chunks: 32768 x 8192 int32 values per chunk.
fsstore = zarr.storage.FSStore("./fsstore1GBChunk")
chunk_shape = (32768,8192)
# zarr.array() writes every chunk to the store; overwrite=True keeps the cell
# re-runnable. The former zarr.save(fsstore, zarr_index_list) call is removed:
# it re-created the array on the very store it was being read from, which is
# at best a second full write and risks losing chunks.
zarr_index_list = zarr.array(numpy_arr, store=fsstore, chunks = chunk_shape, overwrite=True)
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)
# 50 random (row, col) positions inside the first chunk.
rand_indices_tuples = (np.random.randint(low=0, high=chunk_shape[0], size=50), np.random.randint(low=0, high=chunk_shape[1], size=50))

In [None]:
%%timeit
# Read randomly from first chunk

for i in range(50):
    load_partial_decomp[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

In [None]:
%%timeit
# Read randomly from first chunk, Not using partial decompression

for i in range(50):
    load_disk_normally[rand_indices_tuples[0][i], rand_indices_tuples[1][i]]

In [None]:
# One fixed sample point inside every chunk. The number of chunks per axis is
# read from cdata_shape (2 x 2 with 1 GiB chunks) rather than hard-coded.
n_chunk_rows, n_chunk_cols = load_partial_decomp.cdata_shape

# Offset by 50 so the point is not on the first row/column of the chunk.
xes = [i * chunk_shape[0] + 50 for i in range(n_chunk_rows)]
ys = [j * chunk_shape[1] + 50 for j in range(n_chunk_cols)]

# Cartesian product: exactly one (row, col) pair per chunk.
rand_indices_whole_array = list(product(xes, ys))
rand_indices_whole_array

In [None]:
%%timeit
# Read randomly from whole array, randomly 1 point from each chunk

for tup in rand_indices_whole_array:
    load_partial_decomp[tup[0], tup[1]]

In [None]:
%%timeit
# Read randomly from whole array, randomly 1 point from each chunk

for tup in rand_indices_whole_array:
    load_disk_normally[tup[0], tup[1]]

### <font color="red">1. Run for longer than 50 accesses, to see whether the big chunk is deserialized only once or every time you index it


2. Figure out the decompression unit size. Experiment: access ranges of 1K, 4K, 16K, 64K - 1M - 16M
</font>

3. Can we change the compression size? What are the tradeoffs with different sizes?


4. Find the crossover between one-at-a-time IO and loading whole chunks and then indexing points. See how many points you need to index for partial_decompress to be slower


5. Mask with a sparse mask, p=0.05

# New Experiments - just read existing data (don't create new FSStores)

In [None]:
import numpy as np
import zarr

# Define data location
fsstore = zarr.storage.FSStore("./fsstore256MBChunk")
# Load data - partial_decompress and full decompress
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)

In [None]:
load_partial_decomp._partial_decompress

In [None]:
load_partial_decomp.info

<font color="orange">The info above is needed to determine the chunk shape. It is also available programmatically as `load_partial_decomp.chunks`.</font>

In [None]:
# Read the chunk shape from the stored array metadata rather than copying it
# by hand from `.info`; expected to be (16384, 4096) for ./fsstore256MBChunk.
chunk_shape = load_partial_decomp.chunks

### Experiment 1: run for longer access (>50)

In [None]:
# Create random (row, col) indices inside the first chunk to test access speed.

# Number of points per run: 50, 100, 150, ..., 2000 (40 lengths).
# The stop is 2001 because range() excludes its end value; range(50, 2000, 50)
# silently stopped at 1950.
index_lengths = range(50, 2001, 50)

# One (n_points, 2) array per length: column 0 holds row indices and column 1
# holds column indices, both bounded by the chunk shape.
indices_various_len = [
    np.column_stack((
        np.random.randint(low=0, high=chunk_shape[0], size=n_points),
        np.random.randint(low=0, high=chunk_shape[1], size=n_points),
    ))
    for n_points in index_lengths
]

In [None]:
def access_random_indices_chunk(i, loaded_array, indices=None):
    """Read a batch of single points from ``loaded_array``, one at a time.

    Used to find how many points-per-chunk make ``partial_decompress=True``
    advantageous. Kept as a separate function so it is easy to ``%timeit``.

    Args:
        i: position of the index batch to use, i.e. which entry of
            ``indices`` (and therefore how many random accesses) to run.
        loaded_array: array-like supporting ``loaded_array[row, col]``
            (here a zarr array).
        indices: optional sequence of ``(n_points, 2)`` arrays of
            ``(row, col)`` pairs. Defaults to the notebook-level
            ``indices_various_len`` so existing two-argument calls behave
            exactly as before.

    Returns:
        None. The values read are discarded; only the access cost matters.
    """
    if indices is None:
        indices = indices_various_len

    # Each entry of the batch is one (row, col) pair; read them one by one.
    for row, col in indices[i]:
        loaded_array[row, col]

#### partial_decompress

In [None]:
partial_decomp_chunk_access = []

for i in range(len(index_lengths)):
    _ = %timeit -o access_random_indices_chunk(i, load_partial_decomp)
    partial_decomp_chunk_access.append(_)

#### no partial_decompress

In [None]:
no_decomp_chunk_access = []

for i in range(len(index_lengths[:15])): # Stopped at 15/40 since it's so slow
    _ = %timeit -o access_random_indices_chunk(i, load_disk_normally)
    no_decomp_chunk_access.append(_)

##### Pickle above results to file - let's not lose experiments

In [None]:
import pickle

# Context managers close both files even if pickling fails part-way.
# The file variables stay bound afterwards, so the explicit close() calls in
# the following cell remain harmless no-ops on already-closed files.
with open("partial_random_chunk_acc.pickle", 'wb') as partial_random_chunk_acc_file:
    pickle.dump(partial_decomp_chunk_access, partial_random_chunk_acc_file)

with open("normal_random_chunk_acc.pickle", "wb") as normal_random_chunk_acc_file:
    pickle.dump(no_decomp_chunk_access, normal_random_chunk_acc_file)

In [None]:
partial_random_chunk_acc_file.close()
normal_random_chunk_acc_file.close()

### Experiment 2 - Stabbing queries

<font color="red"> Loading 4MB chunks. The 256MB chunking leads to only 16 chunks (4 x 4)</font>

In [None]:
import numpy as np
import zarr

# Define data location
fsstore = zarr.storage.FSStore("./fsstore4MB")
# Load data - partial_decompress and full decompress
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)

In [None]:
load_partial_decomp._partial_decompress

In [None]:
load_partial_decomp.info

In [None]:
chunk_shape = (2048, 512)

In [None]:
from itertools import product

# Centre point of every chunk in the array.
# The chunk grid size comes from cdata_shape (32 x 32 for this store). The
# centre of chunk i along an axis is i * chunk_len + chunk_len // 2; the earlier
# i * chunk_len / 2 for i in 1..30 landed on chunk boundaries for every even i
# and only covered the first 15 chunks along each axis.
n_chunk_rows, n_chunk_cols = load_partial_decomp.cdata_shape

xes = [i * chunk_shape[0] + chunk_shape[0] // 2 for i in range(n_chunk_rows)]
ys = [j * chunk_shape[1] + chunk_shape[1] // 2 for j in range(n_chunk_cols)]

# Cartesian product: one (row, col) centre per chunk.
middle_indices_all_chunks = list(product(xes, ys))
middle_indices_all_chunks[:100]

In [None]:
# Half-widths of the stab window along x: 1, 2, 4, ..., 1024. A stab reads
# [-size : +size] around the centre, so the full extent is twice these values.
x_dirs = sorted(1 << exponent for exponent in range(11))

# Half-widths along y: a quarter of each x half-width, except that the two
# smallest entries (which would be 0) are replaced by 256 before sorting.
# After sorting this gives 1, 2, 4, ..., 128 followed by three 256s.
y_dirs = sorted([256, 256] + [half_width // 4 for half_width in x_dirs[2:]])

print("x_dirs: ", x_dirs)
print("new y_dirs: ", y_dirs)

In [None]:
def stabbing_access(x_stab_size, y_stab_size, target_array, centers=None):
    """Read a rectangular window around each centre point ("stabbing query").

    For every ``(row, col)`` centre, reads
    ``target_array[row - x_stab_size : row + x_stab_size,
                   col - y_stab_size : col + y_stab_size]``.

    Args:
        x_stab_size: half-extent of the window along axis 0.
        y_stab_size: half-extent of the window along axis 1.
        target_array: array-like supporting 2D slice indexing (here a zarr array).
        centers: optional iterable of ``(row, col)`` centre points. Defaults to
            the notebook-level ``middle_indices_all_chunks`` so existing
            three-argument calls behave as before.

    Returns:
        None. The data read is discarded; only the access cost matters.
    """
    if centers is None:
        centers = middle_indices_all_chunks

    for row, col in centers:
        # Clamp the lower bounds at 0: a negative slice start would be
        # interpreted relative to the end of the axis and read the wrong region.
        row_start = max(0, row - x_stab_size)
        col_start = max(0, col - y_stab_size)
        target_array[row_start:row + x_stab_size, col_start:col + y_stab_size]

In [None]:
%%timeit
# Stab size of 1 - single point. Complicated indexing prevents me from putting it above

for tup in middle_indices_all_chunks:
    load_partial_decomp[tup[0], tup[1]]

In [None]:
stabbing_partial = [] # only 3 runs

for i in range(len(x_dirs)):
    _ = %timeit -r 3 -o stabbing_access(x_dirs[i], y_dirs[i], load_partial_decomp)
    stabbing_partial.append(_)

In [None]:
%%timeit
# Stab size of 1 - single point. Complicated indexing prevents me from putting it above

for tup in middle_indices_all_chunks:
    load_disk_normally[tup[0], tup[1]]

In [None]:
stabbing_normal = [] # only 3 runs

for i in range(len(x_dirs)):
    _ = %timeit -r 3 -o stabbing_access(x_dirs[i], y_dirs[i], load_disk_normally)
    stabbing_normal.append(_)

In [None]:
s_part2 = stabbing_partial.copy()

In [None]:
s_part2

In [None]:
# stabbing_partial = [] # only 3 runs

# for i in range(len(x_dirs)):
# One-off, single-repeat (-r 1) timing of the (1024, 256) stab size, appended
# to the existing stabbing_partial list.
# NOTE(review): every re-run of this cell appends another entry — presumably it
# was only meant to fill in a run missing from the loop above; confirm.
_ = %timeit -r 1 -o stabbing_access(1024, 256, load_partial_decomp)
stabbing_partial.append(_)

In [None]:
len(x_dirs)

In [None]:
x_dirs[10]

In [None]:
# Pickle

import pickle

# Context managers close both files even if pickling fails part-way,
# so no explicit close() calls are needed.
with open("stabbing_partial.pickle", 'wb') as stabbing_partial_file:
    pickle.dump(stabbing_partial, stabbing_partial_file)

with open("stabbing_normal.pickle", "wb") as stabbing_normal_file:
    pickle.dump(stabbing_normal, stabbing_normal_file)

### Experiment 3 - Large Square In-Chunk Access

Idea: find how big the "partial" decompression piece is

In [None]:
# Load 256MB chunks afresh

import numpy as np
import zarr

# Define data location
fsstore = zarr.storage.FSStore("./fsstore256MBChunk")
# Load data - partial_decompress and full decompress
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)
# Take the chunk shape from the stored metadata so it cannot drift from the
# store actually opened; expected to be (16384, 4096) for this store.
chunk_shape = load_partial_decomp.chunks

In [None]:
load_disk_normally[0, 0:10]

In [None]:
# Number of elements per sequential read: 2**12 ... 2**26.
# With 4-byte int32 values that is 16 KiB - 256 MiB per access.
access_sizes = [2**i for i in range(12, 27)]
access_sizes

In [None]:
[int(a / chunk_shape[1]) for a in access_sizes]

In [None]:
decomp_sequential_access = []

for acc_size in access_sizes:
    _ = %timeit -o load_partial_decomp[0:int(acc_size / chunk_shape[1]), 0:4096]
    decomp_sequential_access.append(_)

In [None]:
no_decomp_sequential_access = []

for acc_size in access_sizes:
    _ = %timeit -o load_disk_normally[0:int(acc_size / chunk_shape[1]), 0:4096]
    no_decomp_sequential_access.append(_)

In [None]:
load_disk_normally[0:int(acc_size / chunk_shape[1]), 0:4096].shape

In [None]:
# Pickle above

import pickle

# Context managers close both files even if pickling fails part-way,
# so no explicit close() calls are needed.
with open("partial_sequential_chunk_acc.pickle", 'wb') as partial_sequential_chunk_acc_file:
    pickle.dump(decomp_sequential_access, partial_sequential_chunk_acc_file)

with open("normal_sequential_chunk_acc.pickle", "wb") as normal_sequential_chunk_acc_file:
    pickle.dump(no_decomp_sequential_access, normal_sequential_chunk_acc_file)

### Experiment 1 Plotting - In-chunk Random access

In [None]:
exp1_partial_means = [x.average for x in partial_decomp_chunk_access]
exp1_partial_stds = [x.stdev for x in partial_decomp_chunk_access]

exp1_normal_means = [x.average for x in no_decomp_chunk_access]
exp1_normal_stds = [x.stdev for x in no_decomp_chunk_access]

In [None]:
partial_decomp_chunk_access

In [None]:
no_decomp_chunk_access

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 7))

# Trim the x values to the number of timings each run actually produced, so the
# plot keeps working if a timing loop was cut short (the "Full" run only covers
# the first index lengths) instead of relying on a hard-coded [0:15] slice.
ax.errorbar(index_lengths[:len(exp1_partial_means)], exp1_partial_means, yerr=exp1_partial_stds, fmt='.', label="Partial")
ax.errorbar(index_lengths[:len(exp1_normal_means)], exp1_normal_means, yerr=exp1_normal_stds, fmt='.', label="Full")

ax.set_xlabel("# Points accessed")
ax.set_ylabel("Time (s)")
ax.set_title("In-chunk Random access (for loop) - 256MB chunks")

ax.legend()
plt.show()

## Same experiment, multiple chunk sizes

#### Access 64KB - 1GB on arrays chunked in various sizes

In [None]:
# Load 4MB chunks afresh

import numpy as np
import zarr

# Define data location
fsstore = zarr.storage.FSStore("./fsstore4MB")
# Load data - partial_decompress and full decompress
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)
# Take the chunk shape from the stored metadata so it cannot drift from the
# store actually opened; expected to be (2048, 512) for this store.
chunk_shape = load_partial_decomp.chunks

In [None]:
load_disk_normally.info

In [None]:
load_disk_normally[0, 0:10]

In [None]:
# Number of elements per sequential read: 2**12 ... 2**20.
# Remember int32 = 4 bytes, so that is 16 KiB - 4 MiB per access.
access_sizes = [2**i for i in range(12, 21)]
access_sizes

In [None]:
[int(a / chunk_shape[1]) for a in access_sizes]

In [None]:
decomp_sequential_access = []

for acc_size in access_sizes:
    _ = %timeit -o load_partial_decomp[0:int(acc_size / chunk_shape[1]), 0:512]
    decomp_sequential_access.append(_)

In [None]:
no_decomp_sequential_access = []

for acc_size in access_sizes:
    _ = %timeit -o load_disk_normally[0:int(acc_size / chunk_shape[1]), 0:512]
    no_decomp_sequential_access.append(_)

In [None]:
# Pickle above

import pickle

# Context managers close both files even if pickling fails part-way,
# so no explicit close() calls are needed.
with open("partial_sequential_4MBchunk_acc.pickle", 'wb') as partial_sequential_chunk_acc_file:
    pickle.dump(decomp_sequential_access, partial_sequential_chunk_acc_file)

with open("normal_sequential_4MBchunk_acc.pickle", "wb") as normal_sequential_chunk_acc_file:
    pickle.dump(no_decomp_sequential_access, normal_sequential_chunk_acc_file)

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): this cell re-plots the Experiment 1 (random access, 256MB chunks)
# results although it sits after the 4MB sequential-access timings — presumably a
# copy of the earlier plotting cell; confirm which data was meant to be shown.
fig, ax = plt.subplots(figsize=(10, 7))

# Trim the x values to the number of timings each run actually produced instead
# of relying on a hard-coded [0:15] slice.
ax.errorbar(index_lengths[:len(exp1_partial_means)], exp1_partial_means, yerr=exp1_partial_stds, fmt='.', label="Partial")
ax.errorbar(index_lengths[:len(exp1_normal_means)], exp1_normal_means, yerr=exp1_normal_stds, fmt='.', label="Full")

ax.set_xlabel("# Points accessed")
ax.set_ylabel("Time (s)")
ax.set_title("In-chunk Random access (for loop) - 256MB chunks")

ax.legend()
plt.show()

### Experiment 2 Plotting - Stabbing across-chunk access

In [None]:
exp2_partial_means = [x.average for x in stabbing_partial]
exp2_partial_stds = [x.stdev for x in stabbing_partial]

exp2_normal_means = [x.average for x in stabbing_normal]
exp2_normal_stds = [x.stdev for x in stabbing_normal]

# Can't label x-axis with tuples

In [None]:
# Each stab reads a (2 * x) by (2 * y) window around the centre, so the number
# of points accessed is the window area. The earlier 2*x + 2*y was the sum of
# the two side lengths, not a point count.
exp2_points_accessed = [(2 * x) * (2 * y) for x, y in zip(x_dirs, y_dirs)]

exp2_points_accessed

In [None]:
stabbing_partial

In [None]:
stabbing_normal

In [None]:
exp2_points_accessed

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 7))

# Trim the x values to the number of timings each run actually produced, rather
# than a hard-coded [0:10] slice that breaks whenever the list lengths change.
ax.errorbar(exp2_points_accessed[:len(exp2_partial_means)], exp2_partial_means, yerr=exp2_partial_stds, fmt='.', label="Partial")
ax.errorbar(exp2_points_accessed[:len(exp2_normal_means)], exp2_normal_means, yerr=exp2_normal_stds, fmt='.', label="Full")

ax.set_xlabel("# Points accessed")
ax.set_ylabel("Time (s)")
ax.set_title("Stabbing across-chunk access - 4MB chunks")

ax.legend()
plt.show()

### Experiment 3 Plotting - Large Square In-Chunk access

In [None]:
exp3_partial_means = [x.average for x in decomp_sequential_access]
exp3_partial_stds = [x.stdev for x in decomp_sequential_access]

exp3_normal_means = [x.average for x in no_decomp_sequential_access]
exp3_normal_stds = [x.stdev for x in no_decomp_sequential_access]

In [None]:
decomp_sequential_access

In [None]:
no_decomp_sequential_access

In [None]:
access_sizes

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,7))

plt.errorbar(access_sizes, exp3_partial_means, yerr=exp3_partial_stds, fmt='.', label="Partial")
plt.errorbar(access_sizes, exp3_normal_means, yerr=exp3_normal_stds, fmt='.', label="Full")

plt.xlabel("# Points accessed")
plt.ylabel("Time (s)")
plt.title("Large Square In-Chunk access - 256MB chunks")

plt.legend()
plt.show()

## Experiment 4 - Lagrangian 3D point access

In [None]:
import numpy as np

# 16384 x 4096 x 80 int32 values = 20 GiB in memory, with values in [0, 1000).
numpy_3d_arr = np.random.randint(low=0, high=1000, size=(16384, 4096, 80), dtype=np.int32)

In [None]:
np.info(numpy_3d_arr)

In [None]:
# This fails bcs. it runs out of memory. Cannot define dtype manually
# numpy_3d_arr = np.random.random(size=(65536, 16384, 25), dtype=np.float)

In [None]:
import zarr

fsstore = zarr.storage.FSStore('./fsstore3D')
# Chunking is left to zarr's automatic choice (no chunks= argument).
# zarr.array() writes every chunk to the store; overwrite=True keeps the cell
# re-runnable. The former zarr.save(fsstore, numpy_3d_arr) call is removed: it
# re-created the same array at the same location, i.e. a second full write.
zarr_index_list = zarr.array(numpy_3d_arr, store=fsstore, overwrite=True)

In [None]:
del numpy_3d_arr

In [None]:
# Load the 3D array store afresh

import numpy as np
import zarr

# Define data location
fsstore = zarr.storage.FSStore("./fsstore3D")
# Load data - partial_decompress and full decompress
load_partial_decomp = zarr.Array(fsstore, partial_decompress=True)
load_disk_normally = zarr.Array(fsstore, partial_decompress=False)
# chunk_shape = (2048, 512)

In [None]:
load_partial_decomp.info