In [1]:
import numpy as np
from tqdm import tqdm
import wandb as wb
import imageio
from io import BytesIO
import pickle
import zarr
from pathlib import Path
import os
import random

from src.codecs.imagecodecs import Jpeg2k, register_codecs

In [2]:
# Register the project's image codecs (from src.codecs.imagecodecs) before any store is opened.
# NOTE(review): presumably this is what lets zarr resolve the Jpeg2k compressor — confirm.
register_codecs()

In [3]:
def create_gif_or_mp4(np_images, filename, fps=10):
    """Write a sequence of frames to ``filename`` as an animation.

    Args:
        np_images: Iterable of frames; each one is passed unchanged to the
            imageio writer's ``append_data``.
        filename: Destination path handed to ``imageio.get_writer``.
            NOTE(review): going by the function name, a ``.gif`` or ``.mp4``
            path is expected — confirm which formats are available.
        fps: Frames per second forwarded to the writer (default 10).

    Prints a confirmation line once the writer has been closed.
    """
    with imageio.get_writer(filename, fps=fps) as out:
        for frame in tqdm(np_images):
            out.append_data(frame)
    print(f"File saved as {filename}")

In [2]:
# Root directory of the datasets: FURNITURE_DATA_DIR when it is set,
# otherwise the relative "data" directory.
data_base_dir = Path(os.getenv("FURNITURE_DATA_DIR", "data"))

In [5]:
# Open the old one_leg image dataset read-only, so nothing in this notebook can modify it.
# NOTE(review): absolute, machine-specific path — it is not derived from data_base_dir.
old_zarr = zarr.open(
    "/data/scratch/ankile/furniture-data-old/data/processed/sim/image_small/one_leg/data.zarr",
    mode="r",
)

In [6]:
# Shape of a single entry of the color_image1 array.
old_zarr["color_image1"][0].shape

(224, 224, 3)

In [7]:
# Shape of the whole color_image1 array; the first axis indexes the entries.
old_zarr["color_image1"].shape

(370234, 224, 224, 3)

In [8]:
# Codec passed as `compressor=` when the compressed test arrays are created below.
# NOTE(review): level=75 is presumably a quality setting — confirm against the Jpeg2k codec.
image_compressor = Jpeg2k(level=75)

In [9]:
# Destination store for the compressed copy. mode="w" creates it fresh,
# replacing anything that already exists at this path.
output_store = zarr.open(
    "/data/scratch/ankile/furniture-data-old/data/processed/sim/image_small/one_leg/data_compressed.zarr",
    mode="w",
)

In [10]:
# Create the target array for the first 10k frames, compressed with image_compressor.
# The chunk tuple has to cover all four axes: a 3-tuple such as (224, 224, 3)
# is applied to the *leading* axes and padded with the trailing shape, which
# yields (224, 224, 3, 3) chunks instead of one image per chunk.
_ = output_store.require_dataset(
    name="color_image1",
    shape=(10_000,) + old_zarr["color_image1"].shape[1:],
    chunks=(1, 224, 224, 3),
    compressor=image_compressor,
    dtype=np.uint8,
)

# Copy the first 10k frames across in a single assignment.
output_store["color_image1"][:] = old_zarr["color_image1"][:10_000]

## Have a look at the compressed data to verify that it looks alright

In [6]:
# Re-open the compressed copy purely for inspection. Without an explicit mode
# zarr.open falls back to "a" (read/write, create if missing), which would
# quietly create an empty store on a mistyped path and leave the data writable.
store = zarr.open(
    "/data/scratch/ankile/furniture-data-old/data/processed/sim/image_small/one_leg/data_compressed.zarr",
    mode="r",
)

## Inspect the current image data file we're using

In [8]:
# Image dataset currently in use, resolved relative to data_base_dir.
datapath = data_base_dir / "processed/sim/image_small/one_leg/data.zarr"

# Read-only handle. This rebinds `store`, replacing the handle opened for the compressed copy.
store = zarr.open(datapath, mode="r")

In [9]:
# Array shape alongside its chunk shape.
store["color_image1"].shape, store["color_image1"].chunks

((370234, 224, 224, 3), (1, 224, 224, 3))

In [10]:
# Uncompressed size of the array in GB (decimal). `nbytes` counts the raw element
# bytes, not what is on disk — use `nbytes_stored` for the actual storage footprint.
store["color_image1"].nbytes / 1e9

55.730583552

In [None]:
# Time a sequential pass over the whole array in windows of 16 entries.
# The array handle is resolved once up front so that each iteration measures
# only the read itself, not a fresh group lookup.
images = store["color_image1"]
for i in tqdm(range(0, len(images), 16)):
    images[i : i + 16]

 78%|███████▊  | 17987/23140 [35:23<10:08,  8.47it/s]  


KeyboardInterrupt: 

## Test different chunking strategies

### JPEG compression with one image per chunk

In [23]:
# Scratch store for the "one image per chunk, JPEG-compressed" experiment.
# mode="w" starts from an empty store, overwriting any previous run.
output1 = zarr.open(
    "/data/scratch/ankile/tmp-compression-test/data_chunk_1_jpeg.zarr",
    mode="w",
)

In [25]:
# Allocate room for the first 10k frames: one frame per chunk, compressed
# with image_compressor, same per-frame shape as the source array.
_ = output1.require_dataset(
    name="color_image1",
    shape=(10_000, *store["color_image1"].shape[1:]),
    chunks=(1, 224, 224, 3),
    compressor=image_compressor,
    dtype=np.uint8,
)

# Fill it with the first 10k frames of the source in a single assignment.
output1["color_image1"][:] = store["color_image1"][:10_000]

In [4]:
# Shuffled order of the 10_000 start indices used by the random-read benchmarks.
# A dedicated seeded generator keeps the order identical across kernel restarts
# (so timings are comparable) without touching the global `random` state.
random_reads = random.Random(0).sample(range(0, 10_000), 10_000)

In [None]:
# Time 10_000 random 16-entry window reads from the one-image-per-chunk JPEG store.
# The array handle is resolved once so the loop measures only the reads.
# Windows that start within the last 15 indices are clipped at the end of the
# 10_000-entry array and therefore return fewer than 16 entries.
jpeg_images = output1["color_image1"]
for i in tqdm(random_reads):
    jpeg_images[i : i + 16]

### Default compression with 10 images per chunk    

In [11]:
# Scratch store for the "10 images per chunk" experiment; mode="w" starts it empty.
output2 = zarr.open(
    "/data/scratch/ankile/tmp-compression-test/data_chunk_10_default.zarr",
    mode="w",
)

# No compressor is passed, so the array is created with zarr's default one.
_ = output2.require_dataset(
    name="color_image1",
    shape=(50_000, *store["color_image1"].shape[1:]),
    chunks=(10, 224, 224, 3),
    dtype=np.uint8,
)

# Fill it with the first 50k frames of the source in a single assignment.
output2["color_image1"][:] = store["color_image1"][:50_000]

In [14]:
# Time the 10_000 random 16-entry window reads against the 10-images-per-chunk store.
# The array handle is resolved once so the loop measures only the reads.
chunk10_images = output2["color_image1"]
for i in tqdm(random_reads):
    chunk10_images[i : i + 16]

100%|██████████| 10000/10000 [00:20<00:00, 482.74it/s]


### Default compression with 100 images per chunk

In [15]:
# Scratch store for the "100 images per chunk" experiment; mode="w" starts it empty.
output3 = zarr.open(
    "/data/scratch/ankile/tmp-compression-test/data_chunk_100_default.zarr",
    mode="w",
)

# No compressor is passed, so the array is created with zarr's default one.
_ = output3.require_dataset(
    name="color_image1",
    shape=(50_000, *store["color_image1"].shape[1:]),
    chunks=(100, 224, 224, 3),
    dtype=np.uint8,
)

# Fill it with the first 50k frames of the source in a single assignment.
output3["color_image1"][:] = store["color_image1"][:50_000]

In [None]:
# Time the 10_000 random 16-entry window reads against the 100-images-per-chunk store.
# The array handle is resolved once so the loop measures only the reads.
chunk100_images = output3["color_image1"]
for i in tqdm(random_reads):
    chunk100_images[i : i + 16]

100%|██████████| 10000/10000 [00:30<00:00, 333.10it/s]


### Default compression with 32 images per chunk


In [None]:
# Scratch store for the "32 images per chunk" experiment; mode="w" starts it empty.
output4 = zarr.open(
    "/data/scratch/ankile/tmp-compression-test/data_chunk_32_default.zarr",
    mode="w",
)

# No compressor is passed, so the array is created with zarr's default one.
_ = output4.require_dataset(
    name="color_image1",
    shape=(50_000, *store["color_image1"].shape[1:]),
    chunks=(32, 224, 224, 3),
    dtype=np.uint8,
)

# Fill it with the first 50k frames of the source in a single assignment.
output4["color_image1"][:] = store["color_image1"][:50_000]

In [None]:
# Time the 10_000 random 16-entry window reads against the 32-images-per-chunk store.
# The array handle is resolved once so the loop measures only the reads.
chunk32_images = output4["color_image1"]
for i in tqdm(random_reads):
    chunk32_images[i : i + 16]

100%|██████████| 10000/10000 [00:20<00:00, 482.22it/s]


## Make a copy of the original data file with chunksize 32

In [44]:
# Destination of the re-chunked (32 entries per chunk) copy, next to the original dataset.
outdatapath = data_base_dir / "processed/sim/image_small/one_leg/data_chunks_32.zarr"

In [48]:
# Make a new dataset with the same data but 32 entries per chunk along the first axis.
output_store = zarr.open(outdatapath, mode="w")

# Copy in slabs that are a whole number of destination chunks (156 * 32 = 4_992).
# With a slab size that is not a multiple of 32, the chunk straddling each slab
# boundary is written twice, the second time via read-modify-write.
rechunk_size = 32
copy_batch = 156 * rechunk_size

for key in store.keys():
    print(key)
    source = store[key]  # resolved once per key instead of on every slab
    target = output_store.require_dataset(
        name=key,
        shape=source.shape,
        chunks=(rechunk_size,) + source.chunks[1:],
        dtype=source.dtype,
    )

    for i in tqdm(range(0, len(source), copy_batch)):
        target[i : i + copy_batch] = source[i : i + copy_batch]

color_image1


  8%|▊         | 6/75 [02:31<29:05, 25.29s/it]


KeyboardInterrupt: 

## Look at the new, full dataset with chunksize 32

In [3]:
# Path of the complete dataset stored with 32 entries per chunk.
# NOTE(review): this reads "data_batch_32.zarr", whereas the copy cell earlier in this
# notebook writes "data_chunks_32.zarr" — confirm which file is the intended one.
datapath32 = data_base_dir / "processed/sim/image_small/one_leg/data_batch_32.zarr"

# Read-only handle for benchmarking.
store32 = zarr.open(datapath32, mode="r")

In [5]:
# Time the random 16-entry window reads against the chunk-32 dataset.
# The array handle is resolved once so the loop measures only the reads.
# random_reads only contains start indices 0..9_999, so just that leading
# range of the array is sampled.
images32 = store32["color_image1"]
for i in tqdm(random_reads):
    images32[i : i + 16]

 26%|██▋       | 2640/10000 [00:37<01:45, 69.77it/s] 


KeyboardInterrupt: 

## Inspect the same file, but located on the Stata scratch NFS mount