In [None]:
# Install zarr and compression libraries
!pip install zarr numpy matplotlib blosc lz4 zstandard numcodecs

# Zarr Tutorial - Chunked & Compressed Array Storage

**Zarr** is a library for chunked, compressed, N-dimensional arrays:
- **Compression**: Can substantially reduce storage size (often 2-100x for sparse or structured data; random noise barely compresses)
- **Chunking**: Efficient partial loading of massive arrays
- **Performance**: Optimized I/O for scientific computing
- **Compatibility**: Works with NumPy, Dask, and cloud storage

Perfect for: satellite imagery, climate data, genomics, high-resolution simulations.

In [None]:
import zarr
import numcodecs
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from pathlib import Path
import shutil

# Folder (relative to the working directory) that receives every store
# written by this notebook; created on first run.
data_dir = Path('zarr_data')
os.makedirs(data_dir, exist_ok=True)

print("Zarr version: " + str(zarr.__version__))
print("Data directory: " + str(data_dir.absolute()))

## 📊 Basic Zarr Operations

In [None]:
# Build four 1000x1000 sample arrays with different data patterns for
# compression testing. The seed is fixed and the random draws happen in a
# fixed order (choice -> randn -> randint), so the arrays are reproducible.
np.random.seed(42)

# Very sparse: ~5% ones, the rest zeros
sparse_data = np.random.choice([0, 1], size=(1000, 1000), p=[0.95, 0.05])

# Random noise, stored as float32
random_data = np.random.randn(1000, 1000).astype(np.float32)

# Smooth pattern: a sine along the rows times a cosine along the columns
row_wave = np.sin(np.linspace(0, 20*np.pi, 1000))
col_wave = np.cos(np.linspace(0, 10*np.pi, 1000))
structured_data = row_wave[:, None] * col_wave

# Integers drawn from [0, 100)
integer_data = np.random.randint(0, 100, size=(1000, 1000), dtype=np.int32)

# Label -> array; the labels are reused in file names and plot titles
datasets = dict(
    sparse=sparse_data,
    random=random_data,
    structured=structured_data.astype(np.float32),
    integer=integer_data,
)

print("Sample datasets created:")
for name, data in datasets.items():
    print(f"  {name:10s}: {data.shape} {data.dtype} ({data.nbytes / 1024**2:.1f} MB)")

In [None]:
# Preview each dataset: the top-left 200x200 corner, one panel per dataset,
# each with its own colorbar.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for ax, (name, data) in zip(axes, datasets.items()):
    im = ax.imshow(data[:200, :200], cmap='viridis')
    ax.set_title(f'{name.title()} Data (200x200 subset)')
    fig.colorbar(im, ax=ax)

fig.tight_layout()
plt.show()

## 🗜️ Compression Comparison

In [None]:
# Compression algorithms under test, keyed by the label used in store names
# and plots. 'none' disables compression; every Blosc variant uses the
# SHUFFLE filter and differs only in inner codec and level.
compressors = {
    'none': None,
    'zlib': numcodecs.Zlib(level=6),
}
for _cname, _clevel in (('lz4', 5), ('zstd', 3), ('blosclz', 5)):
    compressors[f'blosc_{_cname}'] = numcodecs.Blosc(
        cname=_cname, clevel=_clevel, shuffle=numcodecs.Blosc.SHUFFLE
    )

def test_compression(data, name):
    """Write `data` once per entry of `compressors` and report size and timings.

    For each codec a fresh store `<data_dir>/<name>_<codec>.zarr` is created
    (an existing one is deleted first), filled with `data`, measured on disk
    and read back in full. One summary line per codec is printed.

    Args:
        data: NumPy array to store; its shape and dtype define the Zarr array.
        name: Label used in the store name and in the printed report.

    Returns:
        dict mapping codec label to a dict with keys 'size' (bytes on disk),
        'ratio' (data.nbytes / size, or 0.0 when size is 0), 'write_time'
        and 'read_time' (both in seconds).
    """
    raw_bytes = data.nbytes
    print(f"\nTesting compression on {name} data ({raw_bytes/1024**2:.1f} MB):")

    # one chunk extent per dimension, capped at 100 elements
    chunk_shape = tuple(min(100, extent) for extent in data.shape)

    report = {}
    for label, codec in compressors.items():
        target = data_dir / f'{name}_{label}.zarr'
        if target.exists():
            shutil.rmtree(target)

        # timed: array creation plus the full write
        started = time.perf_counter()
        arr = zarr.open(
            str(target),
            mode='w',
            shape=data.shape,
            dtype=data.dtype,
            chunks=chunk_shape,
            compressor=codec,
            zarr_format=2,   # force the v2 format so that `compressor=` is accepted
        )
        arr[:] = data
        write_time = time.perf_counter() - started

        # size on disk: every regular file below the store directory
        on_disk = 0
        for entry in target.rglob('*'):
            if entry.is_file():
                on_disk += entry.stat().st_size
        ratio = raw_bytes / on_disk if on_disk else 0.0

        # timed: full read back (the values themselves are discarded)
        started = time.perf_counter()
        arr[:]
        read_time = time.perf_counter() - started

        report[label] = {
            'size': on_disk,
            'ratio': ratio,
            'write_time': write_time,
            'read_time': read_time,
        }

        print(f"  {label:12s}: {on_disk/1024**2:6.1f} MB "
              f"({ratio:5.1f}x) W:{write_time:.3f}s R:{read_time:.3f}s")
    return report

# Benchmark every sample dataset; results are kept per dataset label
all_results = {}
for name, data in datasets.items():
    all_results[name] = test_compression(data, name)


In [None]:
# Visualize compression results: one heatmap per metric,
# rows = datasets, columns = compressors.
def _metric_matrix(results, row_keys, col_keys, metric, divisor=1):
    """Collect results[row][col][metric] / divisor into a (rows x cols) array."""
    return np.array([[results[row][col][metric] / divisor for col in col_keys]
                     for row in row_keys])


def _draw_heatmap(ax, values, title, cmap, row_labels, col_labels, cell_format=None):
    """Render `values` on `ax` as a labelled heatmap with its own colorbar.

    Args:
        ax: Matplotlib axes to draw on.
        values: 2D array, one row per entry of `row_labels` and one column
            per entry of `col_labels`.
        title: Panel title.
        cmap: Matplotlib colormap name.
        row_labels: Tick labels for the y axis.
        col_labels: Tick labels for the x axis (drawn rotated by 45 degrees).
        cell_format: Optional `str.format` template; when given, every cell
            is annotated with its formatted value.
    """
    image = ax.imshow(values, cmap=cmap)
    ax.set_xticks(range(len(col_labels)))
    ax.set_xticklabels(col_labels, rotation=45)
    ax.set_yticks(range(len(row_labels)))
    ax.set_yticklabels(row_labels)
    ax.set_title(title)

    if cell_format is not None:
        for i in range(len(row_labels)):
            for j in range(len(col_labels)):
                ax.text(j, i, cell_format.format(values[i, j]), ha='center', va='center')

    plt.colorbar(image, ax=ax)


comp_names = list(compressors.keys())[1:]  # Skip 'none'
data_names = list(datasets.keys())

ratios = _metric_matrix(all_results, data_names, comp_names, 'ratio')
write_times = _metric_matrix(all_results, data_names, comp_names, 'write_time')
read_times = _metric_matrix(all_results, data_names, comp_names, 'read_time')
sizes = _metric_matrix(all_results, data_names, comp_names, 'size', divisor=1024**2)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axes, values, title, colormap, per-cell annotation template)
panels = [
    (axes[0, 0], ratios, 'Compression Ratios', 'Reds', '{:.1f}x'),
    (axes[0, 1], write_times, 'Write Times (seconds)', 'Blues', None),
    (axes[1, 0], read_times, 'Read Times (seconds)', 'Greens', None),
    (axes[1, 1], sizes, 'Storage Size (MB)', 'Purples', None),
]
for ax, values, title, cmap, cell_format in panels:
    _draw_heatmap(ax, values, title, cmap, data_names, comp_names, cell_format)

plt.tight_layout()
plt.show()

## 🧩 Chunking Strategies

In [None]:
import numpy as np, zarr, shutil, time, numcodecs
from pathlib import Path

# Output folder for the chunking stores (created if it does not exist yet)
data_dir = Path('./zarr_data')
data_dir.mkdir(exist_ok=True)

# 2000x2000 float32 noise array written once per chunking strategy
large_data = np.random.standard_normal((2000, 2000)).astype(np.float32)

# Chunk shapes to compare, keyed by strategy label
chunk_strategies = dict(
    small=(50, 50),
    medium=(200, 200),
    large=(500, 500),
    row=(1, 2000),
    col=(2000, 1),
    auto=None,  # Let zarr decide automatically
)

def test_chunking(data, chunk_size, name):
    """Store `data` with one chunk shape and time the write and two reads.

    The store `<data_dir>/chunking_<name>.zarr` is recreated from scratch,
    compressed with Blosc/LZ4 (level 5) in Zarr format 2. One summary line is
    printed.

    Args:
        data: NumPy array to store.
        chunk_size: Chunk shape passed to `zarr.open` as `chunks`.
        name: Strategy label, used in the store name.

    Returns:
        dict with keys 'chunks' (the chunk shape reported by the array),
        'write_time', 'partial_read_time', 'full_read_time' (seconds) and
        'size' (bytes on disk).
    """
    target = data_dir / f'chunking_{name}.zarr'
    if target.exists():
        shutil.rmtree(target)

    # Write: array creation plus the full assignment
    tick = time.perf_counter()
    arr = zarr.open(
        str(target),
        mode='w',
        shape=data.shape,
        dtype=data.dtype,
        chunks=chunk_size,
        compressor=numcodecs.Blosc(cname='lz4', clevel=5),
        zarr_format=2,   # force the v2 format so that `compressor=` is accepted
    )
    arr[:] = data
    write_time = time.perf_counter() - tick

    # Partial read: the 400x400 window at rows/columns 800..1199
    tick = time.perf_counter()
    arr[800:1200, 800:1200]
    partial_read_time = time.perf_counter() - tick

    # Full read
    tick = time.perf_counter()
    arr[:]
    full_read_time = time.perf_counter() - tick

    # Bytes on disk: every regular file below the store directory
    size = 0
    for entry in target.rglob('*'):
        if entry.is_file():
            size += entry.stat().st_size

    columns = [
        f"Write {write_time:.3f}s",
        f"Partial {partial_read_time:.3f}s",
        f"Full {full_read_time:.3f}s",
        f"Size {size/1024**2:.1f}MB",
    ]
    print(f"Chunks {str(chunk_size):12s}: " + " | ".join(columns))

    return {
        'chunks': arr.chunks,
        'write_time': write_time,
        'partial_read_time': partial_read_time,
        'full_read_time': full_read_time,
        'size': size,
    }

# Report header, then one benchmark row per chunking strategy
large_data_mb = large_data.nbytes / 1024**2
print(f"Testing chunking on {large_data.shape} array ({large_data_mb:.1f} MB):")
print("Strategy        : Write Time | Partial Read | Full Read | Storage")

chunking_results = {}
for name in chunk_strategies:
    chunks = chunk_strategies[name]
    chunking_results[name] = test_chunking(large_data, chunks, name)


## 🔄 Advanced Features

In [None]:
# Multi-dimensional arrays and groups
print("Creating multi-dimensional dataset...")

# Create a 4D dataset (time, z, y, x) - like climate data
time_steps = 100
z_levels = 20
y_size = 200
x_size = 200
grid_shape = (time_steps, z_levels, y_size, x_size)

# Simulate climate data.
# The NumPy generators below return float64. Each field is cast to float32
# right away, which is the dtype the fields are stored with, so the dtype and
# size reported here match the data and the in-memory footprint is halved.
# The draws happen in the order temperature -> pressure -> humidity.
np.random.seed(42)
seasonal_cycle = 10 * np.sin(np.linspace(0, 4*np.pi, time_steps))[:, None, None, None]
temperature = (15 + seasonal_cycle + np.random.randn(*grid_shape) * 2).astype(np.float32)
pressure = (1013 + np.random.randn(*grid_shape) * 10).astype(np.float32)
humidity = (np.random.beta(2, 2, grid_shape) * 100).astype(np.float32)

total_bytes = temperature.nbytes + pressure.nbytes + humidity.nbytes
print(f"Generated climate data: {temperature.shape} {temperature.dtype}")
print(f"Total size: {total_bytes/1024**3:.2f} GB")

In [None]:
# Create Zarr group (like HDF5 groups)
group_path = data_dir / 'climate_data.zarr'
if group_path.exists():
    shutil.rmtree(group_path)

start_time = time.perf_counter()

# Create root group. Zarr format 2 is requested explicitly, as in the
# benchmark cells, because the numcodecs compressor used below is passed
# the same way those cells pass theirs.
root = zarr.open_group(str(group_path), mode='w', zarr_format=2)

# Add metadata
root.attrs['title'] = 'Climate Simulation Data'
root.attrs['created'] = time.strftime('%Y-%m-%d %H:%M:%S')
root.attrs['description'] = '4D climate data with time, z-level, lat, lon dimensions'

# Create subgroups
atmos = root.create_group('atmosphere')
coords = root.create_group('coordinates')

# Main fields: name -> (values, units, long_name)
fields = {
    'temperature': (temperature, 'degrees_celsius', 'Air Temperature'),
    'pressure': (pressure, 'hPa', 'Air Pressure'),
    'humidity': (humidity, 'percent', 'Relative Humidity'),
}
field_chunks = (10, 5, 50, 50)  # Time-friendly chunks
stored_dtype = np.dtype(np.float32)

# Store every field as float32 with the same chunking and compressor, then
# attach its attributes. `create_array` + `numcodecs.Blosc` replace the
# `create_dataset(..., compressor=zarr.Blosc(...))` spelling, which is not
# available in Zarr-Python 3.
field_arrays = {}
for field_name, (values, units, long_name) in fields.items():
    arr = atmos.create_array(
        field_name,
        shape=values.shape,
        dtype=stored_dtype,
        chunks=field_chunks,
        compressors=numcodecs.Blosc(cname='zstd', clevel=3),
    )
    arr[:] = values.astype(stored_dtype, copy=False)
    arr.attrs['units'] = units
    arr.attrs['long_name'] = long_name
    field_arrays[field_name] = arr

temp_array = field_arrays['temperature']
pres_array = field_arrays['pressure']
humid_array = field_arrays['humidity']

# Coordinate arrays: name -> (values, chunk length)
coordinate_values = {
    'time': (np.arange(time_steps), 50),
    'z_level': (np.linspace(0, 20000, z_levels), 20),  # meters
    'latitude': (np.linspace(-90, 90, y_size), 50),
    'longitude': (np.linspace(-180, 180, x_size), 50),
}
for coord_name, (values, chunk_len) in coordinate_values.items():
    coord = coords.create_array(
        coord_name, shape=values.shape, dtype=values.dtype, chunks=(chunk_len,)
    )
    coord[:] = values

write_time = time.perf_counter() - start_time
total_size = sum(f.stat().st_size for f in group_path.rglob('*') if f.is_file())
# Uncompressed size of what was stored (float32), so the ratio below is not
# inflated when the source arrays are held in a wider dtype.
original_size = sum(values.size for values, _, _ in fields.values()) * stored_dtype.itemsize

print(f"\nClimate dataset created in {write_time:.2f}s:")
print(f"  Original size: {original_size/1024**3:.2f} GB")
print(f"  Compressed size: {total_size/1024**2:.1f} MB")
print(f"  Compression ratio: {original_size/total_size:.1f}x")
try:
    print(f"  Structure: {root.tree()}")
except ImportError as exc:
    # the tree view depends on an optional package; keep the cell usable without it
    print(f"  Structure: (tree view unavailable: {exc})")

In [None]:
# Demonstrate efficient data access patterns
print("Testing access patterns:")


def _timed_read(array, selection):
    """Read `array[selection]` and return (values, elapsed_seconds).

    Uses `time.perf_counter`, like the benchmark helpers in this notebook,
    because these reads are short and `time.time()` is not a high-resolution
    monotonic clock.
    """
    t0 = time.perf_counter()
    values = array[selection]
    return values, time.perf_counter() - t0


# Reload the data
root = zarr.open_group(str(group_path), mode='r')
temp_data = root['atmosphere/temperature']

# Time series at specific location: all time, one z-level, one location
time_series, time_access = _timed_read(temp_data, np.s_[:, 10, 100, 100])

# Spatial slice at specific time: one time, one z-level, all locations
spatial_slice, spatial_access = _timed_read(temp_data, np.s_[50, 10, :, :])

# Vertical profile at specific time/location: one time, all z-levels, one location
profile, profile_access = _timed_read(temp_data, np.s_[50, :, 100, 100])

print(f"  Time series (100 points): {time_access:.3f}s")
print(f"  Spatial slice (200x200): {spatial_access:.3f}s")
print(f"  Vertical profile (20 levels): {profile_access:.3f}s")

# Visualize different access patterns
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

axes[0].plot(time_series)
axes[0].set_title('Time Series at Location (100,100,10)')
axes[0].set_xlabel('Time Step')
axes[0].set_ylabel('Temperature (°C)')

im = axes[1].imshow(spatial_slice, cmap='RdYlBu_r')
axes[1].set_title('Spatial Slice at Time=50, Z=10')
plt.colorbar(im, ax=axes[1], label='Temperature (°C)')

# profile values on x, level index on y (y axis inverted)
axes[2].plot(profile, range(len(profile)))
axes[2].set_title('Vertical Profile at (100,100,t=50)')
axes[2].set_xlabel('Temperature (°C)')
axes[2].set_ylabel('Z Level')
axes[2].invert_yaxis()

plt.tight_layout()
plt.show()

## 💾 Disk Usage Analysis

In [None]:
# Analyze disk usage of all created files
def analyze_directory(path):
    """Return (total_bytes, file_count) for the regular files under `path`.

    The tree is walked recursively with `Path.rglob('*')`; entries that are
    not files are ignored. A `path` that does not exist yields (0, 0).
    """
    if not path.exists():
        return 0, 0

    file_sizes = [entry.stat().st_size for entry in path.rglob('*') if entry.is_file()]
    return sum(file_sizes), len(file_sizes)

print("Disk Usage Analysis:")
print("=" * 50)

# Running total over the compression, chunking and climate stores
total_zarr_size = 0

# Analyze individual compression tests: one store per (dataset, compressor)
for data_name, dataset in datasets.items():
    print(f"\n{data_name.title()} Data Compression:")

    for comp_name in compressors:
        store_path = data_dir / f'{data_name}_{comp_name}.zarr'
        size, files = analyze_directory(store_path)
        total_zarr_size += size

        # stores that are missing or empty are skipped in the report
        if size > 0:
            ratio = dataset.nbytes / size
            print(f"  {comp_name:12s}: {size/1024**2:6.1f} MB ({files:3d} files) {ratio:5.1f}x compression")

# Analyze chunking tests: one store per strategy
print("\nChunking Strategy Analysis:")
for strategy in chunk_strategies:
    store_path = data_dir / f'chunking_{strategy}.zarr'
    size, files = analyze_directory(store_path)
    total_zarr_size += size

    if size > 0:
        print(f"  {strategy:10s}: {size/1024**2:6.1f} MB ({files:3d} files)")

# Analyze climate dataset
climate_size, climate_files = analyze_directory(group_path)
total_zarr_size += climate_size
print(f"\nClimate Dataset: {climate_size/1024**2:.1f} MB ({climate_files} files)")

print(f"\nTotal Zarr Storage: {total_zarr_size/1024**2:.1f} MB")
# analyze_directory returns (total_bytes, file_count); the count covers
# every file currently under data_dir
total_files = analyze_directory(data_dir)[1]
print(f"Total Files Created: {total_files}")

## ⚡ Performance vs NumPy

In [None]:
# Compare Zarr vs NumPy for large array operations
print("Performance Comparison: Zarr vs NumPy")
print("=" * 40)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or inf when the denominator is zero."""
    return numerator / denominator if denominator else float('inf')


# Create test data
test_size = (2000, 2000)
test_data = np.random.randn(*test_size).astype(np.float32)

# NumPy save/load
numpy_file = data_dir / 'test_numpy.npy'
start_time = time.perf_counter()
np.save(str(numpy_file), test_data)
numpy_write_time = time.perf_counter() - start_time

start_time = time.perf_counter()
numpy_loaded = np.load(str(numpy_file))
numpy_read_time = time.perf_counter() - start_time

numpy_size = numpy_file.stat().st_size

# Zarr save/load
zarr_file = data_dir / 'test_zarr.zarr'
if zarr_file.exists():
    shutil.rmtree(zarr_file)

start_time = time.perf_counter()
z = zarr.open(
    str(zarr_file), mode='w', shape=test_data.shape, dtype=test_data.dtype,
    chunks=(200, 200),
    compressor=numcodecs.Blosc(cname='lz4', clevel=5),
    zarr_format=2,   # force the v2 format so that `compressor=` is accepted
)
z[:] = test_data
zarr_write_time = time.perf_counter() - start_time

start_time = time.perf_counter()
zarr_loaded = z[:]
zarr_read_time = time.perf_counter() - start_time

zarr_size = sum(f.stat().st_size for f in zarr_file.rglob('*') if f.is_file())

# Partial read test (central 400x400).
# Both sides read from their file: NumPy through a memory map of the .npy
# file, Zarr through the store. Slicing the already loaded `numpy_loaded`
# array would only time the creation of a view, not a read.
start_time = time.perf_counter()
numpy_partial = np.array(np.load(str(numpy_file), mmap_mode='r')[800:1200, 800:1200])
numpy_partial_time = time.perf_counter() - start_time

start_time = time.perf_counter()
zarr_partial = z[800:1200, 800:1200]
zarr_partial_time = time.perf_counter() - start_time

# Results. Every ratio is NumPy / Zarr, so a value above 1 means Zarr was
# faster (times) or smaller (storage).
write_speedup = _ratio(numpy_write_time, zarr_write_time)
read_speedup = _ratio(numpy_read_time, zarr_read_time)
partial_speedup = _ratio(numpy_partial_time, zarr_partial_time)
storage_reduction = _ratio(numpy_size, zarr_size)

print(f"Array size: {test_size} ({test_data.nbytes/1024**2:.1f} MB)\n")

print(f"{'Metric':20s} {'NumPy':>10s} {'Zarr':>10s} {'Speedup':>10s}")
print("-" * 60)
print(f"{'Write time (s)':20s} {numpy_write_time:>10.3f} {zarr_write_time:>10.3f} {write_speedup:>10.2f}x")
print(f"{'Read time (s)':20s} {numpy_read_time:>10.3f} {zarr_read_time:>10.3f} {read_speedup:>10.2f}x")
print(f"{'Partial read (s)':20s} {numpy_partial_time:>10.3f} {zarr_partial_time:>10.3f} {partial_speedup:>10.2f}x")
print(f"{'Storage (MB)':20s} {numpy_size/1024**2:>10.1f} {zarr_size/1024**2:>10.1f} {storage_reduction:>10.1f}x")
print("(ratios are NumPy / Zarr: above 1 favours Zarr)")

print(f"\n🎯 Key Benefits:")
print(f"   • Storage reduction: {storage_reduction:.1f}x smaller")
print(f"   • Partial reads: {partial_speedup:.1f}x vs memory-mapped NumPy (chunked access)")
print(f"   • Flexible compression: Multiple algorithms available")

## 📋 Zarr Quick Reference

In [None]:
# Clean up test files (optional)
# When the notebook runs without a stdin channel (e.g. headless execution),
# input() raises instead of prompting; that case is treated as "no" so the
# cell still completes and the files are kept.
try:
    answer = input("Clean up test files? (y/n): ")
except (EOFError, NotImplementedError):
    answer = "n"
cleanup = answer.lower().startswith('y')

if cleanup:
    # the folder may already be gone if this cell is re-run
    if data_dir.exists():
        shutil.rmtree(data_dir)
    print("Test files cleaned up.")
else:
    print(f"Test files preserved in {data_dir.absolute()}")

# Printed cheat sheet; codec classes are spelled via numcodecs and arrays are
# written in Zarr format 2, matching the benchmark cells of this notebook.
reference = """
ZARR QUICK REFERENCE:

Basic Operations:
  import zarr, numcodecs
  z = zarr.open('data.zarr', mode='w', shape=(1000, 1000), dtype='f4')
  z[:] = data                    # Write data
  loaded = z[:]                  # Read all
  subset = z[100:200, 50:150]    # Read subset

Compression (pass as compressor=..., together with zarr_format=2):
  numcodecs.Blosc(cname='lz4', clevel=5)       # Fast compression
  numcodecs.Blosc(cname='zstd', clevel=3)      # Balanced
  numcodecs.Blosc(cname='blosclz', clevel=9)   # High compression
  numcodecs.Zlib(level=6)                      # Standard

Chunking:
  chunks=(100, 100)              # Fixed chunks
  chunks=True                    # Auto chunks
  chunks=(1, -1)                 # Row-wise chunks

Groups (like HDF5):
  root = zarr.open_group('data.zarr', mode='w')
  arr = root.create_array('array1', shape=data.shape, dtype=data.dtype)
  arr[:] = data
  group = root.create_group('subgroup')
  root.attrs['metadata'] = 'value'

Best Practices:
  • Choose chunks based on access patterns
  • Use appropriate data types (float32 vs float64)
  • Test different compressors for your data
  • Use groups for organizing related datasets
  • Add metadata with .attrs dictionary
"""

print(reference)
print("\n🚀 Zarr: Efficient storage for massive arrays! 💾")