This notebook creates a zarr dataset from all the netCDF files stored in the directory `'/storage/GOES/orthorectified/Fog2022_withtime/'`. The data are first written to an intermediate zarr store at the path `'/storage/GOES/orthorectified/Fog2022_withtime_tmp.zarr'`, which is then rechunked and saved as the final zarr dataset at the path `'/storage/GOES/orthorectified/Fog2022_withtime.zarr'`. The final zarr dataset is optimized for access along the time index, i.e. it makes it very quick to grab time series for individual pixels.

In [1]:
import os
import glob
import shutil
import xarray as xr
import zarr
from dask.distributed import Client, LocalCluster

In [2]:
# Input: folder containing the netCDF files that get combined.
fixed_image_folder = '/storage/GOES/orthorectified/Fog2022_withtime/'
# Intermediate store: first zarr copy of the data, written before rechunking.
tmp_zarr_output_path = '/storage/GOES/orthorectified/Fog2022_withtime_tmp.zarr'
# Final store: the dataset rechunked along the time dimension.
zarr_output_path = '/storage/GOES/orthorectified/Fog2022_withtime.zarr'

In [3]:
# Start a local Dask cluster and connect a client to it, so the work done in
# the following cells can be monitored on the Dask dashboard.
workers = 12   # number of worker processes
threads = 2    # threads per worker
port = ':8787'  # address the dashboard is served on
# NOTE(review): ip_addres is not used in this cell; presumably it is combined
# with `port` to form the dashboard URL -- confirm.
ip_addres = 'http://j-lundquist-3.ce.washington.edu'
cluster = LocalCluster(
    dashboard_address=port,
    n_workers=workers,
    threads_per_worker=threads,
)
client = Client(cluster)

In [4]:
# Collect every netCDF file in the input folder, sorted by path so the order
# of the list is deterministic.
nc_files = sorted(glob.glob(os.path.join(fixed_image_folder, '*.nc')))
# Fail early with a message naming the folder, instead of letting the
# multi-file open in a later cell fail on an empty file list.
if not nc_files:
    raise FileNotFoundError(
        "No '*.nc' files found in {!r}".format(fixed_image_folder)
    )

In [None]:
# Combine all the netCDF files into a single dataset, split into chunks of
# 500 steps along the 'time' dimension.
ds = xr.open_mfdataset(paths=nc_files, chunks=dict(time=500))

In [None]:
# Rechunk the 'HT' variable into chunks limited to 1e8 bytes, letting dask
# pick ('auto') the chunk length along each of its first three axes.
ds['HT'].data = ds['HT'].data.rechunk(
    chunks=dict.fromkeys(range(3), 'auto'),
    block_size_limit=1e8,
    balance=True,
)

In [None]:
# Write the rechunked dataset to a temporary zarr store on disk.
zarr_stack_tmp = tmp_zarr_output_path
# Remove any store left at this path first; a missing path is not an error.
shutil.rmtree(path=zarr_stack_tmp, ignore_errors=True)
ds.to_zarr(store=zarr_stack_tmp)

In [None]:
# Inspect the temporary zarr store: print its hierarchy and the info zarr
# reports for the 'HT' array.
# Open read-only: this cell only inspects the store, whereas zarr's default
# read/write mode can create a new, empty store if the path does not exist.
source_group = zarr.open(zarr_stack_tmp, mode='r')
source_array = source_group['HT']
print(source_group.tree())
print(source_array.info)
# Drop the references to the store once it has been inspected.
del source_group
del source_array

In [None]:
# Remove any existing store at the final output path before the time-rechunked
# dataset is written there; a missing path is not an error.
shutil.rmtree(path=zarr_output_path, ignore_errors=True)

In [None]:
# Compute a chunk layout for 'HT' in which axis 0 is one single chunk (-1)
# and dask picks ('auto') the chunk lengths of axes 1 and 2, with each chunk
# limited to 1e8 bytes. Only the resulting chunk sizes are used below.
arr = ds['HT'].data.rechunk(
    chunks={0: -1, 1: 'auto', 2: 'auto'},
    block_size_limit=1e8,
    balance=True,
)

# Length of the first chunk along each of the three axes.
t, y, x = (axis_chunks[0] for axis_chunks in arr.chunks[:3])

# Re-open the temporary zarr store with that layout; the three sizes are
# applied to the 'time', 'latitude' and 'longitude' dimensions respectively.
ds = xr.open_dataset(
    zarr_stack_tmp,
    engine='zarr',
    chunks=dict(time=t, latitude=y, longitude=x),
)

# Replace the encoding of 'HT' with one that holds only the new chunk shape.
ds['HT'].encoding = dict(chunks=(t, y, x))

# Write the final dataset.
ds.to_zarr(store=zarr_output_path)

In [None]:
# Inspect the final zarr store: print its hierarchy and the info zarr reports
# for the 'HT' array.
# Open read-only: this cell only inspects the store, whereas zarr's default
# read/write mode can create a new, empty store if the path does not exist.
source_group = zarr.open(zarr_output_path, mode='r')
source_array = source_group['HT']
print(source_group.tree())
print(source_array.info)
# Drop the references to the store once it has been inspected.
del source_group
del source_array