In [1]:
import xarray as xr
import zarr
from pathlib import Path
from numcodecs import Blosc


In [2]:
def combine_zarrs_along_time(source_zarr_paths, target_zarr_dir, target_zarr_prefix, chunking=None):
    """
    Combine one or more Zarr archives along the time dimension and write a new Zarr.

    The output store is written inside ``target_zarr_dir`` and is named
    ``{target_zarr_prefix}_{start}-{end}.zarr``, where ``start`` and ``end`` are
    the first and last values of the ``time`` coordinate formatted as
    ``%Y%m%d%H%M``.

    Parameters
    ----------
    source_zarr_paths : list/tuple of sources, or a single str or Path
        Either a sequence of sources (each is passed to ``xr.open_zarr`` and the
        results are concatenated along ``time``), or a single path to one Zarr
        archive, which is opened as-is.
    target_zarr_dir : str or Path
        Directory in which the combined Zarr archive is created.
    target_zarr_prefix : str
        File-name prefix of the output archive; the time range is appended.
    chunking : dict, optional
        Chunk sizes for the output Zarr (e.g., {'time': 48, 'cell': 262144}).
        Defaults to {"time": 36, "cell": 48}.

    Raises
    ------
    ValueError
        If ``source_zarr_paths`` is an empty sequence, or is neither a
        sequence nor a single str/Path.
    """
    if isinstance(source_zarr_paths, (list, tuple)):
        # Fail early with a clear message instead of deep inside xr.concat.
        if not source_zarr_paths:
            raise ValueError("source_zarr_paths is empty; need at least one Zarr archive")
        print(f"Opening {len(source_zarr_paths)} Zarr archives...")
        datasets = [xr.open_zarr(z) for z in source_zarr_paths]
        print("Concatenating along time...")
        combined = xr.concat(datasets, dim="time")
    elif isinstance(source_zarr_paths, (str, Path)):
        # A single archive: nothing to concatenate, only rechunk and rewrite.
        combined = xr.open_zarr(source_zarr_paths)
    else:
        raise ValueError("input needs to be list of zarrs or a path to a zarr")

    # Rechunk the lazily-loaded data to the requested output layout.
    if chunking is None:
        chunking = {"time": 36, "cell": 48}
    combined = combined.chunk(chunking)

    # Label the output with the first and last time stamps of the combined data.
    stime = combined.time[0].dt.strftime("%Y%m%d%H%M")
    etime = combined.time[-1].dt.strftime("%Y%m%d%H%M")
    time_label = f"{stime.item()}-{etime.item()}"

    # Path(...) lets callers pass the directory as either a str or a Path.
    tgt_zarr = Path(target_zarr_dir) / f"{target_zarr_prefix}_{time_label}.zarr"

    encoding = {}
    for var in combined.data_vars:
        # .chunks holds one tuple of block sizes per dimension; the on-disk
        # chunk shape is the first block size along each dimension.
        var_chunks = tuple(
            sizes[0] if isinstance(sizes, tuple) else sizes
            for sizes in combined[var].chunks
        )
        encoding[var] = {
            "chunks": var_chunks,
            "compressor": Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE),
        }

    print(f"Writing combined dataset to {tgt_zarr} ...")
    combined.to_zarr(
        tgt_zarr,
        mode="w",
        zarr_format=2,
        consolidated=True,
        encoding=encoding,
        compute=True,
    )
    print("Done.")

# Example usage:
# source_zarrs = ["file1.zarr", "file2.zarr", "file3.zarr"]
# combine_zarrs_along_time(source_zarrs, Path("output_dir"), "combined", chunking={'time': 48, 'cell': 262144})

In [3]:

# src_loc = Path("/glade/derecho/scratch/digital-earths-hackathon/mpas_DYAMOND3/30min")
# src_zarr = sorted(src_loc.glob("DYAMOND_diag_30min_to_hp1.zarr"))

# tgt_zarr_loc = Path("/glade/campaign/cgd/cas/brianpm/hack25/rechunk")
# tgt_zarr_prefix = "DYAMOND_diag_30min_to_hp1"


# DYAMOND2, 15-minute diagnostics: a single source archive to rechunk.
src_loc = Path("/glade/derecho/scratch/digital-earths-hackathon/mpas_DYAMOND2/15min")
# src_zarr = sorted(src_loc.glob("DYAMOND2_diag_15min_to_hp1.zarr"))
src_zarr = src_loc.joinpath("DYAMOND2_diag_15min_to_hp1.zarr")

# Destination directory and file-name prefix for the rechunked output.
tgt_zarr_loc = Path("/glade/campaign/cgd/cas/brianpm/hack25/rechunk")
tgt_zarr_prefix = "DYAMOND2_diag_15min_to_hp1"

# Output chunk sizes per dimension.
ochunks = {'time': 96, 'cell': 48}
combine_zarrs_along_time(
    source_zarr_paths=src_zarr,
    target_zarr_dir=tgt_zarr_loc,
    target_zarr_prefix=tgt_zarr_prefix,
    chunking=ochunks,
)

ValueError: Failed to decode variable 'time': unable to decode time units 'hours since 2000-01-01 00:00:00' with "calendar 'time_calendar'". Try opening your dataset with decode_times=False or installing cftime if it is not installed.

In [None]:
# Check the input: open it without time decoding so the raw time values
# and their attributes can be inspected.
dsi = xr.open_zarr(src_zarr, decode_times=False)

In [10]:
# Override the time attributes in place so CF decoding can be retried: the
# decode error recorded for the rechunk call reports the stored calendar as
# 'time_calendar', which xarray could not decode.
# NOTE(review): that same error reports the stored units as
# 'hours since 2000-01-01 00:00:00'. Switching to 'seconds since ...' changes
# how the raw values are interpreted — confirm the raw values really are seconds.
dsi['time'].attrs['calendar'] = 'standard'
dsi['time'].attrs['units'] = 'seconds since 2000-01-01 00:00:00'

In [6]:
# Display the time variable of the input dataset to inspect its values and attributes.
dsi['time']

In [11]:
# Apply CF decoding to the dataset that was opened with decode_times=False.
# NOTE(review): the recorded output shows this still fails with a time-decoding
# ValueError even with calendar 'standard'. The message suggests checking that
# cftime is installed; the raw time values may also be worth inspecting.
# NOTE(review): rebinding `dsi` makes this cell non-idempotent if it succeeds.
dsi = xr.decode_cf(dsi)

ValueError: Failed to decode variable 'time': unable to decode time units 'seconds since 2000-01-01 00:00:00' with "calendar 'standard'". Try opening your dataset with decode_times=False or installing cftime if it is not installed.

In [None]:
# Check on the output:
# dso = xr.open_zarr("/glade/campaign/cgd/cas/brianpm/hack25/rechunk/DYAMOND_diag_30min_to_hp1_202001200000-202002132330.zarr")

In [13]:
# Load the 'cape' variable of the output dataset into memory as a read-back check.
# NOTE(review): `dso` is only assigned in the commented-out xr.open_zarr line of
# the preceding cell, so on a fresh kernel this cell raises NameError.
dso['cape'].compute()

In [None]:
# Display the time-range label used in the output file name.
# NOTE(review): in the visible notebook `time_label` is only assigned inside
# combine_zarrs_along_time, so this presumably relies on a variable left over
# from a cell that is no longer shown — confirm or define it explicitly.
time_label

'202001200000-202001240900'

In [21]:
# Directory that receives the rechunked Zarr outputs.
tgt_zarr = Path("/glade/campaign/cgd/cas/brianpm/hack25/rechunk")

# Write the combined dataset into that directory as a new Zarr (format 2) archive.
combined_ds.to_zarr(
    tgt_zarr.joinpath(f'DYAMOND_diag_3h_{time_label}_to_hp1.zarr'),
    zarr_format=2,
)

  combined_ds.to_zarr(tgt_zarr / f'DYAMOND_diag_3h_{time_label}_to_hp1.zarr')


<xarray.backends.zarr.ZarrStore at 0x7fce3febe170>

In [17]:
# Open all 2020 history files as one lazily-loaded dataset.
# NOTE(review): the recorded output for this cell is a ValueError ("Could not
# find any dimension coordinates to use to order the datasets for
# concatenation"). open_mfdataset's default combine="by_coords" needs dimension
# coordinates to order the files. If the sorted file names are already in time
# order, combine="nested" with an explicit concat_dim may work instead —
# TODO confirm the name of the time dimension in these files.
ds = xr.open_mfdataset("/glade/campaign/mmm/wmr/fjudt/projects/dyamond_2/3.75km/history.2020-*.nc")

ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation