In [None]:
import tempfile
from dataclasses import dataclass

import dask
import icechunk
import icechunk as ic
import xarray as xr
from distributed import Client
from icechunk.distributed import merge_sessions

In [None]:
# Create a distributed Client with two workers; the later tile-writing cell
# passes it to dask.compute as the scheduler.
client = Client(n_workers=2)
# Bare last expression so the notebook renders the client's rich repr.
client

In [None]:
# Load the basin-mask tutorial dataset (it carries x/y coordinates), keep only
# the first Z level, and back the variables with Dask arrays.
ds = xr.tutorial.open_dataset('basin_mask')
ds = ds.isel(Z=0).drop_vars(['Z']).chunk()
# Wrap X into the -180/180 convention, then restore ascending X order.
ds = ds.assign_coords(X=(ds.coords['X'] + 180) % 360 - 180).sortby('X')
ds

In [None]:
# Chunking rule taken from the icechunk docs: each Dask chunk size must be an
# integer multiple of the Zarr chunk size.
#
# The Zarr chunk is sized to match one tile of the 2x2 split written in
# parallel further down (90 rows in Y by 180 columns in X), so that no two
# writers ever touch the same Zarr chunk.
# NOTE(review): assumes `basin` is laid out as (Y, X) = (180, 360) - confirm.

ZARR_CHUNK_SIZE = (90, 180)
DASK_CHUNK_SIZE = (180, 180)

In [None]:
# Use mkdtemp rather than `TemporaryDirectory().name`: the TemporaryDirectory
# object would be discarded as soon as `.name` is read, and its finalizer
# removes the directory again. mkdtemp creates a directory that stays on disk
# until it is removed explicitly.
repo_dir = tempfile.mkdtemp()
storage = icechunk.local_filesystem_storage(repo_dir)
repo = icechunk.Repository.create(storage)
session = repo.writable_session('main')

# Write only the template of the dataset (compute=False defers the chunked
# array data); the tiles are filled in later by region writes.
ds.to_zarr(
    session.store,
    compute=False,
    mode='w',
    encoding={'basin': {'chunks': ZARR_CHUNK_SIZE, 'fill_value': 0}},
    consolidated=False,
)
session.commit('create the template')

In [None]:
# weird way to specify a region, but just for a test
@dataclass
class Subset:
    """Rectangular tile described by start/end bounds on x and y plus an id.

    The bounds are stored as given and only packaged into ``slice`` objects by
    the ``xslice`` / ``yslice`` properties.
    """

    # Start and end bound along x.
    xstart: int
    xend: int
    # Start and end bound along y.
    ystart: int
    yend: int
    # Short label for the tile (e.g. 'ul'); used to build commit messages.
    region_id: str

    @property
    def xslice(self) -> slice:
        """Return ``slice(xstart, xend)``."""
        return slice(self.xstart, self.xend)

    @property
    def yslice(self) -> slice:
        """Return ``slice(ystart, yend)``."""
        return slice(self.ystart, self.yend)


# slice1 = Subset(0,180,)
# slice2 = Subset(180,360)
# inputs_slices = [slice1, slice2]

In [None]:
# Four non-overlapping quadrants: upper/lower split at y = 0, left/right split
# at x = 0. Each quadrant must be distinct because the tiles are written by
# separate tasks in parallel.
ul = Subset(xstart=-180, xend=0, ystart=0, yend=90, region_id='ul')
ur = Subset(xstart=0, xend=180, ystart=0, yend=90, region_id='ur')
ll = Subset(xstart=-180, xend=0, ystart=-90, yend=0, region_id='ll')
lr = Subset(xstart=0, xend=180, ystart=-90, yend=0, region_id='lr')
inputs_slices = [ul, ur, ll, lr]

In [None]:
@dask.delayed
def update_tile(session: ic.Session, ds: xr.Dataset, slice: Subset):
    """Write one tile of ``ds`` into the store behind ``session``.

    The function is wrapped in ``dask.delayed``, so calling it only builds a
    task; the body runs when that task is computed.

    Args:
        session: session whose ``store`` receives the region write.
        ds: dataset the tile is cut from, selected by label on ``X`` and ``Y``.
        slice: tile description supplying ``xslice`` and ``yslice``. The name
            shadows the builtin but is part of the call signature.

    Returns:
        ``session``, after the tile has been written to its store.
    """
    # Open questions carried over from the experiment:
    #   - how can we ensure idempotency?
    #   - how can we override that and force a rewrite?
    #   - can an icechunk geohash / quadkey / region id be used for this?
    tile_bounds = {'X': slice.xslice, 'Y': slice.yslice}
    tile = ds.sel(tile_bounds)

    # Reset the encoding on `basin` before the region write.
    tile['basin'].encoding = {}

    tile.to_zarr(session.store, region='auto', consolidated=False)
    # Idea: commit right here and tag the commit with a region id (think
    # quadkey id), e.g. session.commit(f'{slice.region_id}')

    return session

In [None]:
# Re-open the repository and start a fresh writable session on `main`.
repo = ic.Repository.open(storage)
session = repo.writable_session('main')

# Region ids are stored in commit messages as an underscore-joined list
# (e.g. 'ul_ur'). Collect them from every snapshot in the ancestry of `main`,
# not just the newest one, so tiles written by any earlier commit are skipped.
# Messages that are not region lists simply contribute tokens that match no id.
already_commited_regions = [
    region_id
    for snapshot in repo.ancestry(branch='main')
    for region_id in snapshot.message.split('_')
]

# just an example
# already_commited_regions = ['ul', 'ur', 'll']
uncommited = [subset for subset in inputs_slices if subset.region_id not in already_commited_regions]

# The session is shipped to the workers inside each task, so pickling has to be
# enabled while the tasks are built and computed.
with session.allow_pickling():
    tasks = [update_tile(session=session, ds=ds, slice=subset) for subset in uncommited]
    # we could persist or w/e here
    # One returned session per task; an empty tuple when nothing is pending.
    sessions = dask.compute(*tasks, scheduler=client)

In [None]:
# Build the commit message: the ids of the tiles written in this run, joined
# with underscores (the dispatch cell splits on '_' to read them back).
ids = list(tile.region_id for tile in uncommited)
commit_region_ids = str.join('_', ids)

In [None]:
# Merge the sessions returned by the workers into the coordinating session and
# commit once, using the region ids as the commit message.
# `merge_sessions` is imported in the notebook's top import cell.
if sessions:
    session = merge_sessions(session, *sessions)
    session.commit(commit_region_ids)
else:
    # Every tile was already committed, so no task ran and the session holds
    # no changes; skip the commit instead of attempting an empty one.
    print('No uncommitted regions; nothing to commit.')

In [None]:
# Re-open the repository and show every snapshot reachable from `main`.
repo = ic.Repository.open(storage)
[snapshot for snapshot in repo.ancestry(branch='main')]