In [None]:
import xarray as xr
import dask
import ujson
import s3fs
import fsspec
from glob import glob
import pathlib
import hvplot.xarray

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr

# Opening a dataset from the reference file raises a SerializationWarning for
# every variable; the cause has not been investigated yet (TODO).
# NOTE: with no category argument this filter hides *all* warnings, not only
# SerializationWarning.
import warnings
warnings.simplefilter("ignore")

## Start up a Dask Client to monitor Dask processing

In [None]:
from dask.distributed import Client
# Start a Dask distributed client with 4 workers.
client = Client(n_workers=4)
# Bare expression: the client's repr is shown as the cell output.
client

## Identify the S3 bucket

In [None]:
# S3 bucket whose .nc4 objects are globbed in the cells below.
bucket = 's3://npwbanalres'

## Connect to the S3 bucket and list files it contains

In [None]:
# Non-anonymous S3 filesystem handle; reused below for globbing and for opening files.
s3 = s3fs.S3FileSystem(anon=False)

In [None]:
# Every object directly under the bucket whose name ends in ".nc4".
urls_all = s3.glob(f'{bucket}/*.nc4')
urls_all

In [None]:
# Subset of the bucket's objects whose names end in "rain.nc4".
rain_urls = s3.glob(f'{bucket}/*rain.nc4')

## Create a Kerchunk reference file for each data asset (.nc4 file)

In [None]:
def gen_json(u):
    """Write a Kerchunk reference JSON for one HDF5/NetCDF4 object on S3.

    The object is opened through the notebook-level ``s3`` filesystem, scanned
    with ``SingleHdf5ToZarr`` and the resulting references are serialised to
    ``jsons/<object name>.json``.

    Parameters
    ----------
    u : str
        Path of the object as understood by ``s3.open``. The last
        ``/``-separated component is used as the output file stem.

    Returns
    -------
    str
        Path of the JSON reference file that was written.
    """
    so = dict(
        mode="rb",
        anon=False,
        default_fill_cache=False,
        default_cache_type="none",
    )
    out_path = pathlib.Path("jsons") / f"{u.split('/')[-1]}.json"
    # Do not rely on a later cell having created the output directory.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with s3.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        # translate() reads from the open file handle, so it must run inside
        # this ``with`` block.
        references = h5chunks.translate()

    # Open the output only after translation succeeded, so a failure never
    # leaves an empty/truncated .json behind for the later glob to pick up.
    with open(out_path, 'wb') as outf:
        outf.write(ujson.dumps(references).encode())

    return str(out_path)

**Create a `jsons` directory to write the Kerchunk files to**

In [None]:
# exist_ok=True keeps this cell re-runnable when ./jsons/ already exists.
pathlib.Path('./jsons/').mkdir(exist_ok=True)

**Use Dask to process multiple files in parallel**

In [None]:
# One delayed gen_json call per rain file; nothing runs until dask.compute below.
reference_files = [dask.delayed(gen_json)(rain_url) for rain_url in rain_urls]

# Execute all delayed calls and collect their return values into a tuple.
reference_files_compute = dask.compute(*reference_files)

**List kerchunk/json files**

In [None]:
# Sorted so the order of the local reference files is deterministic.
reference_list = sorted(glob('./jsons/*.json'))

---

## Working with Kerchunk files

### Read a single Kerchunk file

In [None]:
# Parse the first reference file in the sorted list into memory.
with open(reference_list[0]) as ref_file:
    reference = ujson.load(ref_file)

In [None]:
# Passed as ref_storage_args when the reference filesystems are created below.
s_opts = dict(skip_instance_cache=True)
# Passed as remote_options (for the 's3' remote protocol) below.
r_opts = dict(anon=False)

In [None]:
# "reference" filesystem built from the in-memory reference object loaded
# above; the remote side is addressed via the 's3' protocol with r_opts.
fs_single = fsspec.filesystem(
    "reference",
    fo=reference,
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)

In [None]:
%%time
# Mapper rooted at the top of the reference filesystem, handed to xarray's zarr engine.
m = fs_single.get_mapper("")
ds_single = xr.open_dataset(
    m,
    engine="zarr",
    backend_kwargs=dict(consolidated=False),
    chunks={},
)
ds_single

### Read multiple Kerchunk files

In [None]:
# NOTE: this rebinds rain_urls. Earlier it held the result of s3.glob on the
# bucket; from here on it holds local reference-JSON paths from reference_list.
rain_urls = [ref_path for ref_path in reference_list if 'rain.nc4' in ref_path]
rain_urls

In [None]:
%%time
# Open every reference JSON as its own dataset and collect them in ds_k.
# s_opts / r_opts are the option dicts defined earlier in the notebook; the
# former `s_opts = s_opts` / `r_opts = r_opts` lines were no-ops and are gone.
ds_k = []
for ref in rain_urls:
    fs = fsspec.filesystem(
        "reference",
        fo=ref,
        ref_storage_args=s_opts,
        remote_protocol='s3',
        remote_options=r_opts,
    )
    m = fs.get_mapper("")
    ds_k.append(
        xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated': False}, chunks={})
    )

In [None]:
%%time
# Concatenate the per-file datasets along the 'time' dimension.
ds_multi = xr.concat(ds_k, dim='time')
ds_multi

In [None]:
# Label-based selection on the 'time' coordinate between the two dates.
t_slice = ds_multi.sel(time=slice('2001-12-01', '2002-03-01'))
t_slice