In [None]:
import requests
import rasterio
import rioxarray
import xarray as xr
import cartopy.crs as ccrs
from pyproj import CRS
import ujson
import s3fs
import fsspec
from tqdm import tqdm
from glob import glob
import os
import pathlib
import hvplot.xarray

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr

# Datasets opened from the kerchunk reference files emit a SerializationWarning
# for every variable (cause still to be investigated). Silence only that
# category so that unrelated warnings remain visible.
import warnings
warnings.filterwarnings("ignore", category=xr.SerializationWarning)

In [None]:
import dask
from dask.distributed import Client
# Start a Dask distributed client with four workers and show its summary.
N_WORKERS = 4
client = Client(n_workers=N_WORKERS)
client

In [None]:
# S3 location that is globbed for `.nc4` files in the following cells.
BUCKET_NAME = 'npwbanalres'
bucket = f's3://{BUCKET_NAME}'

In [None]:
# Filesystem handle used for every S3 listing and read in this notebook.
# `anon=False` requests non-anonymous (credentialed) access.
s3_options = {'anon': False}
s3 = s3fs.S3FileSystem(**s3_options)

In [None]:
# Show the bucket's `.nc4` objects.
# Alternative kept for reference: s3.ls(bucket)
nc4_pattern = f'{bucket}/*.nc4'
s3.glob(nc4_pattern)

In [None]:
# Keep only the objects whose name ends in `rain.nc4`.
rain_pattern = f'{bucket}/*rain.nc4'
rain_urls = s3.glob(rain_pattern)

In [None]:
# Use the entry at index 1 of the rain listing as the first sample file and echo it.
url = rain_urls[1]
url

In [None]:
# Binary, read-only handle on the first sample object. It is left open on
# purpose: the dataset created from it in the next cell reads through it.
s3_file_obj = s3.open(url, 'rb')

In [None]:
# Open the first sample file with the h5netcdf engine and automatic chunking.
xr_ds = xr.open_dataset(
    s3_file_obj,
    engine='h5netcdf',
    chunks='auto',
)

In [None]:
# Display the dataset summary as the cell output.
xr_ds

In [None]:
# Plot the first `rain` slice (index 0 of the leading dimension), keeping only
# the cells whose value is greater than zero.
first_rain = xr_ds.rain[0]
first_rain.where(first_rain > 0, drop=True).plot()

In [None]:
# Inspect the attributes stored on the `lambert_conformal_conic` variable.
xr_ds['lambert_conformal_conic'].attrs

In [None]:
# Build a pyproj CRS from the CF attributes of the grid-mapping variable.
grid_mapping_attrs = xr_ds['lambert_conformal_conic'].attrs
crs = CRS.from_cf(grid_mapping_attrs)

In [None]:
# Show the CRS definition as a plain dictionary.
crs.to_dict()

In [None]:
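# Disabled variant of the hvplot call in the next cell: it additionally passes `crs` and an ESRI tile background.
#xr_ds.rain[0].where(xr_ds.rain[0] > 0, drop=True).hvplot.image(x='x', y='y', crs=crs, tiles="ESRI", clim=(0,10))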

In [None]:
# Interactive image of the first `rain` slice, restricted to values above zero
# and with the colour range fixed to 0-10.
rain_0 = xr_ds.rain[0]
positive_rain = rain_0.where(rain_0 > 0, drop=True)
positive_rain.hvplot.image(x='x', y='y', clim=(0, 10))

In [None]:
# Use the entry at index 2 of the rain listing as the second sample file and echo it.
url2 = rain_urls[2]
url2

In [None]:
# Binary, read-only handle on the second sample object; kept open for the
# dataset that is created from it in the next cell.
s3_file_obj_2 = s3.open(url2, 'rb')

In [None]:
# Open the second sample file with the same settings as the first one.
xr_ds_2 = xr.open_dataset(
    s3_file_obj_2,
    engine='h5netcdf',
    chunks='auto',
)

In [None]:
# Display the `rain` variable of the second sample dataset.
xr_ds_2['rain']

In [None]:
# Join the two sample datasets along `time`; the result is only displayed,
# not stored.
sample_pair = [xr_ds, xr_ds_2]
xr.concat(sample_pair, dim='time')

In [None]:
def gen_json(u, out_dir="jsons"):
    """Write a kerchunk reference JSON for one HDF5/NetCDF-4 object on S3.

    The object is opened through the notebook-level ``s3`` filesystem, scanned
    with ``SingleHdf5ToZarr`` and the translated references are saved as
    ``<out_dir>/<last path component of u>.json``.

    Parameters
    ----------
    u : str
        Path of the object as accepted by ``s3.open`` (for example an entry
        returned by ``s3.glob``). The same string is handed to
        ``SingleHdf5ToZarr`` as the URL recorded in the references.
    out_dir : str, optional
        Directory that receives the JSON file; it is created when missing.
        Defaults to ``"jsons"``, the location that used to be hard-coded.

    Returns
    -------
    str
        Path of the JSON file that was written.
    """
    # The cache settings have to use the names ``s3.open`` understands
    # (``fill_cache`` / ``cache_type``). ``anon`` and the ``default_*``
    # spellings are S3FileSystem constructor options and do not configure the
    # read cache when passed to ``open``; credentials come from ``s3`` itself.
    so = dict(mode="rb", fill_cache=False, cache_type="none")

    # exist_ok keeps this safe when several workers create the directory at once.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{u.split('/')[-1]}.json")

    with s3.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        # Translate while the remote file is still open.
        refs = h5chunks.translate()

    # Write only after a successful translation so that a failure does not
    # leave an empty JSON behind for later globbing to pick up.
    with open(out_path, 'wb') as outf:
        outf.write(ujson.dumps(refs).encode())
    return out_path

In [None]:
# Make sure the output folder for the reference JSON files exists.
json_dir = pathlib.Path('./jsons/')
json_dir.mkdir(exist_ok=True)

In [None]:
# Build one delayed `gen_json` task per rain file. A comprehension keeps the
# loop variable local, so the notebook-level `url` picked earlier as the sample
# file is no longer overwritten by this cell.
reference_files = [dask.delayed(gen_json)(rain_url) for rain_url in rain_urls]

# Run all tasks; `dask.compute` returns their results as a tuple.
reference_files_compute = dask.compute(*reference_files)

In [None]:
# Collect every JSON file found in ./jsons, in sorted order, and show the list.
json_pattern = './jsons/*.json'
reference_list = glob(json_pattern)
reference_list.sort()
reference_list

In [None]:
# Parse the first reference file into an in-memory dictionary.
first_reference_path = reference_list[0]
with open(first_reference_path) as ref_file:
    reference = ujson.loads(ref_file.read())

In [None]:
# Options reused by the reference filesystems created below:
#   s_opts is passed as `ref_storage_args` (how the reference JSON is opened),
#   r_opts is passed as `remote_options` (how the referenced S3 data is accessed).
s_opts = dict(skip_instance_cache=True)
r_opts = dict(anon=False)

In [None]:
# Reference filesystem backed by the already-parsed `reference` dictionary;
# the chunks it points to are fetched over the s3 protocol.
reference_fs_kwargs = dict(
    fo=reference,
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
fs_single = fsspec.filesystem("reference", **reference_fs_kwargs)

In [None]:
# Expose the reference filesystem as a key/value mapper and open it through
# xarray's zarr engine (no consolidated metadata is available).
m = fs_single.get_mapper("")
zarr_backend_kwargs = {'consolidated': False}
ds_single = xr.open_dataset(
    m,
    engine="zarr",
    chunks={},
    backend_kwargs=zarr_backend_kwargs,
)
ds_single

In [None]:
# Open each per-file reference JSON through xarray's zarr engine and join the
# resulting datasets along `time`. `s_opts` and `r_opts` come from the options
# cell above and are used as-is.
ds_k = []
for ref in reference_list:
    fs = fsspec.filesystem("reference",
                           fo=ref,
                           ref_storage_args=s_opts,
                           remote_protocol='s3',
                           remote_options=r_opts)
    m = fs.get_mapper("")
    ds_k.append(xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated': False}, chunks={}))

ds_multi = xr.concat(ds_k, dim='time')
ds_multi

In [None]:
# Select the part of the combined dataset between 2001-12-01 and 2002-03-01
# by label along `time`, then display it.
time_window = slice('2001-12-01', '2002-03-01')
t_slice = ds_multi.sel(time=time_window)
t_slice