# Nasa NEX with Kerchunk

In [65]:
import pandas as pd 
import xarray as xr 
import s3fs

import glob
import logging
from tempfile import TemporaryDirectory

import dask
import fsspec
import s3fs
import ujson
from distributed import Client
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr



In [60]:
## loading
df = pd.read_csv(
    "s3://carbonplan-climate-impacts/extreme-heat/v1.0/inputs/nex-gddp-cmip6-files.csv"
)
str_split_urls = df[" fileURL"].str.rsplit('/',expand=True) 
edf = pd.DataFrame()
edf['url'] = df[" fileURL"].replace({'https://nex-gddp-cmip6.s3.us-west-2.amazonaws.com/':'s3://nex-gddp-cmip6/'},regex=True)
edf['GCM'] = str_split_urls[4]
edf['scenario'] = str_split_urls[5]
edf['ensemble_member'] = str_split_urls[6]
edf['variable'] = str_split_urls[7]
# str strip to remove all whitespace
edf_obj = edf.select_dtypes(['object'])
edf[edf_obj.columns] = edf_obj.apply(lambda x: x.str.strip())



In [83]:
edf.query("GCM == 'ACCESS-CM2'  & scenario == 'historical' & ensemble_member == 'r1i1p1f1'")['variable'].unique()

array(['hurs', 'huss', 'pr', 'rlds', 'rsds', 'sfcWind', 'tasmax',
       'tasmin', 'tas'], dtype=object)

In [62]:
fs = s3fs.S3FileSystem(anon=True, default_fill_cache=False)
ds = xr.open_dataset(fs.open(edf['url'].iloc[0]),engine='h5netcdf')

In [63]:
ds

In [70]:
client = Client(n_workers=8, silence_logs=logging.ERROR)
client

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:52806
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52809'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52810'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52811'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52812'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52813'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52814'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52815'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:52816'
INFO:distributed.scheduler:Register wor

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:52806,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:52835,Total threads: 1
Dashboard: http://127.0.0.1:52841/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52809,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-g1g_i1or,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-g1g_i1or

0,1
Comm: tcp://127.0.0.1:52827,Total threads: 1
Dashboard: http://127.0.0.1:52829/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52810,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-41podwa7,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-41podwa7

0,1
Comm: tcp://127.0.0.1:52825,Total threads: 1
Dashboard: http://127.0.0.1:52826/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52811,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-j7whe3e4,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-j7whe3e4

0,1
Comm: tcp://127.0.0.1:52831,Total threads: 1
Dashboard: http://127.0.0.1:52833/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52812,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-yaogqhuf,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-yaogqhuf

0,1
Comm: tcp://127.0.0.1:52832,Total threads: 1
Dashboard: http://127.0.0.1:52837/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52813,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-vr8uj922,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-vr8uj922

0,1
Comm: tcp://127.0.0.1:52834,Total threads: 1
Dashboard: http://127.0.0.1:52838/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52814,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-a1qdk367,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-a1qdk367

0,1
Comm: tcp://127.0.0.1:52843,Total threads: 1
Dashboard: http://127.0.0.1:52844/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52815,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-i8aa5yct,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-i8aa5yct

0,1
Comm: tcp://127.0.0.1:52846,Total threads: 1
Dashboard: http://127.0.0.1:52847/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:52816,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-a6sbz_ps,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-a6sbz_ps


In [101]:
file_pattern = [edf.iloc[0]['url'], edf.iloc[-1]['url']]


In [102]:
file_pattern

['s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/hurs/hurs_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc',
 's3://nex-gddp-cmip6/NEX-GDDP-CMIP6/UKESM1-0-LL/ssp370/r1i1p1f2/tas/tas_day_UKESM1-0-LL_ssp370_r1i1p1f2_gn_2098.nc']

In [109]:
file_pattern

['s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/hurs/hurs_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc',
 's3://nex-gddp-cmip6/NEX-GDDP-CMIP6/UKESM1-0-LL/ssp370/r1i1p1f2/tas/tas_day_UKESM1-0-LL_ssp370_r1i1p1f2_gn_2098.nc']

In [104]:

fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True)
# This dictionary will be passed as kwargs to `fsspec`. For more details, check out the `foundations/kerchunk_basics` notebook.
so = dict(mode="rb", anon=True, default_fill_cache=False, default_cache_type="first")

# We are creating a temporary directory to store the .json reference files
# Alternately, you could write these to cloud storage.
td = TemporaryDirectory()
temp_dir = td.name
temp_dir

'/var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/tmpa74_8lgv'

In [110]:
# Use Kerchunk's `SingleHdf5ToZarr` method to create a `Kerchunk` index from a NetCDF file.
def generate_json_reference(fil, output_dir: str):
    with fs_read.open(fil, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, fil, inline_threshold=300)
        fname = fil.split("/")[-1].strip(".nc")
        outf = f"{output_dir}/{fname}.json"
        with open(outf, "wb") as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())
        return outf


# Generate Dask Delayed objects
tasks = [dask.delayed(generate_json_reference)(fil, temp_dir) for fil in file_pattern]

In [111]:
# Start parallel processing
import warnings

warnings.filterwarnings("ignore")
dask.compute(tasks)

(['/var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/tmpa74_8lgv/hurs_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.json',
  '/var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/tmpa74_8lgv/tas_day_UKESM1-0-LL_ssp370_r1i1p1f2_gn_2098.json'],)

In [112]:
output_files = glob.glob(f"{temp_dir}/*.json")

output_files

['/var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/tmpa74_8lgv/tas_day_UKESM1-0-LL_ssp370_r1i1p1f2_gn_2098.json',
 '/var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/tmpa74_8lgv/hurs_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.json']

In [113]:
# Create a list of reference json files
output_files = glob.glob(f"{temp_dir}/*.json")

# combine individual references into single consolidated reference
mzz = MultiZarrToZarr(
    output_files,
    concat_dims=["time"],
    identical_dims=["lat", "lon"],
)
# save translate reference in memory for later visualization
multi_kerchunk = mzz.translate()

# Write kerchunk .json record
output_fname = "references/ARG_combined.json"
with open(f"{output_fname}", "wb") as f:
    f.write(ujson.dumps(multi_kerchunk).encode())

In [114]:
# create an fsspec reference filesystem from the Kerchunk output
import fsspec

fs = fsspec.filesystem(
    "reference",
    fo="references/ARG_combined.json",
    remote_protocol="s3",
    remote_options={"anon": True},
    skip_instance_cache=True,
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [121]:
ds.isel(time=-1).tas.plot

<xarray.plot.accessor.DataArrayPlotAccessor at 0x2c2de0460>