# Comparison using kerchunk / datatree

In [12]:
from datatree import DataTree
from datatree import map_over_subtree
import xarray as xr
import pandas as pd
import dask
import xclim
import thermofeel as tf
import numpy as np 
from distributed import Client
from fsspec.implementations.reference import ReferenceFileSystem

from utils import wbgt, load_elev, adjust_pressure

In [13]:
# CSV listing one reference per dataset ID (consumed below as ID -> URL).
CATALOG_CSV = "s3://carbonplan-share/nasa-nex-reference/reference_catalog_nested.csv"
cat_df = pd.read_csv(CATALOG_CSV)
# Index by ID and transpose so every ID becomes a column; the first
# row-record of that transposed frame is then a plain {ID: value} dict.
id_as_columns = cat_df.set_index("ID").T
catalog = id_as_columns.to_dict("records")[0]

In [14]:
def load_ref_ds(url: str, gcm_scenario: str):
    """Open one kerchunk reference with xarray and label it.

    Parameters
    ----------
    url : str
        Location of the reference to open with the ``kerchunk`` engine.
    gcm_scenario : str
        GCM/scenario identifier; stored in the dataset's ``attrs`` so later
        pipeline steps can tell which combination the data belongs to.

    Returns
    -------
    The opened dataset, with ``attrs["gcm_scenario"]`` set.
    """
    # Options passed to fsspec.
    fsspec_options = {
        "remote_protocol": "s3",
        "target_protocol": "s3",
        "lazy": True,
        "skip_instance_cache": True,
    }
    # Options passed to xarray.
    xarray_options = {"chunks": {}}

    opened = xr.open_dataset(
        url,
        engine="kerchunk",
        storage_options=fsspec_options,
        open_dataset_options=xarray_options,
    )

    # Record the gcm/scenario combination for use further down the pipeline.
    opened.attrs['gcm_scenario'] = gcm_scenario
    return opened


# One delayed load_ref_ds call per catalog entry, keyed by the entry's ID
# (the ID doubles as the gcm_scenario label).
tasks = {
    entry_id: dask.delayed(load_ref_ds)(ref_url, entry_id)
    for entry_id, ref_url in catalog.items()
}

In [18]:
# Start a Dask distributed client with a fixed number of workers; the bare
# name on the last line renders the cluster summary as the cell output.
N_WORKERS = 8
client = Client(n_workers=N_WORKERS)
client


Perhaps you already have a cluster running?
Hosting the HTTP server on port 60474 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:60474/status,

0,1
Dashboard: http://127.0.0.1:60474/status,Workers: 8
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:60475,Workers: 8
Dashboard: http://127.0.0.1:60474/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:60504,Total threads: 1
Dashboard: http://127.0.0.1:60511/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60478,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-mvr6wnx6,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-mvr6wnx6

0,1
Comm: tcp://127.0.0.1:60494,Total threads: 1
Dashboard: http://127.0.0.1:60499/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60479,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-e58mq_6c,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-e58mq_6c

0,1
Comm: tcp://127.0.0.1:60495,Total threads: 1
Dashboard: http://127.0.0.1:60496/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60480,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-mwckvczy,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-mwckvczy

0,1
Comm: tcp://127.0.0.1:60498,Total threads: 1
Dashboard: http://127.0.0.1:60505/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60481,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-mhr6c6c6,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-mhr6c6c6

0,1
Comm: tcp://127.0.0.1:60501,Total threads: 1
Dashboard: http://127.0.0.1:60507/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60482,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-kfymum99,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-kfymum99

0,1
Comm: tcp://127.0.0.1:60502,Total threads: 1
Dashboard: http://127.0.0.1:60509/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60483,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-10rn07it,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-10rn07it

0,1
Comm: tcp://127.0.0.1:60514,Total threads: 1
Dashboard: http://127.0.0.1:60516/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60484,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-3vr0uky3,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-3vr0uky3

0,1
Comm: tcp://127.0.0.1:60503,Total threads: 1
Dashboard: http://127.0.0.1:60510/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:60485,
Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-jwcao3_6,Local directory: /var/folders/mb/7d7yq_4j2qgdfm_j3j4tsyl40000gn/T/dask-scratch-space/worker-jwcao3_6


In [15]:
# Execute every delayed load_ref_ds call in `tasks`. dask.compute returns a
# tuple with one entry per argument, so the computed dict is element 0.
catalog_computed = dask.compute(tasks)

In [16]:
# Element 0 of the compute result is the dict of opened datasets keyed by ID.
datasets_by_id = catalog_computed[0]
dt = DataTree.from_dict(datasets_by_id)
# Elevation data read by calc_wbgt for the pressure adjustment.
elev = load_elev()


In [22]:


def calc_wbgt(ds):
    """Set up the shaded WBGT calculation for one tree node and its Zarr write.

    Parameters
    ----------
    ds
        A tree node exposing ``to_dataset()``. The resulting dataset must
        contain ``tas``, ``tasmax`` and ``huss`` and carry a ``gcm_scenario``
        entry in ``attrs`` (``load_ref_ds`` sets it); that entry names the
        output store.

    Returns
    -------
    dask.delayed.Delayed
        The deferred write produced by ``to_zarr(..., compute=False)``.
        The array data is only written once this object is computed, e.g.
        with ``dask.compute``. Previously the Dataset itself was returned and
        the deferred write was dropped, so computing the results never wrote
        the WBGT data to the store.

    Notes
    -----
    Reads the module-level ``elev`` object, so ``elev = load_elev()`` has to
    run before this function is called.
    """
    ds = ds.to_dataset()
    # NOTE(review): only the first time step is kept — looks like a quick-test
    # subset; confirm before a full run.
    ds = ds.isel(time=slice(0, 1))

    # Calculate elevation-adjusted pressure. The apply_ufunc result carries
    # the variable name "elevation", which is renamed to "ps" before the
    # single variable is pulled out.
    ds["ps"] = xr.apply_ufunc(
        adjust_pressure, ds["tas"], elev, dask="allowed"
    ).rename({"elevation": "ps"})["ps"]
    ds["ps"].attrs["units"] = "Pa"

    ds["hurs"] = xclim.indices.relative_humidity(
        tas=ds["tasmax"], huss=ds["huss"], ps=ds["ps"]
    )
    # Drop tasmax metadata once the relative-humidity call no longer needs it.
    ds["tasmax"].attrs = {}

    # Windspeed assumption of 0.5 m/s (approximating shaded/indoor
    # conditions). `tas - tas` yields a zero field on the tas grid, keeping
    # NaN wherever tas is NaN, which is then shifted to the constant.
    ds["sfcWind"] = (ds["tas"] - ds["tas"]) + 0.5

    # The 273.15 offset presumably converts Kelvin to degrees Celsius for the
    # helpers that take Celsius input — TODO confirm against thermofeel.
    ds["WBT"] = tf.thermofeel.calculate_wbt(
        ds["tasmax"] - 273.15, ds["hurs"]
    )

    # tasmax is deliberately passed as both of the first two arguments.
    ds["BGT"] = tf.thermofeel.calculate_bgt(
        ds["tasmax"], ds["tasmax"], ds["sfcWind"]
    )
    ds["WBGT"] = wbgt(ds["WBT"], ds["BGT"], ds["tasmax"] - 273.15)
    ds["WBGT"].attrs["units"] = "degC"

    # Keep only the final product and let dask simplify the task graph.
    ds = ds[["WBGT"]]
    ds = dask.optimize(ds)[0]
    output = (
        f"s3://carbonplan-scratch/TEMP_NASA_NEX/wbgt-shade-"
        f"gridded/years/{ds.attrs['gcm_scenario']}.zarr"
    )

    # Hand the deferred write back to the caller so that computing it
    # actually stores the data, instead of discarding it.
    return ds.to_zarr(output, consolidated=True, compute=False, mode="w")

# Apply calc_wbgt to every leaf node of the tree, in order.
delay_ds_list = list(map(calc_wbgt, dt.leaves))
# delayed_datatree = calc_wbgt(dt)


In [10]:
# Trigger computation of everything collected in delay_ds_list; the returned
# tuple is displayed as the cell output.
dask.compute(delay_ds_list)

([None, None],)

In [None]:
# Timing for 1 year of data, historical + ssp (2 datasets):
# 12 min 22 s (local cluster, 8 workers, at home)

In [None]:

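# Alternative (not executed): the same calculation applied to every node of
# the tree via map_over_subtree, instead of looping over dt.leaves.
# @map_over_subtree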
# def calc_wbgt(ds):
#     ds = ds.isel(time=slice(0,1))
#     # calculate elevation-adjusted pressure
#     ds["ps"] = xr.apply_ufunc(
#         adjust_pressure, ds["tas"], elev, dask="allowed"
#     ).rename({"elevation": "ps"})["ps"]
#     ds["ps"].attrs["units"] = "Pa"
#     ds["hurs"] = xclim.indices.relative_humidity(
#         tas=ds["tasmax"], huss=ds["huss"], ps=ds["ps"]
#     )
#     ds["tasmax"].attrs = {}

#     # windspeed assumption of 0.5 m/s (approximating shaded/indoor
#     # conditions)
#     ds["sfcWind"] = (ds["tas"] - ds["tas"]) + 0.5
#     ds["WBT"] = tf.thermofeel.calculate_wbt(
#         ds["tasmax"] - 273.15, ds["hurs"]
#     )

#     ds["BGT"] = tf.thermofeel.calculate_bgt(
#         ds["tasmax"], ds["tasmax"], ds["sfcWind"]
#     )
#     ds["WBGT"] = wbgt(ds["WBT"], ds["BGT"], ds["tasmax"] - 273.15)
#     ds["WBGT"].attrs["units"] = "degC"
#     ds = ds[["WBGT"]]
#     ds = dask.optimize(ds)[0]
#     output = (
#         f"s3://carbonplan-scratch/TEMP_NASA_NEX/wbgt-shade-"
#         f"gridded/years/{ds.attrs['gcm_scenario']}.zarr"
#     )

    
#     # ds.to_zarr(output, consolidated=True, mode="w")
#     return ds

# delayed_datatree = calc_wbgt(dt)


In [None]:
# output = ("s3://carbonplan-scratch/TEMP_NASA_NEX/DATATREE_TEST.zarr")
# delayed_to_zarr = delayed_datatree.to_zarr(output, compute=False)



In [None]:
# delayed_obs.compute()

In [None]:
# Timing: 38 s for a single time slice across 10 GCMs