# Comparison using kerchunk / datatree

In [1]:
from datatree import DataTree
from datatree import map_over_subtree
import xarray as xr
import pandas as pd
import dask
import xclim
import thermofeel as tf
import numpy as np 
from distributed import Client
from fsspec.implementations.reference import ReferenceFileSystem
import kerchunk
import coiled
from utils import wbgt, load_elev, adjust_pressure

# TODO: only the SSP2-4.5 scenario is needed.
# The selection below cannot run in this cell: `dt` is only created later in the
# notebook (by the DataTree.from_dict call), so on a fresh kernel it raises
# NameError during "Restart & Run All". Run it after `dt` exists:
#     ds = dt['GISS-E2-1-G']['ssp245'].to_dataset()

In [None]:
# coiled.list_instance_types(backend="aws",arch="x86_64", cores=2, memory="2 Gib")



In [2]:
# Load the reference catalog and keep only its first two rows.
cat_df = pd.read_csv(
    "s3://carbonplan-share/nasa-nex-reference/reference_catalog_nested.csv"
).head(2)
# Build the {ID: value} lookup from the first column left after indexing by
# "ID"; downstream code treats each value as the reference URL.
catalog = cat_df.set_index("ID").iloc[:, 0].to_dict()

In [3]:
import coiled

# Name shared by the software environment and the cluster that must use it.
software_env_name = "kerchunk-nasa-nex"

# Register the container image as a Coiled software environment.
coiled.create_software_environment(
    name=software_env_name,
    container="quay.io/carbonplan/kerchunk-nasa-nex",
)
# Start the cluster on that environment. Previously the environment was created
# but never passed to the cluster, so scheduler and workers did not run the
# image registered above.
# NOTE(review): the client (local kernel) must still have package versions
# compatible with this image — confirm with client.get_versions().
cluster = coiled.Cluster(
    n_workers=4,
    worker_vm_types=['t3.small'],
    software=software_env_name,
)
client = cluster.get_client()



Output()

Output()

In [4]:
def load_ref_ds(url: str, gcm_scenario: str):
    """Open one kerchunk reference as a dataset and tag it with its catalog ID.

    Parameters
    ----------
    url : str
        Location of the kerchunk reference, handed to ``xr.open_dataset``.
    gcm_scenario : str
        Catalog identifier; stored in ``ds.attrs["gcm_scenario"]`` so later
        steps can tell which entry the data came from.

    Returns
    -------
    The opened dataset when it contains ``hurs``, ``huss`` and ``tasmax``;
    otherwise ``None``.
    """
    # NOTE(review): calc_wbgt also reads "tas" — confirm whether it should be
    # part of this required set.
    required_vars = {"hurs", "huss", "tasmax"}

    # Options passed to fsspec.
    fsspec_kwargs = {
        "remote_protocol": "s3",
        "target_protocol": "s3",
        "lazy": True,
        "skip_instance_cache": True,
    }
    # Options passed to xarray.
    xarray_kwargs = {"chunks": {}}

    ds = xr.open_dataset(
        url,
        engine="kerchunk",
        storage_options=fsspec_kwargs,
        open_dataset_options=xarray_kwargs,
    )

    # Entries missing any required variable are skipped by returning None.
    if not required_vars.issubset(ds):
        return None

    # Record the gcm/scenario combo for later stages of the pipeline.
    ds.attrs["gcm_scenario"] = gcm_scenario
    return ds


# One delayed load per catalog entry, keyed by the catalog ID.
tasks = {
    gcm_scenario: dask.delayed(load_ref_ds)(ref_url, gcm_scenario)
    for gcm_scenario, ref_url in catalog.items()
}

In [5]:
# Run all delayed loads. dask.compute returns a tuple with one element per
# argument, so the {ID: dataset-or-None} dict is catalog_computed[0].
catalog_computed = dask.compute(tasks)

In [6]:
# load_ref_ds returns None for catalog entries that lack the required
# variables. Drop those before building the tree: a None entry carries no
# variables, so calc_wbgt could not process the resulting node.
loaded_datasets = {
    key: loaded
    for key, loaded in catalog_computed[0].items()
    if loaded is not None
}
dt = DataTree.from_dict(loaded_datasets)
# Elevation data used by calc_wbgt for the pressure adjustment.
elev = load_elev()


In [7]:
def calc_wbgt(ds):
    """Build a delayed zarr write of WBGT for one leaf of the data tree.

    Parameters
    ----------
    ds : tree node exposing ``to_dataset()``
        The resulting dataset must provide ``tas``, ``tasmax`` and ``huss``
        and carry a ``gcm_scenario`` attribute (set by ``load_ref_ds``).

    Returns
    -------
    The result of ``Dataset.to_zarr(..., compute=False)``: the write is only
    executed once the returned object is passed to ``dask.compute``.

    Notes
    -----
    Each dataset is written to its own store, named after ``gcm_scenario``.
    Previously every call targeted the same relative path ``'tmp.zarr'`` with
    ``mode="w"``, so the writes for different leaves overwrote one another and
    the computed ``output`` path was never used.
    """
    ds = ds.to_dataset()
    # Restrict to the first 365 time steps.
    ds = ds.isel(time=slice(0, 365))

    # calculate elevation-adjusted pressure
    ds["ps"] = xr.apply_ufunc(
        adjust_pressure, ds["tas"], elev, dask="allowed"
    ).rename({"elevation": "ps"})["ps"]
    ds["ps"].attrs["units"] = "Pa"
    # Recompute relative humidity from specific humidity and adjusted pressure;
    # this replaces any "hurs" already present in the dataset.
    ds["hurs"] = xclim.indices.relative_humidity(
        tas=ds["tasmax"], huss=ds["huss"], ps=ds["ps"]
    )
    ds["tasmax"].attrs = {}

    # windspeed assumption of 0.5 m/s (approximating shaded/indoor
    # conditions); built from "tas" so it has the same shape as the data
    ds["sfcWind"] = (ds["tas"] - ds["tas"]) + 0.5
    # presumably tasmax is in kelvin and is converted to degC here — TODO confirm
    ds["WBT"] = tf.thermofeel.calculate_wbt(
        ds["tasmax"] - 273.15, ds["hurs"]
    )

    # tasmax is passed for both temperature arguments
    ds["BGT"] = tf.thermofeel.calculate_bgt(
        ds["tasmax"], ds["tasmax"], ds["sfcWind"]
    )
    ds["WBGT"] = wbgt(ds["WBT"], ds["BGT"], ds["tasmax"] - 273.15)
    ds["WBGT"].attrs["units"] = "degC"

    # Keep only the final variable and optimize its task graph before writing.
    ds = ds[["WBGT"]]
    ds = dask.optimize(ds)[0]
    output = (
        f"s3://carbonplan-scratch/TEMP_NASA_NEX/wbgt-shade-"
        f"gridded/years/{ds.attrs['gcm_scenario']}.zarr"
    )

    return ds.to_zarr(output, consolidated=True, compute=False, mode="w")
    

    # ds.to_zarr('tmp.zarr', consolidated=True, mode="w")
# Build one delayed write per leaf of the tree, discarding any None result.
delay_ds_list = [
    delayed_write
    for delayed_write in map(calc_wbgt, dt.leaves)
    if delayed_write is not None
]

In [None]:
# Display the version information reported for the client, scheduler and
# workers, to check that their environments are consistent.
client.get_versions()

In [None]:
# Execute the delayed zarr writes built by calc_wbgt.
# NOTE(review): the recorded run of this cell failed with a task-graph
# deserialization error (client/scheduler environment mismatch).
dask.compute(delay_ds_list)

RuntimeError: Error during deserialization of the task graph. This frequently
occurs if the Scheduler and Client have different environments.
For more information, see
https://docs.dask.org/en/stable/deployment-considerations.html#consistent-software-environments


2023-10-26 17:33:16,375 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
Traceback (most recent call last):
  File "/Users/nrhagen/micromamba/envs/nasa-nex/lib/python3.10/site-packages/distributed/comm/tcp.py", line 490, in connect
    stream = await self.client.connect(
  File "/Users/nrhagen/micromamba/envs/nasa-nex/lib/python3.10/site-packages/tornado/tcpclient.py", line 279, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/nrhagen/micromamba/envs/nasa-nex/lib/python3.10/asyncio/tasks.py", line 456, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/nrhagen/micromamba/envs/nasa-nex/lib/python3.10/site-packages/di

In [None]:
# Timing for 1 year of data, historical + ssp (2 datasets):
# 12 min 22 s (local, 8 workers, home)

In [None]:

# @map_over_subtree
# def calc_wbgt(ds):
#     ds = ds.isel(time=slice(0,1))
#     # calculate elevation-adjusted pressure
#     ds["ps"] = xr.apply_ufunc(
#         adjust_pressure, ds["tas"], elev, dask="allowed"
#     ).rename({"elevation": "ps"})["ps"]
#     ds["ps"].attrs["units"] = "Pa"
#     ds["hurs"] = xclim.indices.relative_humidity(
#         tas=ds["tasmax"], huss=ds["huss"], ps=ds["ps"]
#     )
#     ds["tasmax"].attrs = {}

#     # windspeed assumption of 0.5 m/s (approximating shaded/indoor
#     # conditions)
#     ds["sfcWind"] = (ds["tas"] - ds["tas"]) + 0.5
#     ds["WBT"] = tf.thermofeel.calculate_wbt(
#         ds["tasmax"] - 273.15, ds["hurs"]
#     )

#     ds["BGT"] = tf.thermofeel.calculate_bgt(
#         ds["tasmax"], ds["tasmax"], ds["sfcWind"]
#     )
#     ds["WBGT"] = wbgt(ds["WBT"], ds["BGT"], ds["tasmax"] - 273.15)
#     ds["WBGT"].attrs["units"] = "degC"
#     ds = ds[["WBGT"]]
#     ds = dask.optimize(ds)[0]
#     output = (
#         f"s3://carbonplan-scratch/TEMP_NASA_NEX/wbgt-shade-"
#         f"gridded/years/{ds.attrs['gcm_scenario']}.zarr"
#     )

    
#     # ds.to_zarr(output, consolidated=True, mode="w")
#     return ds

# delayed_datatree = calc_wbgt(dt)


In [None]:
# output = ("s3://carbonplan-scratch/TEMP_NASA_NEX/DATATREE_TEST.zarr")
# delayed_to_zarr = delayed_datatree.to_zarr(output, compute=False)



In [None]:
# delayed_obs.compute()

In [None]:
# Timing: 38 s for a single time slice across 10 GCMs