# Comparison using open_mfdataset

In [None]:
import os
import pandas as pd
import numpy as np
import xarray as xr
import s3fs
import coiled
from utils import load_elev, generate_WBGT

# Names of the global climate models (GCMs) considered. Each name is used
# verbatim as the `{gcm}` component of the NEX-GDDP-CMIP6 paths built in
# `load_nasanex`.
# NOTE: `gcm_list` is reassigned further down to a 10-model subset used for
# timing, so this full list is not what the generation loop iterates over.
gcm_list = [
    "ACCESS-CM2",
    "ACCESS-ESM1-5",
    "BCC-CSM2-MR",
    "CanESM5",
    "CMCC-CM2-SR5",
    "CMCC-ESM2",
    "CNRM-CM6-1",
    "CNRM-ESM2-1",
    "EC-Earth3-Veg-LR",
    "EC-Earth3",
    "FGOALS-g3",
    "GFDL-CM4",
    "GFDL-ESM4",
    "GISS-E2-1-G",
    "HadGEM3-GC31-LL",
    "INM-CM4-8",
    "INM-CM5-0",
    "KACE-1-0-G",
    "KIOST-ESM",
    "MIROC-ES2L",
    "MPI-ESM1-2-HR",
    "MPI-ESM1-2-LR",
    "MRI-ESM2-0",
    "NorESM2-LM",
    "NorESM2-MM",
    "UKESM1-0-LL",
]

# Models flagged, per the variable name, as using a non-standard calendar.
# Every entry also appears in the full `gcm_list` defined at the top of the
# notebook. This list is not referenced again in the visible code.
gcms_with_nonstandard_calendars_list = [
    "BCC-CSM2-MR",
    "CanESM5",
    "CMCC-CM2-SR5",
    "CMCC-ESM2",
    "FGOALS-g3",
    "GFDL-CM4",
    "GFDL-ESM4",
    "GISS-E2-1-G",
    "HadGEM3-GC31-LL",
    "INM-CM4-8",
    "INM-CM5-0",
    "KACE-1-0-G",
    "KIOST-ESM",
    "NorESM2-LM",
    "NorESM2-MM",
    "UKESM1-0-LL",
]

# Presumably tells geopandas to use shapely instead of PyGEOS.
# NOTE(review): geopandas reads USE_PYGEOS when it is first imported. If
# `utils` (imported above) already pulls in geopandas, this assignment runs
# too late to have any effect — confirm, and move it before the imports if so.
os.environ["USE_PYGEOS"] = "0"

In [None]:
# Start a Coiled cluster with 10 workers and obtain a client connected to it.
# NOTE: nothing in the visible code shuts the cluster down afterwards.
cluster = coiled.Cluster(n_workers=10)
client = cluster.get_client()

In [None]:
# Build the run catalog from the CSV listing of NEX-GDDP-CMIP6 files: every
# file URL is split on "/" and the trailing pieces become named columns.
df = pd.read_csv(
    "s3://carbonplan-climate-impacts/extreme-heat/v1.0/inputs/nex-gddp-cmip6-files.csv"
)
# NOTE: the column label really is " fileURL", with a leading space.
nasa_nex_runs_df = (
    pd.DataFrame([url.split("/") for url in df[" fileURL"].values])
    # Discard the first four "/"-separated pieces of each URL; the remaining
    # pieces are labelled below.
    .drop(columns=[0, 1, 2, 3])
    .set_axis(
        ["GCM", "scenario", "ensemble_member", "variable", "file_name"],
        axis=1,
    )
)

In [None]:
# Utility functions to load nasa-nex NetCDF files
def find_nasanex_filename(gcm, scenario):
    """
    Look up the ensemble member and grid code for one GCM/scenario pair.

    The module-level catalog ``nasa_nex_runs_df`` is filtered down to the
    ``tasmax`` files of the requested GCM and scenario. The first matching
    file name serves as a template: it is split on "_" into seven fields
    (variable, timestep, GCM, scenario, ensemble member, grid code, year+".nc"),
    and the ensemble member and grid code are returned.

    Parameters:
        gcm: GCM name as it appears in the catalog's "GCM" column.
        scenario: scenario name as it appears in the "scenario" column.

    Returns:
        Tuple ``(ensemble_member, grid_code)`` of strings.

    Raises:
        IndexError: the catalog has no tasmax file for this GCM/scenario.
        ValueError: the template file name does not split into exactly
            seven "_"-separated fields.
    """
    matching_files = nasa_nex_runs_df.loc[
        (nasa_nex_runs_df["GCM"] == gcm)
        & (nasa_nex_runs_df["scenario"] == scenario)
        & (nasa_nex_runs_df["variable"] == "tasmax"),
        "file_name",
    ]
    if matching_files.empty:
        # Same exception type that `.iloc[0]` would raise on an empty
        # selection, but with a message that names the missing run.
        raise IndexError(
            f"No tasmax file found in the NASA-NEX catalog for "
            f"GCM {gcm!r} and scenario {scenario!r}"
        )
    template_filename = matching_files.iloc[0]
    (
        _variable,
        _timestep,
        _gcm,
        _scenario,
        ensemble_member,
        grid_code,
        _yearnc,
    ) = template_filename.split("_")
    return ensemble_member, grid_code


def load_nasanex(scenario, gcm, variables, years, chunk_dict=None):
    """
    Open NEX-GDDP-CMIP6 NetCDF files from S3 as a single xarray Dataset.

    For each requested variable, one file per year is opened anonymously
    through s3fs and the yearly files are combined with
    ``xr.open_mfdataset`` (h5netcdf engine). The ensemble member and grid code
    used in the file paths come from ``find_nasanex_filename``.

    The first variable supplies the time coordinate; every later variable has
    its time coordinate overwritten with those values before being added
    (presumably so all variables share identical time stamps).

    Parameters:
        scenario: scenario name used in the S3 path and file name.
        gcm: GCM name used in the S3 path and file name.
        variables: sequence of variable names to load.
        years: iterable of years; one file per year and variable is opened.
        chunk_dict: optional argument forwarded to ``Dataset.chunk``; when
            None, the dataset is returned without rechunking.

    Returns:
        Dataset holding one data variable per entry of ``variables``.
    """
    fs = s3fs.S3FileSystem(anon=True, default_fill_cache=False)
    ensemble_member, grid_code = find_nasanex_filename(gcm, scenario)

    def _open_variable(var):
        # One open S3 file handle per year, combined into a single dataset.
        yearly_handles = [
            fs.open(
                f"nex-gddp-cmip6/NEX-GDDP-CMIP6/{gcm}/{scenario}/"
                f"{ensemble_member}/{var}/{var}_day_{gcm}_{scenario}"
                f"_{ensemble_member}_{grid_code}_{year}.nc"
            )
            for year in years
        ]
        return xr.open_mfdataset(yearly_handles, engine="h5netcdf")

    combined = xr.Dataset()
    is_first = True
    for var in variables:
        opened = _open_variable(var)
        if not is_first:
            # Reuse the first variable's time stamps for every later variable.
            opened["time"] = combined[variables[0]]["time"].values
        combined[var] = opened[var]
        is_first = False

    return combined if chunk_dict is None else combined.chunk(chunk_dict)

In [None]:
# Years to process for each scenario. Each np.arange here spans exactly one
# year: 1985 for "historical" and 2015 for "ssp245".
scenario_years = {"historical": np.arange(1985, 1986), "ssp245": np.arange(2015, 2016)}

# For timing we only use a subset of 10 GCMs x 2 scenarios. This reassignment
# replaces the full `gcm_list` defined at the top of the notebook.
gcm_list = [
    "ACCESS-CM2",
    "ACCESS-ESM1-5",
    "BCC-CSM2-MR",
    "CanESM5",
    "CMCC-CM2-SR5",
    "CMCC-ESM2",
    "CNRM-CM6-1",
    "CNRM-ESM2-1",
    "EC-Earth3-Veg-LR",
    "EC-Earth3",
]

In [None]:
# Load elevation data for the WBGT calculation via the `utils.load_elev` helper.
# NOTE(review): `elev` is not passed to `generate_WBGT`, nor used anywhere else
# in the visible code — confirm whether it is still needed here.
elev = load_elev()

In [None]:
# Write one WBGT Zarr store per GCM / scenario / year.
generate_wbgt_projections = True  # set to False to skip the generation loop
variables = ["tasmax", "huss", "tas"]

# Every (gcm, scenario, years) combination to process, in GCM-major order;
# empty when generation is switched off.
pending_runs = (
    [
        (gcm, scenario, years)
        for gcm in gcm_list
        for scenario, years in scenario_years.items()
    ]
    if generate_wbgt_projections
    else []
)

for gcm, scenario, years in pending_runs:
    id_string = f"{gcm}-{scenario}"
    print(id_string)  # progress marker, printed once per GCM/scenario pair
    for year in years:
        output = (
            f"s3://carbonplan-scratch/TEMP_NASA_NEX/wbgt-shade-"
            f"gridded/years/{gcm}/{id_string}-{year}.zarr"
        )
        # Load a single year and keep at most its first 365 time steps.
        ds = load_nasanex(
            scenario=scenario, gcm=gcm, variables=variables, years=[year]
        ).isel(time=slice(0, 365))
        generate_WBGT(ds=ds, output_fpath=output)