# 09: Create compiled datasets and summary statistics
*Compile datasets from different GCMs and create summary statistics (e.g., annual maxima, days over threshold).*

In [None]:
import coiled
import dask
import xarray as xr
from utils import gcm_list, load_multimodel_results, summarize

Set up a Coiled cluster and connect a Dask client to it so the computations below run in parallel on remote workers.

In [None]:
# Settings for the Coiled cluster backing this notebook: a small fixed
# starting size on m7g.medium machines in us-west-2, using spot instances
# with fallback.
cluster_options = {
    "n_workers": 2,
    "name": "09",
    "worker_vm_types": ["m7g.medium"],
    "scheduler_vm_types": ["m7g.medium"],
    "region": "us-west-2",
    "spot_policy": "spot_with_fallback",
}
cluster = coiled.Cluster(**cluster_options)

# Allow the cluster to scale between 1 and 100 workers.
cluster.adapt(minimum=1, maximum=100)

# Dask client bound to the cluster created above.
client = cluster.get_client()

Compile all individual GCM datasets into one multimodel dataset that is optimally chunked for timeseries analysis. Create summaries for each analysis period and multimodel medians of those summary statistics.

In [None]:
# Scenario identifiers passed to the multimodel loader.
scenarios = ["ssp245", "ssp370"]

# For each metric, the variable name that is renamed to the metric name
# after loading.
rename_dict = {
    "wbgt-shade": "scen",
    "wbgt-sun": "WBGT-sun",
}

# Future windows, keyed by the label used in the period names.
# Note: additional analysis periods were added for the Central Asia extension.
_future_windows = {
    "2030": slice("2020", "2039"),
    "2050": slice("2040", "2059"),
    "2090": slice("2080", "2099"),
}

# Period name -> time slice. The historical baseline comes first, followed
# by every future window for each scenario, in scenario order.
analysis_period = {
    "historical": slice("1985", "2014"),
    **{
        f"{ssp}-{label}": window
        for ssp in scenarios
        for label, window in _future_windows.items()
    },
}

In [None]:
# For each WBGT metric: load the multimodel results, write one daily store
# per analysis period, then summarize that store and write per-GCM and
# multimodel-median summaries.
for metric in ["wbgt-sun", "wbgt-shade"]:
    # "wbgt-sun" -> ("wbgt", "sun"); the first part is handed to summarize()
    # and the second part is used in the output store names.
    metric_family, exposure = metric.split("-")

    # NOTE(review): gcm_list[0:1] restricts the run to the first GCM only,
    # while the notebook text describes compiling all GCMs. Confirm whether
    # this slice is a leftover from a timing test before a production run.
    full_ds = load_multimodel_results(gcm_list[0:1], scenarios, metric)
    # Single chunk along time and gcm, 850 regions per chunk.
    full_ds = full_ds.chunk({"time": -1, "processing_id": 850, "gcm": -1})
    full_ds = full_ds.rename({rename_dict[metric]: metric})

    # Sorting by time does not depend on the analysis period, so do it once
    # per metric instead of once per period.
    sorted_ds = full_ds.sortby("time")

    for scenario, timeframe in analysis_period.items():
        print(scenario, timeframe)
        # NOTE(review): only the time slice is applied here; the scenario
        # part of the period name is not used to subset the data. Confirm
        # that this is intended for the dataset load_multimodel_results
        # returns.
        # NOTE(review): the daily store lives under "extreme-heat-extension"
        # while the summaries below go to "extreme-heat" - confirm that the
        # two prefixes are meant to differ.
        compiled_store = (
            "s3://carbonplan-scratch/extreme-heat-extension/v1.1/"
            + f'outputs/zarr/daily/{scenario}-WBGT-{exposure}.zarr'
        )
        sorted_ds.sel(time=timeframe).to_zarr(
            compiled_store, mode="w", zarr_format=2, consolidated=True
        )
        # 1 gcm ~ 4 mins on 1 worker to this point
        # Re-open the store that was just written and summarize from it.
        ds = xr.open_zarr(compiled_store)
        summarized = summarize(ds[metric], metric_family).load()

        # Median over the years of the period for each GCM, then the median
        # of those values across GCMs.
        annual_medians = summarized.sel(year=timeframe).median(dim="year")
        ensemble_median = annual_medians.median(dim="gcm")
        # Append the ensemble median as an extra "gcm" entry labelled
        # "multimodel_median".
        results = xr.concat(
            [
                annual_medians,
                ensemble_median.expand_dims(dim={"gcm": ["multimodel_median"]}),
            ],
            dim="gcm",
        ).load()
        summary_store = (
            "s3://carbonplan-scratch/extreme-heat/v1.1/outputs/"
            + f'zarr/summaries/{scenario}-summaries-WBGT-{exposure}.zarr'
        )
        print(summary_store)
        results = dask.optimize(results)[0]
        results.to_zarr(summary_store, mode="w", zarr_format=2, consolidated=True)

In [None]:
# Shut the cluster down once the stores above have been written.
cluster.shutdown()

In [None]:
import xarray as xr

# Summary store for the historical period of the shade metric, as written by
# the compile loop earlier in this notebook.
historical_shade_summary_store = "s3://carbonplan-scratch/extreme-heat/v1.1/outputs/zarr/summaries/historical-summaries-WBGT-shade.zarr"
results = xr.open_zarr(historical_shade_summary_store, chunks={})

In [None]:
# Display the opened summary dataset.
results

In [None]:
# Table of days exceeding 25 degC for the multimodel median under ssp245,
# sorted by processing_id.
# The summaries are written with the ensemble entry labelled
# "multimodel_median", so that exact label has to be selected here.
df = (
    results.sel(gcm="multimodel_median", scenario="ssp245")
    # drop_vars replaces the deprecated Dataset.drop; it removes the scalar
    # "gcm" coordinate left behind by the selection.
    .drop_vars("gcm")["days_exceeding_25degC"]
    .to_dataframe()
    .sort_values("processing_id")
)

In [None]:
# Display the table of days exceeding 25 degC.
df

In [None]:
# Display the table again (same expression as the previous cell).
df

In [None]:
import geopandas as gpd

# Location of the regions-and-cities JSON input (v1.0 inputs).
path = (
    "s3://carbonplan-climate-impacts/extreme-heat/v1.0/inputs/"
    "all_regions_and_cities.json"
)
# Read the file into a GeoDataFrame.
regions_df = gpd.read_file(path)
# Disabled: an alternative filter against the processing_ids of a sample
# dataset, plus removal of several identifier columns. Kept for reference.
# sample_ds = xr.open_zarr(
#     "s3://carbonplan-scratch/extreme-heat/wbgt-sun-regions/wbgt-sun-ACCESS-CM2-ssp245.zarr"
# )
# regions_df = regions_df[
#     regions_df["processing_id"].isin(sample_ds.processing_id.values)
# ]
# regions_df = regions_df.drop(
#     ["UACE20", "NAMELSAD20", "gadmid", "hierid", "ISO"], axis=1
# )
# Display the loaded regions.
regions_df

In [None]:
# Keep only the regions whose processing_id is present in the summary
# results. The explicit copy makes the filtered frame independent of the
# original, so adding columns to it later does not trigger pandas'
# chained-assignment warning.
has_results = regions_df["processing_id"].isin(results.processing_id.values)
regions_df = regions_df.loc[has_results].copy()

In [None]:
# Display the table once more before attaching it to the regions.
df

In [None]:
# Attach the summary value to each region by matching on processing_id.
# Plain column assignment would align on index labels, and regions_df keeps
# the row labels from the input file rather than processing_id.
# NOTE(review): assumes df is indexed by processing_id (as produced by
# to_dataframe on a processing_id dimension) - confirm.
regions_df["days_exceeding_25degC"] = regions_df["processing_id"].map(
    df["days_exceeding_25degC"]
)

In [None]:
# Columns to show on the interactive map: name, id, summary value, geometry.
map_columns = ["UC_NM_MN", "processing_id", "days_exceeding_25degC", "geometry"]
regions_df[map_columns].explore()

In [None]:
# Number of regions in the current regions_df.
len(regions_df)