# 09: Create compiled datasets and summary statistics
*Compile datasets from different GCMs and create summary statistics (e.g., annual maxima, days over threshold, heatwave days).*

In [None]:
import logging

import dask
import fsspec
import geopandas as gpd
import pandas as pd
import xarray as xr
import xclim
from dask.distributed import Client
from utils import gcm_list, load_multimodel_results, summarize

Set up a local Dask cluster and connect a client to it so the computations below run in parallel.

In [None]:
# Start a Dask client with four workers running one thread each; only log
# records at ERROR level or above are emitted.
client = Client(
    n_workers=4,
    threads_per_worker=1,
    silence_logs=logging.ERROR,
)
client

Compile all individual GCM datasets into one multimodel dataset that is optimally chunked for timeseries analysis. Create summaries for each analysis period and multimodel medians of those summary statistics.

In [None]:
# Maps each metric to the name its data variable has in the loaded multimodel
# results; the compile step renames that variable to the metric name.
rename_dict = {
    "wbgt-shade": "scen",
    "wbgt-sun": "wbgt-sun",
}

# (label, first year, last year) of every analysis period. The resulting
# slices are used for label-based `.sel(...)` selection further down.
_period_years = (
    ("historical", "1985", "2014"),
    ("ssp245-2030", "2020", "2039"),
    ("ssp245-2050", "2040", "2059"),
)
analysis_period = {
    label: slice(first_year, last_year)
    for label, first_year, last_year in _period_years
}

In [None]:
for metric in ["wbgt-sun", "wbgt-shade"]:
    # Suffix used in the output store names: "sun" or "shade".
    exposure = metric.split("-")[1]

    # Load the results for every GCM as one dataset, keep each time series in
    # a single chunk, and name the data variable after the metric.
    full_ds = load_multimodel_results(gcm_list, metric)
    full_ds = full_ds.chunk({"time": -1, "processing_id": 850})
    full_ds = full_ds.rename({rename_dict[metric]: metric})

    for scenario, timeframe in analysis_period.items():
        # Build the path from adjacent literals inside parentheses. A single
        # quoted string continued with a backslash would embed the quote
        # characters and indentation in the S3 key, and the heatwave cells
        # that re-open this store would then not find it.
        compiled_store = (
            "s3://carbonplan-climate-impacts/extreme-heat/v1.0/"
            f"outputs/zarr/daily/{scenario}-WBGT-{exposure}.zarr"
        )
        full_ds.sel(time=timeframe).to_zarr(compiled_store, mode="w", consolidated=True)

        # Re-open the store that was just written and compute the yearly
        # summary statistics from it, with all GCMs in one chunk.
        ds = xr.open_zarr(compiled_store).chunk({"gcm": -1})
        summarized = summarize(ds[metric], metric.split("-")[0]).chunk({"year": -1})

        # Median over the years of the period for each GCM, then the median of
        # those values across GCMs, appended as an extra "multimodel_median"
        # entry along the gcm dimension.
        annual_medians = summarized.sel(year=timeframe).median(dim="year")
        ensemble_median = annual_medians.median(dim="gcm")
        results = xr.concat(
            [
                annual_medians,
                ensemble_median.expand_dims(dim={"gcm": ["multimodel_median"]}),
            ],
            dim="gcm",
        ).chunk({"gcm": -1})

        summary_store = (
            "s3://carbonplan-climate-impacts/extreme-heat/v1.0/outputs/"
            f"zarr/summaries/{scenario}-summaries-WBGT-{exposure}.zarr"
        )
        # Simplify the task graph before writing.
        results = dask.optimize(results)[0]
        results.to_zarr(summary_store, mode="w", consolidated=True)

In [None]:
# Read the region and city geometries.
path = "s3://carbonplan-extreme-heat/inputs/all_regions_and_cities.json"
with fsspec.open(path) as file:
    regions_df = gpd.read_file(file)

# Keep only the regions whose processing_id is present in a sample results
# store.
sample_ds = xr.open_zarr(
    "s3://carbonplan-extreme-heat/temp/wbgt-sun-regions/wbgt-sun-ACCESS-CM2.zarr"
)
processed_ids = sample_ds.processing_id.values
has_results = regions_df["processing_id"].isin(processed_ids)
regions_df = regions_df[has_results]

In [None]:
def prep_heatwaves(da, metric):
    """
    Roll the southern hemisphere series by 180 steps along time so that
    heatwaves are captured correctly in the southern hemisphere.

    Regions are split by hemisphere using the module-level ``regions_df``:
    the rows returned by ``regions_df.cx[:, :0]`` are treated as southern and
    every other ``processing_id`` as northern. Only the southern subset is
    rolled (180 days for daily data); the northern subset is left untouched.

    The two subsets are concatenated along ``processing_id``, northern first,
    and returned as a dataset holding a single variable named ``metric``.
    For the WBGT metrics the ``units`` attribute is set to ``"degC"``.
    """
    all_ids = regions_df.processing_id.values
    south_ids = regions_df.cx[:, :0].processing_id.values
    north_ids = list(set(all_ids) - set(south_ids))

    # Handle each hemisphere on its own and stitch them back together
    # afterwards to ease computation.
    north = da.sel({"processing_id": north_ids})
    south = da.sel({"processing_id": south_ids}).roll({"time": 180})
    combined = xr.concat([north, south], dim="processing_id")

    if metric in ("wbgt-shade", "wbgt-sun"):
        combined.attrs["units"] = "degC"
    return combined.to_dataset(name=metric)

Compile all individual GCM datasets into one multimodel dataset that is optimally chunked for heatwave timeseries analysis.

In [None]:
for metric in ["wbgt-shade", "wbgt-sun"]:
    for scenario, timeframe in analysis_period.items():
        # Source: the compiled daily store for this scenario and metric.
        compiled_store = (
            "s3://carbonplan-climate-impacts/extreme-heat/v1.0/outputs/"
            f"zarr/daily/{scenario}-WBGT-{metric.split('-')[1]}.zarr"
        )
        # Destination: scratch store holding the hemisphere-adjusted series.
        heat_wave_store = (
            "s3://carbonplan-scratch/extreme-heat/"
            f"{metric}-compiled-for-heatwaves-{scenario}.zarr"
        )

        ds = xr.open_zarr(compiled_store)
        period_da = ds.sel(time=timeframe)[metric]
        out_ds = prep_heatwaves(period_da, metric)
        out_ds = out_ds.chunk({"processing_id": 850})
        out_ds.to_zarr(heat_wave_store, mode="w")

Create heatwave summaries for each analysis period and multimodel medians of those summary statistics.

In [None]:
for metric in ["wbgt-shade", "wbgt-sun"]:
    # Every metric/scenario combination is processed; nothing is skipped, so a
    # clean run produces the complete set of heatwave summaries.
    for scenario in analysis_period:
        heat_wave_store = (
            "s3://carbonplan-scratch/extreme-heat/"
            f"{metric}-compiled-for-heatwaves-{scenario}.zarr"
        )
        da = xr.open_zarr(heat_wave_store)[metric].chunk({"processing_id": 10000})

        # Replace the time coordinate with a daily index normalized to
        # midnight, spanning the first to the last stored timestamp.
        # NOTE(review): assumes one value per calendar day with no gaps;
        # otherwise the lengths will not match. TODO confirm.
        da["time"] = pd.date_range(
            da["time"].values[0], da["time"].values[-1], normalize=True
        )

        period_median = xr.Dataset()
        for threshold in [29, 30.5, 32, 35]:
            # Yearly heat wave index for runs of at least `window` days above
            # the threshold, reduced to its median over the years.
            heatwave_days = xclim.indicators.atmos.heat_wave_index(
                da, window=5, freq="YS", thresh=f"{threshold} degC"
            )
            period_median[f"heatwave-days-over-{threshold}degC"] = heatwave_days.median(
                dim="time"
            )
        period_median.attrs["units"] = "days_over_threshold"

        # Append the across-GCM median as an extra "multimodel_median" entry
        # along the gcm dimension.
        ensemble_median = period_median.median(dim="gcm")
        results = xr.concat(
            [
                period_median,
                ensemble_median.expand_dims(dim={"gcm": ["multimodel_median"]}),
            ],
            dim="gcm",
        ).chunk({"gcm": 1})
        # Simplify the task graph before writing.
        results = dask.optimize(results)[0]

        out_file = (
            "s3://carbonplan-climate-impacts/extreme-heat/v1.0/outputs/zarr/"
            f"summaries/{scenario}-summaries-heatwaves-WBGT-{metric.split('-')[1]}.zarr"
        )
        results.to_zarr(out_file, mode="w", consolidated=True)