## Cal-Adapt Analytics Engine Data processing script for DWR

This notebook downloads climate data for pr, tasmax, and tasmin, aggregates the data to a monthly time step, weights each sub-basin of interest by Area and by Flow, and determines the average monthly value for the entire watershed. The Area-weighted and Flow-weighted results are then processed into a monthly rolling-average series for pr, tasmax, tasmin, and tasave.

The output format of processed climate data are listed below:
* Each sub-basin is exported to CSV without the weights, with the file name format {source_id}_{experiment_id}_{member_id}_{basin_id}-19.csv, where {basin_id} is the two-digit, zero-padded basin ID.
* The Area- and Flow-weighted values are exported to CSV files in separate directories with the file name format {source_id}_{experiment_id}_{member_id}_19{weight_type}Weighted.csv.
* The monthly rolling-average results are exported to CSV files in separate directories based on the input weighting.  The file name format is {source_id}_{member_id}_{experiment_id}_30yrAve.csv.


Note: Every SSP/realization (member_id) must have the corresponding historical/realization (member_id); otherwise the 30YrAve post-processing will throw an error.

Inputs listed below:
* data/Basin_Weights.csv: Contains the data for the basin weighting (columns `ID`, `Regions`, `Flow Ratio`, and `Area Ratio`).
* data/RunLists/GCM_Run_List_#-#.csv: Contains a run list for all GCMs of interest.  Note: Each SSP must have a corresponding historical run for the 30-year rolling-average calculation.

Known Issues:  
* When running a large run list, if your screen is locked or the browser loses focus, the kernel will lose its connection to the Dask cluster.  To avoid this, you can either convert the notebook to a script and run it in a terminal multiplexer such as `tmux` or `screen`, or simply shorten the run list.


In [1]:
import csv
from io import StringIO
import os
import zipfile

import pandas as pd
import intake
import numpy as np
import xarray as xr
import rioxarray
import dask
import panel as pn
from dask.distributed import get_client

pn.extension()
xr.set_options(keep_attrs=True)
dask.config.set({"array.slicing.split_large_chunks": True})


<dask.config.set at 0x7fb9725b6930>

In [2]:
# VARIABLES
# Use these coordinates to clip around the watershed of interest.
# The keys (minx/miny/maxx/maxy) are the ones read by trim_dataset_to_bbox, which
# tags the dataset as EPSG:4326 before clipping, so the values are lon/lat degrees.
bbox = {
    "maxy": 42.432494,
    "miny": 34.775317,
    "minx": -123.097421,
    "maxx": -117.980799,
}

# Run list selection.
# NOTE: every uncommented pair below overwrites the previous one, so ONLY THE LAST
# uncommented run_list_path / file_zip pair is actually used when this cell runs.
# To process a different batch, comment out all pairs after the one you want
# (or move the wanted pair to the end).
# NOTE(review): file_zip is not used in the cells visible here; presumably it is the
# name of the archive the outputs are zipped into — confirm against the export cell.

#run_list_path = "data/GCM_Run_List_1-43.csv"
#file_zip = "GCM_1-43.zip"

run_list_path = "data/RunLists/GCM_Run_List_1-19.csv"
file_zip = "GCM_1-19.zip"

run_list_path = "data/RunLists/GCM_Run_List_20-43.csv"
file_zip = "GCM_20-43.zip"

run_list_path = "data/RunLists/GCM_Run_List_44-82.csv"
file_zip = "GCM_44-82.zip"

run_list_path = "data/RunLists/GCM_Run_List_83-100.csv"
file_zip = "GCM_83-100.zip"

run_list_path = "data/RunLists/GCM_Run_List_Add_1-22.csv"
file_zip = "GCM_Add_1-22.zip"

run_list_path = "data/RunLists/GCM_Run_List_Add_23-47.csv"
file_zip = "GCM_Add_23-47.zip"

# This is the last uncommented pair, i.e. the one currently in effect.
run_list_path = "data/RunLists/GCM_Run_List_Add_48-75.csv"
file_zip = "GCM_Add_48-75.zip"

#run_list_path = "data/GCM_Run_List_Test_Rolling.csv"
#file_zip = "GCM_Rolling_Test.zip"

# CSV with the per-basin weights; read by get_region_dict
# (columns: ID, Regions, Flow Ratio, Area Ratio).
basin_weights_csv = "data/Basin_Weights.csv"
# intake-esm catalog opened by get_dataset.
esm_datastore = "https://cadcat.s3.amazonaws.com/cae-collection.json"
# NOTE(review): the output folder and the dir_* names below are not used in the cells
# visible here; presumably they are the export directories — confirm in the export cell.
output_folder = "outputs"
# NumPy .npy file holding the (lat, lon) region-ID mask attached by add_mask_to_dataset.
mask_path = "mask/mask.npy"
dir_area_weighted = 'AREA_WEIGHTED_CENTRALVALLEY'
dir_flow_weighted = 'FLOW_WEIGHTED_CENTRALVALLEY'
dir_non_weighted = 'NON_WEIGHTED_CENTRALVALLEY'
dir_individual = 'INDIVIDUAL_BASIN_LOCA2'
dir_area_weighted_rolling = 'AREA_WEIGHTED_30_YEAR_ROLLING_AVE_CENTRALVALLEY'
dir_flow_weighted_rolling = 'FLOW_WEIGHTED_30_YEAR_ROLLING_AVE_CENTRALVALLEY'


This cell starts a Dask client for faster processing.  If a client is already running, the cell reuses it instead of creating a new cluster.

In [3]:
# Start (or reuse) a local dask cluster for faster computing.
# Note, this will take some time but in the long run processing should be faster when compute is called.
# Check if there is already a dask client running; get_client() is expected to raise
# ValueError when none exists, in which case a new local cluster is created.
try:
    client = get_client()
    print("Using existing dask client")
except ValueError:
    # No client found, create a new one
    cluster = dask.distributed.LocalCluster(
        n_workers=16,  # Adjust the number of workers as needed
        threads_per_worker=1,  # Use one thread per worker
        memory_limit='2GB'  # Adjust memory limit per worker as needed
    )
    # Let the cluster scale adaptively between 0 and 16 workers.
    cluster.adapt(minimum=0, maximum=16)
    client = cluster.get_client()
    print("Created new dask client")

# Get client link (the bare expression makes the notebook display the client summary)
client

Created new dask client


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 16
Total threads: 16,Total memory: 29.80 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:45569,Workers: 0
Dashboard: http://127.0.0.1:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:33895,Total threads: 1
Dashboard: http://127.0.0.1:43257/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:43193,
Local directory: /tmp/dask-scratch-space/worker-roo7hmbe,Local directory: /tmp/dask-scratch-space/worker-roo7hmbe

0,1
Comm: tcp://127.0.0.1:33513,Total threads: 1
Dashboard: http://127.0.0.1:38191/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:39713,
Local directory: /tmp/dask-scratch-space/worker-q4xqvjgk,Local directory: /tmp/dask-scratch-space/worker-q4xqvjgk

0,1
Comm: tcp://127.0.0.1:42253,Total threads: 1
Dashboard: http://127.0.0.1:41989/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:40581,
Local directory: /tmp/dask-scratch-space/worker-1q6pqmko,Local directory: /tmp/dask-scratch-space/worker-1q6pqmko

0,1
Comm: tcp://127.0.0.1:37351,Total threads: 1
Dashboard: http://127.0.0.1:40765/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:35281,
Local directory: /tmp/dask-scratch-space/worker-9x7f7x9z,Local directory: /tmp/dask-scratch-space/worker-9x7f7x9z

0,1
Comm: tcp://127.0.0.1:34029,Total threads: 1
Dashboard: http://127.0.0.1:41099/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:37633,
Local directory: /tmp/dask-scratch-space/worker-3zpx141q,Local directory: /tmp/dask-scratch-space/worker-3zpx141q

0,1
Comm: tcp://127.0.0.1:42109,Total threads: 1
Dashboard: http://127.0.0.1:36441/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:37181,
Local directory: /tmp/dask-scratch-space/worker-f_8aplok,Local directory: /tmp/dask-scratch-space/worker-f_8aplok

0,1
Comm: tcp://127.0.0.1:40195,Total threads: 1
Dashboard: http://127.0.0.1:40543/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:44661,
Local directory: /tmp/dask-scratch-space/worker-_g0_abiz,Local directory: /tmp/dask-scratch-space/worker-_g0_abiz

0,1
Comm: tcp://127.0.0.1:45681,Total threads: 1
Dashboard: http://127.0.0.1:39725/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:36201,
Local directory: /tmp/dask-scratch-space/worker-0zyzaw5w,Local directory: /tmp/dask-scratch-space/worker-0zyzaw5w

0,1
Comm: tcp://127.0.0.1:45047,Total threads: 1
Dashboard: http://127.0.0.1:44179/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:46193,
Local directory: /tmp/dask-scratch-space/worker-unqb65_8,Local directory: /tmp/dask-scratch-space/worker-unqb65_8

0,1
Comm: tcp://127.0.0.1:35565,Total threads: 1
Dashboard: http://127.0.0.1:37121/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:39157,
Local directory: /tmp/dask-scratch-space/worker-zeid_nb8,Local directory: /tmp/dask-scratch-space/worker-zeid_nb8

0,1
Comm: tcp://127.0.0.1:41845,Total threads: 1
Dashboard: http://127.0.0.1:44875/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:41827,
Local directory: /tmp/dask-scratch-space/worker-a__pn_yd,Local directory: /tmp/dask-scratch-space/worker-a__pn_yd

0,1
Comm: tcp://127.0.0.1:35741,Total threads: 1
Dashboard: http://127.0.0.1:36853/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:41325,
Local directory: /tmp/dask-scratch-space/worker-5cor1oxz,Local directory: /tmp/dask-scratch-space/worker-5cor1oxz

0,1
Comm: tcp://127.0.0.1:35219,Total threads: 1
Dashboard: http://127.0.0.1:39237/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:37959,
Local directory: /tmp/dask-scratch-space/worker-tnnjo96f,Local directory: /tmp/dask-scratch-space/worker-tnnjo96f

0,1
Comm: tcp://127.0.0.1:38787,Total threads: 1
Dashboard: http://127.0.0.1:33881/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:44171,
Local directory: /tmp/dask-scratch-space/worker-40czmhbg,Local directory: /tmp/dask-scratch-space/worker-40czmhbg

0,1
Comm: tcp://127.0.0.1:33695,Total threads: 1
Dashboard: http://127.0.0.1:39019/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:38847,
Local directory: /tmp/dask-scratch-space/worker-ta64623k,Local directory: /tmp/dask-scratch-space/worker-ta64623k

0,1
Comm: tcp://127.0.0.1:33459,Total threads: 1
Dashboard: http://127.0.0.1:38089/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:42427,
Local directory: /tmp/dask-scratch-space/worker-9m3475lr,Local directory: /tmp/dask-scratch-space/worker-9m3475lr


2025-08-05 12:51:52,439 - tornado.application - ERROR - Exception in callback functools.partial(<function TCPServer._handle_connection.<locals>.<lambda> at 0x7fb89a41b740>, <Task finished name='Task-75112' coro=<BaseTCPListener._handle_stream() done, defined at /srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py:655> exception=ValueError('invalid operation on non-started TCPListener')>)
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
          ^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/tornado/tcpserver.py", line 387, in <lambda>
    gen.convert_yielded(future), lambda f: f.result()
                                           ^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 661, in _handle_stream
    logger.debug("Incoming connection from %r to %r", address, self

In [4]:
def get_region_dict() -> dict:
    """
    Gets flow and area weights for each region from the basin weights CSV file.

    Returns
    -------
    dict
        Dictionary with region ID as key and dictionary containing region_name,
        flow_ratio, and area_ratio as values.
    """
    region_dict = {}
    with open(basin_weights_csv, "r") as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            region_dict[int(row["ID"])] = {
                                "region_name": row["Regions"],
                                "flow_ratio": float(row["Flow Ratio"]),
                                "area_ratio": float(row["Area Ratio"]),
                            }
    return region_dict

In [5]:
def get_model_params(run_list_path: str) -> list[dict]:
    """
    Load the GCM run list, one dictionary per CSV row.

    Parameters
    ----------
    run_list_path : str
        Path to the run-list CSV. The header row supplies the dictionary
        keys; the run lists used by this notebook carry activity_id,
        institution_id, source_id, experiment_id, table_id, grid_label,
        member_id, start_year, and end_year.

    Returns
    -------
    list[dict]
        One dictionary per data row, in file order. All values are the raw
        strings read from the CSV (no type conversion is done).

    Examples
    --------
    >>> params = get_model_params("data/GCM_Run_List_1-19.csv")
    >>> len(params)
    19
    >>> params[0].keys()
    dict_keys(['activity_id', 'institution_id', 'source_id', ...])
    """
    with open(run_list_path, "r") as run_list_file:
        # DictReader yields one mapping per row; materialise them before the
        # file is closed.
        return list(csv.DictReader(run_list_file))

In [6]:
def slice_by_time_years_dataset(ds: xr.Dataset, startyear: str, endyear: str) -> xr.Dataset:
    """
    Restrict a dataset to an inclusive range of years along ``time``.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with a ``time`` coordinate.
    startyear : str
        First year to keep (e.g., '1950'). The value is passed through
        ``str()``, so an int works as well.
    endyear : str
        Last year to keep (e.g., '2014'). Also passed through ``str()``.

    Returns
    -------
    xarray.Dataset
        The selection ``ds.sel(time=slice(startyear, endyear))``.

    Examples
    --------
    >>> sliced_ds = slice_by_time_years_dataset(ds, '1950', '2014')
    >>> # Returns dataset with time dimension sliced from 1950 to 2014
    """
    first_year = str(startyear)
    last_year = str(endyear)
    year_window = slice(first_year, last_year)
    return ds.sel(time=year_window)

In [7]:
def convert_daily_to_monthly_dataset(ds: xr.Dataset) -> xr.Dataset:
    """
    Convert daily climate data to monthly aggregated values.

    Converts daily precipitation from kg/m2/s to monthly accumulated mm,
    and daily temperature from Kelvin to monthly average Celsius.

    Parameters
    ----------
    ds : xarray.Dataset
        Input dataset containing daily climate data with variables:
        - pr: precipitation in kg/m2/s
        - tasmin: minimum temperature in Kelvin
        - tasmax: maximum temperature in Kelvin
        The input dataset is not modified.

    Returns
    -------
    xarray.Dataset
        New dataset with monthly aggregated climate data:
        - pr: monthly accumulated precipitation in mm/mon
        - tasmin: monthly average minimum temperature in degC
        - tasmax: monthly average maximum temperature in degC

    Notes
    -----
    Precipitation conversion: kg/m2/s * 86400 s/day = mm/day
    Temperature conversion: K - 273.15 = degC

    Grid cells that have no valid daily precipitation value in a month stay
    NaN in the monthly total (``min_count=1``). A plain ``sum()`` would turn
    them into 0 mm, and those artificial zeros would then be averaged into the
    basin means, whereas the temperature means already keep such cells as NaN.

    Examples
    --------
    >>> monthly_ds = convert_daily_to_monthly_dataset(daily_ds)
    >>> monthly_ds.pr.attrs['units']
    'mm/mon'
    >>> monthly_ds.tasmin.attrs['units']
    'degC'
    """
    # Convert daily precipitation from kg/m2/s to mm/day. The result is kept in
    # a separate array instead of being written back into ``ds`` so the
    # caller's dataset is left untouched.
    pr_daily = (ds['pr'] * 86400).assign_attrs(units='mm/day')

    # Resample precipitation to monthly sum (accumulated); min_count=1 keeps
    # all-missing cells as NaN instead of reporting 0 mm.
    pr_monthly = pr_daily.resample(time="M").sum(min_count=1)
    pr_monthly.attrs["units"] = 'mm/mon'

    # Resample temperature to monthly mean, then convert Kelvin to Celsius.
    temp_monthly = ds[['tasmin', 'tasmax']].resample(time="M").mean() - 273.15
    temp_monthly['tasmin'].attrs["units"] = 'degC'
    temp_monthly['tasmax'].attrs["units"] = 'degC'

    # Merge precipitation and temperature into one monthly dataset.
    return xr.merge([pr_monthly, temp_monthly])

In [8]:
def get_dataset(esm_datastore: str, model_params: dict) -> xr.Dataset:
  """
  Fetch one model run from the catalog as a monthly xarray.Dataset.

  Parameters
  ----------
  esm_datastore : str
    URL or path to the ESM datastore catalog
  model_params : dict
    Model parameters for a single run. Keys read here:
    activity_id, institution_id, table_id, experiment_id,
    grid_label, member_id, source_id, start_year, end_year

  Returns
  -------
  xarray.Dataset
    Precipitation (pr), maximum temperature (tasmax), and minimum
    temperature (tasmin), sliced to start_year..end_year and then
    converted to a monthly time step

  Raises
  ------
  KeyError
    If a required key is missing from ``model_params`` or the catalog
    search returns no dataset under the expected key

  Examples
  --------
  >>> params = {
  ...     'activity_id': 'CMIP',
  ...     'institution_id': 'NCAR',
  ...     'table_id': 'day',
  ...     'experiment_id': 'historical',
  ...     'grid_label': 'gn',
  ...     'member_id': 'r1i1p1f1',
  ...     'source_id': 'CESM2',
  ...     'start_year': '1950',
  ...     'end_year': '2014'
  ... }
  >>> ds = get_dataset("https://cadcat.s3.amazonaws.com/cae-collection.json", params)
  >>> list(ds.data_vars)
  ['pr', 'tasmax', 'tasmin']
  """
  # Open the catalog with intake-esm and narrow it down to this run.
  catalog = intake.open_esm_datastore(esm_datastore)
  search_fields = (
    "activity_id",
    "institution_id",
    "table_id",
    "experiment_id",
    "grid_label",
    "member_id",
    "source_id",
  )
  query = {field: model_params[field] for field in search_fields}
  matches = catalog.search(variable_id=['pr','tasmax','tasmin'], **query)

  # Anonymous access is enough for the public bucket.
  datasets_by_key = matches.to_dataset_dict(storage_options={'anon': True})

  # The datasets are looked up by a dotted key made of these fields, in this order.
  key_fields = (
    'activity_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'table_id',
    'grid_label',
  )
  dataset_key = ".".join(str(model_params[field]) for field in key_fields)

  # Slice to the requested years first, then aggregate to monthly values.
  daily_ds = slice_by_time_years_dataset(
    datasets_by_key[dataset_key],
    model_params['start_year'],
    model_params['end_year'],
  )
  return convert_daily_to_monthly_dataset(daily_ds)

In [9]:
def add_mask_to_dataset(mask_path: str, ds: xr.Dataset) -> xr.Dataset:
    """
    Attach the region mask stored in ``mask_path`` to ``ds`` as a coordinate.

    Parameters
    ----------
    mask_path : str
        Path to the numpy mask file (.npy format)
    ds : xarray.Dataset
        Dataset that receives the mask. It is modified in place; the mask
        array must match the dataset's ('lat', 'lon') shape.

    Returns
    -------
    xarray.Dataset
        The same ``ds`` object, now carrying a ``mask`` coordinate on
        ('lat', 'lon')

    Examples
    --------
    >>> masked_ds = add_mask_to_dataset('mask/mask.npy', ds)
    >>> 'mask' in masked_ds.coords
    True
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
    # objects, which can execute code. Only load mask files from a trusted
    # source; if the mask is a plain numeric array this flag can likely be
    # dropped — confirm against the file that ships with the project.
    region_ids = np.load(mask_path, allow_pickle=True)
    ds.coords['mask'] = (('lat', 'lon'), region_ids)
    return ds

In [10]:
def trim_dataset_to_bbox(ds: xr.Dataset, bbox: dict) -> xr.Dataset:
    """
    Clip the dataset to a bounding box.

    The dataset is first told which dimensions are spatial ('lon' / 'lat')
    and tagged with EPSG:4326, both in place, because the rioxarray clip
    needs that information. The clipped result is returned.

    Parameters
    ----------
    ds : xr.Dataset
        Dataset with 'lon' and 'lat' dimensions. Its rio spatial dims and
        CRS are set in place.
    bbox : dict
        Bounding box with keys:
        - 'minx': minimum longitude
        - 'miny': minimum latitude
        - 'maxx': maximum longitude
        - 'maxy': maximum latitude

    Returns
    -------
    xr.Dataset
        Dataset clipped to the specified bounding box

    Examples
    --------
    >>> bbox = {
    ...     "maxy": 42.432494,
    ...     "miny": 34.775317,
    ...     "minx": -123.097421,
    ...     "maxx": -117.980799,
    ... }
    >>> clipped_ds = trim_dataset_to_bbox(ds, bbox)
    >>> # Returns dataset clipped to the watershed of interest
    """
    # Required set-up for the clip: declare spatial dims and CRS.
    ds.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
    ds.rio.write_crs("EPSG:4326", inplace=True)

    # Pass only the four edges clip_box understands, whatever else bbox holds.
    edges = {edge: bbox[edge] for edge in ("minx", "miny", "maxx", "maxy")}
    return ds.rio.clip_box(**edges)

In [11]:
def get_output_file_name_monthly(model_params: dict, end_part: str) -> str:
    """
    Build the CSV file name for one model run.

    Parameters
    ----------
    model_params : dict
        Model parameters; the keys read here are:
        - source_id : str
            Source identifier for the model
        - experiment_id : str
            Experiment identifier (e.g., 'historical', 'ssp245')
        - member_id : str
            Member identifier (e.g., 'r1i1p1f1')
    end_part : str
        Last component of the name, placed before the .csv extension

    Returns
    -------
    str
        File name of the form {source_id}_{experiment_id}_{member_id}_{end_part}.csv

    Examples
    --------
    >>> params = {
    ...     'source_id': 'CESM2',
    ...     'experiment_id': 'historical',
    ...     'member_id': 'r1i1p1f1'
    ... }
    >>> get_output_file_name_monthly(params, '19FlowWeighted')
    'CESM2_historical_r1i1p1f1_19FlowWeighted.csv'
    """
    name_parts = [
        model_params['source_id'],
        model_params['experiment_id'],
        model_params['member_id'],
        end_part,
    ]
    return '_'.join(str(part) for part in name_parts) + '.csv'

In [12]:
def load_dataset_with_mask(esm_datastore_in: str, model_params_in: dict, mask_path: str, bbox: dict) -> xr.Dataset:
    """
    Load one model run, attach the region mask, and clip to the area of interest.

    Convenience wrapper that chains get_dataset, add_mask_to_dataset and
    trim_dataset_to_bbox, in that order.

    Parameters
    ----------
    esm_datastore_in : str
        URL or path to the ESM datastore catalog
    model_params_in : dict
        Model parameters for a single run with keys:
        activity_id, institution_id, table_id, experiment_id,
        grid_label, member_id, source_id, start_year, end_year
    mask_path : str
        Path to the numpy mask file (.npy format) containing regional identifiers
    bbox : dict
        Bounding box with keys:
        - 'minx': minimum longitude
        - 'miny': minimum latitude
        - 'maxx': maximum longitude
        - 'maxy': maximum latitude

    Returns
    -------
    xr.Dataset
        Monthly pr, tasmax and tasmin with a ``mask`` coordinate, clipped to
        the bounding box

    Examples
    --------
    >>> ds = load_dataset_with_mask(esm_datastore, params, 'mask/mask.npy', bbox)
    >>> 'mask' in ds.coords
    True
    """
    monthly_ds = get_dataset(esm_datastore_in, model_params_in)
    # The mask is attached before clipping so it is trimmed together with the data.
    masked_ds = add_mask_to_dataset(mask_path, monthly_ds)
    return trim_dataset_to_bbox(masked_ds, bbox)
    

In [13]:
def get_df_map_mask(id_region: int, ds: xr.Dataset, use_full_mask: bool = False) -> pd.DataFrame:
    """
    Returns a dataframe for the specified region ID from the masked dataset.

    This function keeps only the grid cells selected by the mask, averages each
    variable over latitude and longitude, and formats the result as a pandas
    DataFrame with one row per time step.

    Parameters
    ----------
    id_region : int
        Region identifier compared against the ``mask`` coordinate.
    ds : xarray.Dataset
        Input dataset containing climate variables (pr, tasmin, tasmax), a
        ``mask`` coordinate with regional identifiers, and a ``spatial_ref``
        coordinate (it is dropped from the resulting frame).
    use_full_mask : bool, optional
        If True, keeps cells where mask != id_region (excludes the region).
        If False, keeps cells where mask == id_region (only the region).
        Default is False.

    Returns
    -------
    pandas.DataFrame
        Monthly values with, among others, the columns:
        - index: previous row index, added by the final ``reset_index()``
        - time: time stamp of the monthly value
        - Year: year as a 4-character string (``strftime('%Y')``)
        - Month: month number (1-12)
        - Pr (mm): spatial mean of pr
        - Tasmax (degC): spatial mean of tasmax
        - Tasmin (degC): spatial mean of tasmin
        - Tave (degC): row-wise mean of tasmax and tasmin

    Notes
    -----
    The spatial average is a plain (unweighted) mean over the 'lat' and 'lon'
    dimensions that skips NaN cells.

    NOTE(review): the column order is fixed by position with
    ``iloc[:, [3,4,7,0,1,2,5,6]]``, which requires the frame to have at least
    8 columns at that point. time, pr, tasmax, tasmin, Year, Month and Tave
    account for only 7, so the dataset presumably carries one more dimension
    or coordinate (e.g. member_id) that becomes a column — confirm, and
    consider selecting the columns by name instead.

    Examples
    --------
    >>> # Extract data for region 5
    >>> df = get_df_map_mask(5, ds)

    >>> # Extract data for entire domain excluding region 5
    >>> df_full = get_df_map_mask(5, ds, use_full_mask=True)
    """
    # Cells not selected by the condition become NaN and are skipped by the means below.
    if use_full_mask:
        map_data = ds.where(ds.mask != id_region)
    else:
        map_data = ds.where(ds.mask == id_region)

    results_precip = map_data.pr.mean(['lat','lon'],skipna=True)
    results_precip.attrs["units"]  = 'mm/mon'

    results_tasmin = map_data.tasmin.mean(['lat','lon'],skipna=True)
    results_tasmin.attrs["units"]  = 'degC'

    results_tasmax = map_data.tasmax.mean(['lat','lon'],skipna=True)
    results_tasmax.attrs["units"]  = 'degC'

    ds_all= xr.merge([results_precip,results_tasmax,results_tasmin])
    # reset_index turns the remaining dimension(s), e.g. time, into ordinary columns.
    df = ds_all.to_dataframe().reset_index()

    # spatial_ref only carries CRS metadata and is not wanted in the output.
    df.drop('spatial_ref',axis=1, inplace=True)

    # Year is stored as a string; consumers that compare years cast it with astype(int).
    df['Year'] = df['time'].dt.strftime('%Y')
    df['Month'] = df['time'].dt.month
    df['Tave (degC)'] = df[['tasmax','tasmin']].mean(axis=1)

    df.rename({'pr': 'Pr (mm)','tasmax': 'Tasmax (degC)','tasmin' : 'Tasmin (degC)'}, axis=1,inplace=True)

    # Positional reordering: depends on the exact column order built above (see NOTE(review) in the docstring).
    df_r = df.iloc[:,[3,4,7,0,1,2,5,6]]
    # Adds the old row index as a leading 'index' column.
    df_n = df_r.reset_index()

    return df_n

In [14]:
def get_weighted_dataframe(df_in: pd.DataFrame, weighting_factor: float) -> pd.DataFrame:
    """
    Scale the four climate columns of ``df_in`` by ``weighting_factor``.

    Parameters
    ----------
    df_in : pd.DataFrame
        Dataframe containing the columns:
        - 'Pr (mm)': precipitation in mm/mon
        - 'Tasmax (degC)': maximum temperature in degC
        - 'Tasmin (degC)': minimum temperature in degC
        - 'Tave (degC)': average temperature in degC
        Any other columns are left as they are.
    weighting_factor : float
        Multiplier applied to the four climate columns, typically the area
        ratio or flow ratio of a sub-basin.

    Returns
    -------
    pd.DataFrame
        The same ``df_in`` object with the climate columns scaled.

    Notes
    -----
    ``df_in`` is modified in place; pass a copy if the unweighted values are
    still needed.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'Pr (mm)': [100, 150, 200],
    ...     'Tasmax (degC)': [25, 30, 35],
    ...     'Tasmin (degC)': [10, 15, 20],
    ...     'Tave (degC)': [17.5, 22.5, 27.5]
    ... })
    >>> weighted_df = get_weighted_dataframe(df, 0.5)
    >>> weighted_df['Pr (mm)'].tolist()
    [50.0, 75.0, 100.0]
    """
    climate_columns = ('Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)', 'Tave (degC)')
    for column in climate_columns:
        df_in[column] = df_in[column] * weighting_factor
    return df_in

In [15]:
def get_sum_dataframes(df_in: pd.DataFrame, df_to_add: pd.DataFrame) -> pd.DataFrame:
    """
    Add the climate columns of ``df_to_add`` onto ``df_in``.

    Used to accumulate weighted sub-basin values into a running total.

    Parameters
    ----------
    df_in : pd.DataFrame
        Running total. Must contain the columns:
        - 'Pr (mm)': precipitation in mm/mon
        - 'Tasmax (degC)': maximum temperature in degC
        - 'Tasmin (degC)': minimum temperature in degC
        - 'Tave (degC)': average temperature in degC
    df_to_add : pd.DataFrame
        Dataframe whose four climate columns are added, row-aligned by index.

    Returns
    -------
    pd.DataFrame
        The same ``df_in`` object with the four climate columns updated.
        Other columns are untouched.

    Notes
    -----
    ``df_in`` is modified in place. The addition uses
    ``Series.add(..., fill_value=0)``: a value missing on one side counts as
    0; a value missing on both sides stays missing.

    Examples
    --------
    >>> result = get_sum_dataframes(df1, df2)
    >>> result['Pr (mm)'].tolist()
    [150.0, 225.0, 300.0]
    """
    climate_columns = ('Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)', 'Tave (degC)')
    for column in climate_columns:
        addend = df_to_add[column]
        df_in[column] = df_in[column].add(addend, fill_value=0)
    return df_in

In [16]:
def get_monthly_rolling_ave(dict_df_weighted_all: dict, average_over_years: int, append_name: str,
                            start_year: int = 1950, end_year: int = 2072) -> dict:
    """
    Calculate the monthly rolling average for each SSP (Shared Socioeconomic Pathway) run.

    Every SSP dataframe is appended to the historical dataframe of the same GCM
    and realization. For each window start year the rows whose ``Year`` falls in
    ``[start, start + average_over_years)`` are averaged per calendar month.

    Parameters
    ----------
    dict_df_weighted_all : dict
        Dictionary with filename keys and weighted dataframe values. Filenames
        must follow the format
        {source_id}_{experiment_id}_{member_id}_{suffix}.csv; the first three
        '_'-separated parts are used, so none of them may contain '_'.
        A file counts as historical when its name contains 'historical'.
        Each dataframe needs the columns 'time', 'Year', 'Month', 'Pr (mm)',
        'Tasmax (degC)', 'Tasmin (degC)' and 'Tave (degC)'.
    average_over_years : int
        Length of the rolling window in years (typically 30)
    append_name : str
        String to append to output filenames (e.g., '30yrAve')
    start_year : int, optional
        First window start year. Default 1950.
    end_year : int, optional
        Exclusive upper bound for the window start year. Default 2072, so the
        last 30-year window is 2071-2100.

    Returns
    -------
    dict
        Dictionary with filename keys and rolling average dataframe values.
        Output filenames follow format: {source_id}_{member_id}_{ssp_experiment}_{append_name}.csv
        Each dataframe contains columns:
        - 'Year Range': string showing start-end years of rolling window
        - 'Year (30y start)': start year of rolling window
        - 'Month': month number (1-12)
        - 'Pr (mm)': monthly average precipitation
        - 'Tasmax (degC)': monthly average maximum temperature
        - 'Tasmin (degC)': monthly average minimum temperature
        - 'Tave (degC)': monthly average temperature

    Notes
    -----
    Only SSP runs that have a historical run for the same GCM and realization
    produce output. A GCM that has historical data but no SSP run is skipped
    (it used to raise ``KeyError``). Windows that contain no rows contribute
    no rows to the result.

    Examples
    --------
    >>> weighted_data = {
    ...     'CESM2_historical_r1i1p1f1_19FlowWeighted.csv': df_hist,
    ...     'CESM2_ssp245_r1i1p1f1_19FlowWeighted.csv': df_ssp
    ... }
    >>> rolling_dict = get_monthly_rolling_ave(weighted_data, 30, '30yrAve')
    >>> list(rolling_dict.keys())
    ['CESM2_r1i1p1f1_ssp245_30yrAve.csv']
    """
    value_columns = ['Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)', 'Tave (degC)']
    output_columns = ["Year Range", "Year (30y start)", "Month"] + value_columns

    # Both lookups are nested as {gcm: {experiment: {realization: dataframe}}}.
    dict_gcm_hist_realization = {}
    dict_gcm_other_realization = {}
    for key_file, value_df in dict_df_weighted_all.items():
        lst_file_parts = key_file.split('_')
        target = dict_gcm_hist_realization if 'historical' in key_file else dict_gcm_other_realization
        by_experiment = target.setdefault(lst_file_parts[0], {})
        by_experiment.setdefault(lst_file_parts[1], {})[lst_file_parts[2]] = value_df

    dict_fil_rolling_df = {}  # Filename -> rolling average dataframe.
    for key_gcm, hist_by_experiment in dict_gcm_hist_realization.items():
        # A GCM without any SSP run simply has nothing to pair with.
        ssp_by_experiment = dict_gcm_other_realization.get(key_gcm, {})
        for hist_by_realization in hist_by_experiment.values():
            for key_realization, df_history in hist_by_realization.items():
                for key_ssp_other, ssp_by_realization in ssp_by_experiment.items():
                    if key_realization not in ssp_by_realization:
                        continue
                    df_ssp = ssp_by_realization[key_realization]

                    # Historical rows followed by the SSP rows form one continuous series.
                    df_rolling = pd.concat([df_history, df_ssp], axis=0).drop(columns='time')
                    # 'Year' may be stored as strings; convert once, not once per window.
                    years = df_rolling['Year'].astype(int)

                    window_frames = []
                    for current_year in range(start_year, end_year):
                        window_end = current_year + average_over_years  # exclusive
                        df_window = df_rolling[(years >= current_year) & (years < window_end)]
                        # Grouping by the column label keeps 'Month' as a regular
                        # column of the result.
                        df_monthly = df_window.groupby('Month', as_index=False, sort=True)[value_columns].mean()
                        df_monthly.insert(loc=0, column="Year (30y start)", value=current_year)
                        df_monthly.insert(loc=0, column="Year Range", value='%s-%s' % (current_year, window_end - 1))
                        window_frames.append(df_monthly)

                    # Concatenate once instead of growing the frame inside the loop.
                    if window_frames:
                        df_out = pd.concat(window_frames, axis=0)
                    else:
                        df_out = pd.DataFrame(columns=output_columns)

                    file_out = '%s_%s_%s_%s.csv' % (key_gcm, key_realization, key_ssp_other, append_name)
                    dict_fil_rolling_df[file_out] = df_out
    return dict_fil_rolling_df

The loop goes through all GCMs and writes the individual sub-basin, the area-weighted, and the flow-weighted results to result dictionaries, with the file name as the key and the result dataframe as the value.

In [17]:
# Main processing cell: for every model run in the run list, load the masked
# dataset, extract one dataframe per region, and accumulate area-weighted and
# flow-weighted sums across regions. Results are collected in three dicts
# keyed by output file name.
#
# Relies on names defined in earlier cells: get_region_dict, get_model_params,
# load_dataset_with_mask, get_df_map_mask, get_weighted_dataframe,
# get_sum_dataframes, get_output_file_name_monthly, run_list_path,
# esm_datastore, mask_path and bbox.

# Mapping of region id -> per-region settings; each value is read below for its
# 'area_ratio' and 'flow_ratio' entries.
region_dict = get_region_dict()
# main loop
all_model_params = get_model_params(run_list_path)

# Define output dicts (output file name -> result dataframe).
results_dict = {}
flow_weighted_results_dict = {}
area_weighted_basin_results_dict = {}
for model_params in all_model_params:
    #Add masking to the dataset.
    ds = load_dataset_with_mask(esm_datastore, model_params, mask_path,bbox)
   
    # Dotted identifier for this run; in this cell it is only used for the
    # "Loading" progress message below.
    key = "{}.{}.{}.{}.{}.{}".format(
        model_params['activity_id'],
        model_params['institution_id'],
        model_params['source_id'],
        model_params['experiment_id'],
        model_params['table_id'],
        model_params['grid_label'],)
    #Force load the dataset.
    print('Loading: %s'%key)
    # NOTE(review): presumably materialises the Dask-backed dataset once so the
    # per-region loop below does not re-trigger the computation -- confirm.
    ds = ds.compute()
        
    # Running sums across regions for this model run; None means "no region
    # processed yet".
    df_w = None  # flow-weighted accumulator
    df_a = None  # area-weighted accumulator
    df_nw = None  # NOTE(review): never read or reassigned in this cell -- candidate for removal
    for id_region, v in region_dict.items():     
        # Get this regions results
        df_n = get_df_map_mask(id_region,ds)
        # File-name suffix is the zero-padded two-digit region id followed by "-19";
        # the '{:02d}' format requires id_region to be an integer.
        output_filename = get_output_file_name_monthly(model_params, '%s-19'%'{:02d}'.format(id_region))

        # The unweighted per-region output is stored without the 'time' column.
        df_out = df_n.drop('time' , axis=1)
        results_dict[output_filename] = df_out
                
        # Get Area Weighted dataframe
        weighting_factor = v['area_ratio']
        # A deep copy of df_n is passed so the helper cannot modify df_n itself.
        # NOTE(review): presumably the helper scales the data columns by
        # weighting_factor -- confirm against its definition.
        df_weighted_a = get_weighted_dataframe(df_n.copy(deep=True),weighting_factor)
        if df_a is None:
            # First region: start the accumulator from this region's weighted frame.
            df_a = df_weighted_a.copy(deep=True)
        else:
            # Later regions: combine with the running total.
            df_a = get_sum_dataframes(df_a,df_weighted_a)
            
        # Get Flow Weighted results 
        weighting_factor = v['flow_ratio']
        df_weighted = get_weighted_dataframe(df_n.copy(deep=True),weighting_factor)
        if df_w is None:
            df_w = df_weighted.copy(deep=True)
        else:
            df_w = get_sum_dataframes(df_w,df_weighted)
    # Progress message: the monthly file name built with an empty suffix, with
    # '.csv' stripped.
    print('Processed %s...'%get_output_file_name_monthly(model_params,'').replace('.csv',''))       
    
    #Add weighted dataframes to output.
    # If region_dict is empty, df_w and df_a are still None when stored here.
    output_filename = get_output_file_name_monthly(model_params, "19FlowWeighted")
    flow_weighted_results_dict[output_filename] = df_w
    output_filename = get_output_file_name_monthly(model_params, "19AreaWeighted")
    area_weighted_basin_results_dict[output_filename] = df_a


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 12:50:01,470 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,476 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,482 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,483 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,485 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,485 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,486 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,488 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,488 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:50:01,491 - distributed.core - INFO - Con

Loading: LOCA2.UCSD.MIROC6.historical.day.d03


2025-08-05 12:51:50,426 - distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:33695
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 226, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 2073, in gather_dep
    response = await get_data_from_worker(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 2879, in get_data_from_worker
    response = await send_recv(
               ^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distri

Processed MIROC6_historical_r4i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MIROC6.historical.day.d03


2025-08-05 12:53:56,435 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:53:56,441 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:53:56,464 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f5a1facfe90>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f5a1faae990>, 978605.067624851)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f5a1facfe00>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fcc17ccaf60>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7fcc07da8950>, 978610.042464867)])']
connector: <aiohttp.connector.TCPConnector object at 0x7fcc17d34860>
2025-08-05 12:53:58,444 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been 

Processed MIROC6_historical_r5i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 12:54:06,411 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:54:06,438 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MIROC6.ssp585.day.d03


2025-08-05 12:56:21,443 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:56:21,463 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fc4ddc8bce0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f6118ec04a0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f60f3a44b30>, 978753.202697185)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f611919e4e0>
2025-08-05 12:56:22,418 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f5b1cad6690>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f5afe836630>, 978754.430285486)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f5b212f1520>
2

Processed MIROC6_ssp585_r4i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 12:56:37,492 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MIROC6.ssp585.day.d03


2025-08-05 12:58:53,444 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:58:53,447 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f5f4146a3c0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f5f3809a870>, 978900.189896073)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f5f4161bf80>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f5d22a03e60>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f5d12024dd0>, 978904.186870094)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f5d22703680>
2025-08-05 12:58:58,424 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f06434e2750>
U

Processed MIROC6_ssp585_r5i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 12:59:03,429 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:59:03,431 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 12:59:03,432 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f9be69d53a0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f10bf131df0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f10afa8b9b0>, 978908.899581192)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f10bd30c830>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f8328f37ce0>
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:00:48,484 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f3c7ea109b0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f3c7412eb10>, 979020.258799372)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f3c7e902450>
2025-08-05 13:00:50,436 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:00:50,437 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:00:50,470 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:00:50,496 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fd06b530ce0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler objec

Processed MPI-ESM1-2-HR_historical_r3i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f19cdab3bf0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f19c6b42570>, 979025.975879962)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f19cde168a0>
2025-08-05 13:01:02,411 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:01:02,521 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:02:49,450 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:02:49,452 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:02:49,464 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f749794fb60>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f74781dc0b0>, 979142.521782)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f74995aea20>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f0d8c809880>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f0d7a404230>, 979140.095964362)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f0d8d4af140>
2025-08-05 13:02:51,432 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been clo

Processed MPI-ESM1-2-HR_historical_r4i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f0b76569850>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f0b6c6e1a90>, 979141.663076398)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f0b6ff03ad0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f0cd7ff17c0>
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:04:41,496 - distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:39897
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 226, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 2073, in gather_dep
    response = await get_data_from_worker(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 2879, in get_data_from_worker
    response = await send_recv(
               ^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distri

Processed MPI-ESM1-2-HR_historical_r5i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:06:46,447 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:06:46,448 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:06:46,451 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:06:46,453 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:06:46,463 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:06:46,473 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:06:46,478 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f6eb8c098e0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fe1618e3fb0>
Unclosed client session
client_session: <aiohttp.client.ClientSession obj

Processed MPI-ESM1-2-HR_historical_r6i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 13:06:59,933 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:08:46,425 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f41ceb7a4b0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f41bff4c890>, 979498.219742892)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f41d265aa20>


Processed MPI-ESM1-2-HR_historical_r7i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 13:08:51,473 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:08:51,479 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:08:51,479 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:08:51,482 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:08:51,484 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:08:51,498 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:08:51,526 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fd584d13530>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7fd562dbf8f0>, 979497.21369432)])']
connector: <aiohttp.connector.TCPConnecto

Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:10:39,508 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:10:39,512 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:10:39,531 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:10:39,561 - distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:42453
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 298, in write
    raise StreamClosedError()
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 2073, in gather_dep
    response = await get_data_from_worker(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/py

Processed MPI-ESM1-2-HR_historical_r8i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 13:10:46,413 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:10:46,419 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:10:46,432 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7ffb0301df70>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fb992e33ec0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7fb98bb3cd10>, 979617.125719672)])']
connector: <aiohttp.connector.TCPConnector object at 0x7fb992f27a40>
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:12:34,462 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,461 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,463 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,465 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,465 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,467 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,474 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,483 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:12:34,493 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client

Processed MPI-ESM1-2-HR_historical_r9i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.historical.day.d03


2025-08-05 13:14:34,459 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:14:34,461 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:14:34,464 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:14:34,465 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:14:34,496 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fd00e4560c0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7fd005743830>, 979849.038792574)])']
connector: <aiohttp.connector.TCPConnector object at 0x7fd010419e80>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f76b51add00>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler objec

Processed MPI-ESM1-2-HR_historical_r10i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 13:14:47,429 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:17:05,465 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:17:05,469 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:17:05,469 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 226, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 1267, in heartbeat
    response = await retry_operation(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/

Processed MPI-ESM1-2-HR_ssp370_r3i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:19:35,531 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:19:35,533 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:19:35,533 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f527a1d18b0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f5270840530>, 980147.670705981)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f527a29c830>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f56cfd88b30>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f56bfe86570>, 980147.9415714)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f56cf67e8a0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7faf399bab70>
202

Processed MPI-ESM1-2-HR_ssp370_r4i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:22:00,442 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:22:00,451 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:22:00,452 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:22:00,453 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:22:00,454 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:22:00,462 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:22:00,486 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:22:00,536 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f5f725f7830>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.

Processed MPI-ESM1-2-HR_ssp370_r5i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 13:22:14,411 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:24:34,619 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:24:34,629 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f1b1fb885f0>
2025-08-05 13:24:39,413 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 226, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 1267, in heartbeat
    response = await retry_operation(
    

Processed MPI-ESM1-2-HR_ssp370_r6i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f8c858780b0>
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:26:59,472 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:26:59,474 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:26:59,478 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:26:59,479 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:26:59,485 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:26:59,489 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:26:59,506 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f4b414c19d0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f4b3a6b81d0>, 980588.920296365)])']
connector: <aiohttp.connector.TCPConnect

Processed MPI-ESM1-2-HR_ssp370_r7i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:29:25,430 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f654e380b30>
2025-08-05 13:29:26,476 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:29:26,504 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 226, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 1267, in heartbeat
    response = await retry_operation(
    

Processed MPI-ESM1-2-HR_ssp370_r8i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:31:56,439 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7ff081c408c0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7ff0739dbf50>, 980890.09595909)])']
connector: <aiohttp.connector.TCPConnector object at 0x7ff081c40740>
2025-08-05 13:31:57,454 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:31:57,457 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:31:57,460 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:31:57,472 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:31:57,474 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:31:57,477 - distributed.core - INFO - Connection to tcp://127.0

Processed MPI-ESM1-2-HR_ssp370_r9i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MPI-ESM1-2-HR.ssp370.day.d03


2025-08-05 13:34:32,494 - distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:43907
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 231, in read
    buffer = await read_bytes_rw(stream, buffer_nbytes)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/comm/tcp.py", line 367, in read_bytes_rw
    actual = await stream.read_into(chunk)  # type: ignore[arg-type]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/worker.py", line 2073, in gather_dep
    response = await get_data_from_worker(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.1

Processed MPI-ESM1-2-HR_ssp370_r10i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MRI-ESM2-0.historical.day.d03


2025-08-05 13:36:32,455 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:36:32,459 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:36:32,468 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:36:32,476 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f9c2ccb7950>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7faf7067aa20>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7faf68ca93d0>, 981163.890335395)])']
connector: <aiohttp.connector.TCPConnector object at 0x7faf74baa8a0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fbd8ca0e1b0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f

Processed MRI-ESM2-0_historical_r2i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


'zarr'
  external_backend_entrypoints = backends_dict_from_pkg(entrypoints_unique)
2025-08-05 13:36:45,729 - distributed.protocol.core - CRITICAL - Failed to deserialize
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/protocol/core.py", line 175, in loads
    return msgpack.loads(
           ^^^^^^^^^^^^^^
  File "msgpack/_unpacker.pyx", line 194, in msgpack._cmsgpack.unpackb
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/protocol/core.py", line 159, in _decode_default
    return merge_and_deserialize(
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/contextlib.py", line 81, in inner
    return func(*args, **kwds)
           ^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.12/site-packages/distributed/protocol/serialize.py", line 525, in merge_and_deserialize
    return deserialize(header, merged_frames, deserializers=deserializers)
           ^^^^^^^

Loading: LOCA2.UCSD.MRI-ESM2-0.historical.day.d03


2025-08-05 13:38:34,462 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:38:34,469 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:38:34,471 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f027fb9f110>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f026ed40230>, 981287.838975991)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f027fa72360>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f46c41ca630>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fb430feba70>
2025-08-05 13:38:36,438 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:38:36,439 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has be

Processed MRI-ESM2-0_historical_r3i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


2025-08-05 13:38:46,411 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MRI-ESM2-0.historical.day.d03


2025-08-05 13:40:35,439 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f3fdf54ac00>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f3fdc14eab0>, 981409.130779645)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f3fe4ac8aa0>
2025-08-05 13:40:36,435 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:40:36,463 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:40:36,473 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:40:36,479 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fded69c7e60>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler objec

Processed MRI-ESM2-0_historical_r4i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MRI-ESM2-0.historical.day.d03


2025-08-05 13:42:30,495 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f499be62f60>
2025-08-05 13:42:31,473 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:42:31,476 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:42:31,479 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:42:31,484 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:42:31,485 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7feee8a55280>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7feee0a50d70>, 981517.365992523)])']
connector: <aiohttp.connector.TCPConnector object

Processed MRI-ESM2-0_historical_r5i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MRI-ESM2-0.ssp370.day.d03


2025-08-05 13:44:59,558 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:44:59,658 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f05a3badfa0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f0592ce7b90>, 981669.511142718)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f05a5b65df0>
2025-08-05 13:45:01,436 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:45:01,451 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:45:01,461 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:45:01,477 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object

Processed MRI-ESM2-0_ssp370_r2i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MRI-ESM2-0.ssp370.day.d03


2025-08-05 13:47:24,358 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f929eec8980>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f9297588110>, 981814.92777037)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f92a4fd44d0>
2025-08-05 13:47:27,443 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:47:27,461 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:47:29,445 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:47:29,446 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:47:29,447 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:47:29,449 - distributed.core - INFO - Connection to tcp://127.0

Processed MRI-ESM2-0_ssp370_r3i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MRI-ESM2-0.ssp370.day.d03


2025-08-05 13:49:56,568 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:49:56,583 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:49:56,677 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fde85e6c320>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7fde7c679070>, 981968.789173294)])']
connector: <aiohttp.connector.TCPConnector object at 0x7fde7fb046b0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7efd60964b00>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7efd590e7b30>, 981966.59564872)])']
connector: <aiohttp.connector.TCPConnector object at 0x7efd62938c50>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f33fe2f6990>
20

Processed MRI-ESM2-0_ssp370_r4i1p1f1_...

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


  self.index_grouper = pd.Grouper(
  self.index_grouper = pd.Grouper(


Loading: LOCA2.UCSD.MRI-ESM2-0.ssp370.day.d03


2025-08-05 13:52:21,431 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f383092c320>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f381fcba990>, 982112.487675132)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f3834355a60>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f6c7f5359a0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x7f6c6fa77590>, 982108.217174798)])']
connector: <aiohttp.connector.TCPConnector object at 0x7f6c7f6041a0>
2025-08-05 13:52:22,441 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:52:22,442 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been closed.
2025-08-05 13:52:22,444 - distributed.core - INFO - Connection to tcp://127.0.0.1:45569 has been 

Processed MRI-ESM2-0_ssp370_r5i1p1f1_...


In [18]:
# Write all results to a single zip archive in the output folder.
zip_path = os.path.join(output_folder, file_zip)

# Rolling averages of the flow- and area-weighted results (window argument 30,
# output label '30yrAve').
dict_rolling_flow_weighted = get_monthly_rolling_ave(flow_weighted_results_dict, 30, '30yrAve')
dict_rolling_area_weighted = get_monthly_rolling_ave(area_weighted_basin_results_dict, 30, '30yrAve')


def _write_csv(zf, arcname, frame):
    """Serialize `frame` to CSV (without the index) and store it in `zf` as `arcname`."""
    text_stream = StringIO()
    frame.to_csv(text_stream, index=False)
    zf.writestr(arcname, text_stream.getvalue())


def _write_weighted(zf, directory, weighted_dict):
    """Write each weighted frame twice: as-is under `<directory>/Raw/` and
    without its 'time' column under `<directory>/`.

    The 'time' column is dropped on a copy so the frames stored in
    `weighted_dict` are left untouched; this keeps the cell safe to re-run
    (an in-place drop raises KeyError on the second execution).
    """
    for name, frame in weighted_dict.items():
        _write_csv(zf, directory + '/Raw/' + name, frame)
        _write_csv(zf, directory + '/' + name, frame.drop(columns='time'))


# Archive member names use '/' separators regardless of the host OS.
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    # Individual (unweighted) results, one CSV per key.
    for k, v in results_dict.items():
        _write_csv(zf, dir_individual + '/' + k, v)

    # Flow- and area-weighted results (raw and 'time'-less variants).
    _write_weighted(zf, dir_flow_weighted, flow_weighted_results_dict)
    _write_weighted(zf, dir_area_weighted, area_weighted_basin_results_dict)

    # Rolling-average results for each weighting.
    for k, v in dict_rolling_area_weighted.items():
        _write_csv(zf, dir_area_weighted_rolling + '/' + k, v)

    for k, v in dict_rolling_flow_weighted.items():
        _write_csv(zf, dir_flow_weighted_rolling + '/' + k, v)
    

In [19]:
# Close `client` now that the export cell above has run.
# NOTE(review): presumably this is the Dask distributed client created earlier
# in the notebook — confirm; closing it disconnects this kernel from the cluster.
client.close()