In [1]:
import xarray as xr
import numpy as np
from distributed import Client
import logging
import flox  # make sure its available
import flox.xarray
from tqdm.contrib.itertools import product
from tqdm.notebook import tqdm
# client = Client(n_workers=20, silence_logs=logging.ERROR)
# client

Open the ARCO ERA5 dataset lazily from the public GCS bucket with anonymous access (`chunks=None`, so no dask chunking is applied)

In [3]:
# Open the public ARCO ERA5 Zarr store. `chunks=None` is passed through to
# xarray, and the bucket is accessed anonymously (no credentials).
era5_store_url = 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'
era5 = xr.open_zarr(era5_store_url, chunks=None, storage_options={'token': 'anon'})

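Subset the dataset to the 30-year period (1989-12-22 to 2020-01-10, i.e. 1990–2019 padded by 10 days on each side) and keep only the 6-hourly time steps (hours 0, 6, 12 and 18)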

In [4]:
# Cut the record down to the padded 30-year window first ...
era5_30yr = era5.sel(time=slice('1989-12-22', '2020-01-10'))
# ... then keep only the time steps whose hour is 0, 6, 12 or 18.
is_six_hourly = era5_30yr.time.dt.hour.isin([0, 6, 12, 18])
era5_30yr = era5_30yr.sel(time=is_six_hourly)

Create a linearly decaying (triangular) weight DataArray for the ±10-day rolling window

In [5]:
# Rolling-window configuration.
half_window_days = 10
# NOTE(review): not read in this cell; the climatology cell reassigns
# `percentile` to .15 before it is used.
percentile = 15

# Triangular kernel of length 2 * half_window_days + 1: a linear ramp from 0 up
# to 1, mirrored back down without repeating the peak. Because the ramp starts
# at 0, the two end points of the window carry zero weight.
rising_edge = np.linspace(0, 1, half_window_days + 1)
falling_edge = rising_edge[-2::-1]
weights = np.concatenate([rising_edge, falling_edge])
# Normalise so the weights sum to 1 (the dot product is then a weighted mean).
weights = weights / weights.sum()
weight_da = xr.DataArray(weights, dims=['window'])

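Process the dataset one year at a time: compute the weighted rolling mean of 2m temperature for each 6-hourly time of day and save each year's result to an interim netCDF file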

In [None]:
# For each year 1990-2019, compute the weighted rolling mean of 2m temperature
# separately for every 6-hourly time of day and cache it as an interim netCDF.
for year in tqdm(range(1990, 2020)):
    # Pad the year by 10 days on both sides so the centred window is complete
    # for every time step that falls inside the year itself.
    start_date = f"{year-1}-12-22"  # 10 days before the start of the year
    end_date = f"{year+1}-01-10"    # 10 days after the end of the year
    padded_temperature = era5['2m_temperature'].sel(time=slice(start_date, end_date))

    hour_group = []
    for hour in [0, 6, 12, 18]:
        # Load one time of day at a time. Each hour is processed independently,
        # so materialising the whole padded year up front only raises peak
        # memory (roughly 4x) without changing the result.
        hour_data = padded_temperature.sel(
            time=padded_temperature.time.dt.hour == hour
        ).compute()

        # One sample per day at this hour, so a window of len(weights) steps
        # spans +/- half_window_days days around each time step.
        year_rolling = hour_data.rolling(time=len(weights), center=True)
        # NOTE(review): `automatic_rechunk` is kept from the original; the data
        # is already in memory here, so confirm the option is still wanted.
        year_rolling_weighted = year_rolling.construct(
            'window', sliding_window_view_kwargs={"automatic_rechunk": True}
        ).dot(weight_da)
        hour_group.append(year_rolling_weighted)

        # Release the raw hourly data before loading the next hour.
        del hour_data, year_rolling

    # The pieces are concatenated hour by hour, so `time` is not monotonic in
    # the saved file; the padding days of the adjacent years are kept as well.
    hour_grouped_weighted_ds = xr.concat(hour_group, dim='time')
    hour_grouped_weighted_ds.to_netcdf(f"/home/taylor/data/hour_era5_30yr_rolling_2m_temperature_weighted_{year}.nc")
    

  0%|          | 0/30 [00:00<?, ?it/s]

: 

In [12]:
# Build the climatology: for every (day of year, hour) pair, take the
# `percentile` quantile across the 30 yearly weighted-rolling-mean files.
percentile = .15  # quantile level passed to .quantile(), as a fraction
climatology_list = []

# Open each yearly file once and reuse it for all 1464 (day of year, hour)
# combinations instead of re-opening it inside the innermost loop. The files
# are closed explicitly once the loop has finished (or failed).
yearly_datasets = []
try:
    yearly_temperature = []
    for year in range(1990, 2020):
        rolling_weights_ds = xr.open_dataset(f'/home/taylor/data/hour_era5_30yr_rolling_2m_temperature_weighted_{year}.nc')
        yearly_datasets.append(rolling_weights_ds)
        # The interim files hold the data under '__xarray_dataarray_variable__'
        # (presumably because it was saved as an unnamed DataArray); restore
        # the variable name here.
        temperature = rolling_weights_ds['__xarray_dataarray_variable__'].rename('2m_temperature')
        # Keep only time steps of the file's own year, dropping the padding
        # days that belong to the adjacent years.
        yearly_temperature.append(temperature.sel(time=temperature.time.dt.year == year))

    for doy, hour in product(range(1, 367), [0, 6, 12, 18]):
        single_year_list = []
        for temperature in yearly_temperature:
            timestamps = temperature.time.dt
            data_array = temperature.sel(
                time=(timestamps.dayofyear == doy) & (timestamps.hour == hour)
            )
            # remove any years with no data, such as w/ day 366
            if len(data_array.time) != 0:
                single_year_list.append(data_array)
        output = xr.concat(single_year_list, dim='time')
        output_quantile = output.groupby(["time.dayofyear", "time.hour"]).quantile(percentile)
        climatology_list.append(output_quantile)
        del output, output_quantile
finally:
    for rolling_weights_ds in yearly_datasets:
        rolling_weights_ds.close()

climatology = xr.combine_by_coords(climatology_list)



  0%|          | 0/1464 [00:00<?, ?it/s]

In [13]:
# Persist the combined climatology as netCDF in the current working directory.
climatology_path = 'era5_freeze_climatology.nc'
climatology.to_netcdf(climatology_path)