In [1]:
import xarray as xr
import numpy as np
from google.cloud import storage
import fsspec
from kerchunk.hdf import SingleHdf5ToZarr 
from kerchunk.combine import MultiZarrToZarr
import ujson
from dask.distributed import Client, LocalCluster, progress
import matplotlib.pyplot as plt
import pandas as pd
import dask
# Point dask's `temporary_directory` setting at /mnt/disks/data/
# (presumably a larger scratch disk for spilled data -- confirm it exists on the host).
dask.config.set({'temporary_directory': '/mnt/disks/data/'})

<dask.config.set at 0x7f6f64a7b990>

In [2]:
# Filesystem handles: GCS with credentials (anon=False) and the default
# filesystem for the empty protocol (used here under the name `fs_local`).
fs_read = fsspec.filesystem('gcs', skip_instance_cache=True, anon=False)
fs_local = fsspec.filesystem('')

# Directory for the reference JSON files, plus the PANG* files already in it.
json_dir = 'assets/json/'
json_list = fs_local.glob(f'{json_dir}PANG*_.json')

# Keyword arguments forwarded to `fs.open(...)` in generate_json_from_grap_nc.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

def convert_longitude_to_360(longitude):
    """Wrap longitude value(s) into the half-open range [0, 360).

    Parameters
    ----------
    longitude : number or array-like supporting ``%``
        Longitude(s) in degrees, e.g. on a -180..180 convention.

    Returns
    -------
    Same kind of object as ``longitude`` with every value taken modulo 360.
    """
    degrees_per_circle = 360
    wrapped = longitude % degrees_per_circle
    return wrapped

def generate_json_from_grap_nc(u, fs, fs_out):
    """Write a kerchunk reference JSON for a single HDF5-backed file.

    The output name is derived from the input path: the model is the text
    before the first ``_`` in the second ``/``-separated path component, and
    the date string is the fourth ``_``-separated token of the file name.
    The result is written to ``{json_dir}{model}_{date_string}_.json``.

    Parameters
    ----------
    u : str
        Path/URL of the source file; must follow the layout described above.
    fs : filesystem object
        Used to open ``u`` with the module-level ``so`` keyword arguments.
    fs_out : filesystem object
        Used to open the output JSON for binary writing.

    Notes
    -----
    Relies on the module-level names ``so`` and ``json_dir``.
    """
    with fs.open(u, **so) as source_file:
        reference_builder = SingleHdf5ToZarr(source_file, u, inline_threshold=300)

        # Separate the file path to create a unique name for each JSON.
        path_parts = u.split('/')
        model = path_parts[1].partition('_')[0]
        date_string = path_parts[-1].split('_')[3]
        outf = f'{json_dir}{model}_{date_string}_.json'
        print(outf)

        # Translate while the source file is still open, then serialise.
        with fs_out.open(outf, 'wb') as json_file:
            references = reference_builder.translate()
            payload = ujson.dumps(references).encode()
            json_file.write(payload)

# seasonal aggregation functions for max, min, and mean
def _mask_to_jja(df):
    """Return ``df`` with every row outside June-August replaced by NaN.

    ``DataFrame.where`` requires an array condition with exactly the same
    shape as the frame, so for 2-D input the per-row month mask is repeated
    across all columns. For a Series the 1-D mask is used as is.
    """
    in_summer = np.asarray(df.index.month.isin([6, 7, 8]))
    if df.ndim == 2:
        in_summer = np.repeat(in_summer[:, np.newaxis], df.shape[1], axis=1)
    return df.where(in_summer)


def seasonal_subset_max(df):
    """Maximum over the June-August rows of ``df``.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Must have a datetime-like index exposing ``.month``.

    Returns
    -------
    Scalar for a Series, or a per-column Series for a DataFrame.
    """
    return _mask_to_jja(df).max()


def seasonal_subset_min(df):
    """Minimum over the June-August rows of ``df``.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Must have a datetime-like index exposing ``.month``.

    Returns
    -------
    Scalar for a Series, or a per-column Series for a DataFrame.
    """
    return _mask_to_jja(df).min()


def seasonal_subset_mean(df):
    """Mean over the June-August rows of ``df``.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Must have a datetime-like index exposing ``.month``.

    Returns
    -------
    Scalar for a Series, or a per-column Series for a DataFrame.
    """
    return _mask_to_jja(df).mean()

def is_jja(month):
    """Flag months that fall in June, July or August.

    Parameters
    ----------
    month : int or array-like of month numbers
        Anything supporting element-wise ``>=``, ``<=`` and ``&``.

    Returns
    -------
    Boolean (or boolean array) that is true where ``6 <= month <= 8``.
    """
    june_or_later = month >= 6
    august_or_earlier = month <= 8
    return june_or_later & august_or_earlier

def is_6_hourly(hour):
    """Flag hours equal to 0, 6, 12 or 18.

    Parameters
    ----------
    hour : int or array-like of hour values
        Anything supporting element-wise ``==`` and ``|``.

    Returns
    -------
    Boolean (or boolean array) that is true only for the four listed hours.
    """
    matches = hour == 0
    # OR in the remaining hours one at a time, left to right.
    for synoptic_hour in (6, 12, 18):
        matches = matches | (hour == synoptic_hour)
    return matches


In [3]:
# Local dask cluster: 8 workers x 2 threads each, 6 GiB memory limit per worker.
cluster = LocalCluster(n_workers=8, threads_per_worker=2, memory_limit='6GiB')

# Attach a client to the cluster created above.
client = Client(cluster)

# KSEA Observations Generation

In [None]:
# Hourly-mean `tmpc` from KSEA_event.csv, indexed by the `valid` timestamp.
ksea_event_df = (
    pd.read_csv('assets/data/KSEA_event.csv')
    .dropna()  # discard rows with a missing value in any column
    .assign(
        valid=lambda obs: pd.to_datetime(obs['valid']),
        tmpc=lambda obs: obs['tmpc'].astype(float),
    )
    .set_index('valid')
    .resample('h')
    .mean(numeric_only=True)[['tmpc']]
)

# KSEA and Other Observations Climatology

In [None]:
# One (initially empty) record per station identifier.
point_obs_dict = {station_id: {} for station_id in ('KSEA', 'KBLI', 'CYVR', 'CWLY')}

for station_id, record in point_obs_dict.items():
    # Load the station CSV and reduce it to hourly means indexed by `valid`.
    record['data'] = (
        pd.read_csv(f'assets/data/{station_id}.csv')
        .dropna()
        .assign(
            valid=lambda obs: pd.to_datetime(obs['valid']),
            tmpc=lambda obs: obs['tmpc'].astype(float),
        )
        .set_index('valid')
        .resample('h')
        .mean(numeric_only=True)
    )

    # June-August hourly temperatures only.
    hourly_tmpc = record['data']['tmpc']
    subset_summer_df = hourly_tmpc[hourly_tmpc.index.month.isin([6, 7, 8])]

    # 85th percentiles of the daily max / min / mean and of the hourly values;
    # nanpercentile skips the NaN days that daily resampling inserts.
    daily_summer = subset_summer_df.resample('D')
    record['max_temp_85th_percentile'] = np.nanpercentile(daily_summer.max(), 85)
    record['min_temp_85th_percentile'] = np.nanpercentile(daily_summer.min(), 85)
    record['mean_temp_85th_percentile'] = np.nanpercentile(daily_summer.mean(), 85)
    record['temp_85th_percentile'] = np.nanpercentile(subset_summer_df, 85)

# Load and Subset ARCO ERA5

In [4]:
# Open the public ARCO ERA5 Zarr store lazily, with anonymous access.
era5 = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2",
    storage_options={'token': 'anon'},
    chunks='auto',
)

In [None]:
# Restrict 2 m temperature to 1989-2019; 2020 onwards is training data, so it is avoided.
era5_climatology = era5[['2m_temperature']].sel(time=slice('1989', '2019'))

# Keep June-August only (summer / heat wave criteria) ...
jja_mask = is_jja(era5_climatology['time.month'])
era5_subset_jja = era5_climatology.sel(time=jja_mask)

# ... and within that only the 00, 06, 12 and 18Z steps, to match the model outputs.
synoptic_mask = is_6_hourly(era5_subset_jja['time.hour'])
era5_subset_hours = era5_subset_jja.sel(time=synoptic_mask)

# Save the subset to a Zarr store (overwriting any existing one); this takes a few minutes.
era5_subset_hours.to_zarr('/home/taylor/data/era5_subset_hours.zarr', mode='w')

In [None]:
# Reopen the saved JJA / 6-hourly subset lazily from the local Zarr store.
era5_subset = xr.open_zarr('/home/taylor/data/era5_subset_hours.zarr', chunks='auto')

In [8]:
# Mean over all time steps sharing the same hour of day.
era5_6hourly_climatology = (
    era5_subset
    .groupby('time.hour')
    .mean()
)

# Save the climatology to a Zarr store (overwriting any existing one); this also takes a few minutes.
era5_6hourly_climatology.to_zarr(
    '/home/taylor/data/era5_2m_temperature_mean_6hourly_1989-2019.zarr',
    mode='w',
)

<xarray.backends.zarr.ZarrStore at 0x7f6ecf658040>

In [None]:
# 85th percentile over time of the JJA 00, 06, 12, 18Z data, at every grid point, for 1989-2019.
# NOTE(review): xarray's quantile on dask-backed data has required the reduced
# dimension to be a single chunk in some versions -- confirm this runs with the
# chunking produced by chunks='auto', or rechunk along `time` first.
era5_85th_percentile = era5_subset.quantile(0.85, dim='time')

# Save the percentile field to a Zarr store (overwriting any existing one); this takes a few minutes as well.
era5_85th_percentile.to_zarr(
    '/home/taylor/data/era5_2m_temperature_85th_percentile_1989-2019.zarr',
    mode='w',
)