#### This notebook takes raw ERA5 data and performs the following operations using Dask: 
1. Concatenates the separate ERA5 files over a chosen region
2. Saves the concatenated data
3. Uses the concatenated data to compute a daily climatology


In [2]:
import os, gc; from os.path import exists
# Work from the project's scripts directory (presumably so that local helper
# modules such as `help_funcs` can be imported -- TODO confirm).
os.chdir('/vortexfs1/home/anthony.meza/CTWPC/scripts')
main_dir = "/vortexfs1/home/anthony.meza/CTWPC"


# Path builders: each returns a project sub-directory (ending in "/") followed
# by the optional relative name `x`. Called with no argument they return the
# directory itself.
def plotsdir(x=""):
    """Return the path of `x` inside the project's plots directory."""
    return main_dir + "/plots/" + x


def GLORYS_dir(x=""):
    """Return the path of `x` inside the GLORYS_data directory.

    The directory part ends with "/" like the other path builders; previously
    the separator was missing, so `x` was glued onto the directory name.
    """
    return main_dir + "/GLORYS_data/" + x


def GLORYS_data_dir(x=""):
    """Return the path of `x` inside the GLORYS_processed directory."""
    return main_dir + "/GLORYS_processed/" + x


def ERA5_data_dir(x=""):
    """Return the path of `x` inside the ERA5_data directory."""
    return main_dir + "/ERA5_data/" + x

In [9]:
from help_funcs import * 
import xarray as xr
import pandas as pd
import netCDF4 as nc
from pathlib import Path
from natsort import natsorted
import matplotlib.pyplot as plt
import dask_labextension

In [None]:
# Start a Dask cluster backed by SLURM batch jobs and attach a client to it.
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Resources requested for every SLURM job (a single worker process per job).
cluster = SLURMCluster(
    queue='compute',
    walltime='02:00:00',
    interface='ib0',
    cores=36,
    processes=1,
    memory='192GB',
)
print(cluster.job_script())  # show the batch script that will be submitted
cluster.scale(jobs=4)        # ask for four such jobs

client = Client(cluster)
client

In [5]:
# Recursively collect every NetCDF file (".nc", any letter case) under each
# variable's directory and put the paths in natural sort order. The order
# matters because the files are later concatenated in the order given.
# Note: the name `results_tcvw` (sic) is kept because later cells refer to it.
results_tcvw = natsorted(
    map(str, Path("/vortexfs1/share/cdx/hseo/Data/ERA5/NEP/tcwv").rglob("*.[nN][cC]"))
)
results_tp = natsorted(
    map(str, Path("/vortexfs1/share/cdx/hseo/Data/ERA5/NEP/tp").rglob("*.[nN][cC]"))
)
results_z500 = natsorted(
    map(str, Path("/vortexfs1/share/cdx/hseo/Data/ERA5/NEP/z500").rglob("*.[nN][cC]"))
)

In [6]:
def _preprocess_ERA5(ds):
    """Crop one dataset to the study region and reduce it to daily means.

    Steps, in order: select latitude 60 to -2, select longitude -150 to -75,
    average over 1-day bins along `time`, then keep 1993 onwards.

    Parameters
    ----------
    ds : dataset-like object exposing `sel`, `resample` and `mean`
        (an xarray Dataset when used as an `open_mfdataset` preprocess hook).

    Returns
    -------
    The cropped, daily-averaged dataset.
    """
    # The latitude slice runs from 60 down to -2, which presumably relies on
    # the latitude coordinate being stored north-to-south -- TODO confirm.
    lat_band = ds.sel(latitude=slice(60, -2))
    region = lat_band.sel(longitude=slice(-150, -75))
    daily_mean = region.resample(time="1D").mean()
    # Open-ended slice: everything from the start of 1993 to the end.
    return daily_mean.sel(time=slice("1993", None))

def open_mfdataset(x):
    """Open the files listed in `x` as one dataset concatenated along "time".

    Every file is passed through `_preprocess_ERA5` before being combined.

    Parameters
    ----------
    x : list of file paths, already in the order they should be concatenated.

    Returns
    -------
    The combined dataset returned by `xr.open_mfdataset`.
    """
    return xr.open_mfdataset(
        x,
        # "nested" joins the files in the order given along `concat_dim`.
        combine="nested",
        concat_dim="time",
        preprocess=_preprocess_ERA5,
        data_vars="minimal",
        coords="minimal",
        compat="override",
        # -1 means a single chunk along that dimension.
        chunks={"latitude": -1, "longitude": -1, "time": -1},
        parallel=True,
        engine="netcdf4",
    )

# Open the three ERA5 variables. Cropping and the hourly -> daily averaging
# are done per file by the preprocess hook inside `open_mfdataset`.
ds_tcwv, ds_tp, ds_z500 = (
    open_mfdataset(paths) for paths in (results_tcvw, results_tp, results_z500)
)

# Total precipitation requires a change of units: m/hour -> mm/day
# (x 24 hours per day, x 1000 mm per m).
ds_tp = ds_tp * 24e3

era5_daily = xr.merge([ds_tcwv, ds_tp, ds_z500])

# Save the merged daily dataset before any further processing.
# NOTE(review): this ERA5 product is written through GLORYS_data_dir (the
# GLORYS_processed directory) while the climatology goes to ERA5_data_dir --
# confirm this location is intended.
era5_daily.to_netcdf(
    GLORYS_data_dir("ERA5_NE_PAC_daily.nc"),
    engine="netcdf4",
    format="NETCDF4",
    mode="w",
    compute=True,
)

In [7]:
# Run a garbage-collection pass, then lazily reopen the saved daily file.
# Each chunk covers the full longitude/latitude extent and 100 time steps.
gc.collect()
era5_daily = xr.open_mfdataset(
    GLORYS_data_dir("ERA5_NE_PAC_daily.nc"),
    engine="netcdf4",
    chunks={"time": 100, "latitude": -1, "longitude": -1},
    parallel=True,
    compat="override",
    coords="minimal",
    data_vars="minimal",
)

In [8]:
# Switch to a 'noleap' calendar so leap days do not take part in the
# day-of-year grouping.
era5_daily = era5_daily.convert_calendar('noleap')

# Daily climatology: mean over all years for each day of the year.
era5_climatology = era5_daily.groupby("time.dayofyear").mean(dim="time")

era5_climatology.to_netcdf(
    ERA5_data_dir("ERA5_Daily_Climatology.nc"),
    engine="netcdf4",
    format="NETCDF4",
    mode="w",
    compute=True,
)

In [21]:
# Read the stored daily climatology back and write it out again under the
# "SmoothClimatology" file name.
# NOTE(review): nothing is applied to the data between the read and the write,
# so both files hold the same fields -- confirm whether a smoothing step
# is missing here.
era5_climatology = xr.open_dataset(ERA5_data_dir("ERA5_Daily_Climatology.nc"))
era5_climatology.to_netcdf(
    ERA5_data_dir("ERA5_Daily_SmoothClimatology.nc"),
    engine="netcdf4",
    format="NETCDF4",
    mode="w",
    compute=True,
)