# AWS ERA5 data

- Remaking ERA5 from the AWS Zarr store


In [None]:
# Notebook based on zflamig's: https://nbviewer.jupyter.org/github/zflamig/dask-era5/blob/main/notebook/era5_fargate_dask.ipynb
import xarray as xr
import fsspec
import dask
import s3fs
import numpy as np
# Use xarray's HTML display style so datasets render nicely in the notebook.
xr.set_options(display_style="html")


In [None]:
from dask_gateway import Gateway
from dask.distributed import Client

# Connect to the Dask Gateway using its default (environment-provided) settings.
gateway = Gateway()
# Request a new Dask cluster from the gateway.
cluster = gateway.new_cluster()
# Let the cluster scale adaptively between 1 and 70 workers.
cluster.adapt(minimum=1, maximum=70)
# Last expression of the cell: display the cluster object in the notebook.
cluster

In [None]:
def fix_accum_var_dims(ds, var):
    """Keep only ``var`` and rename its dimensions to standard names.

    Some variables (e.g. precipitation) come with extra time-bounds
    variables; selecting only ``var`` drops them so that the result can be
    merged with other variables.  The time dimension is stored as either
    ``time0`` or ``time1`` depending on the variable, so it is detected from
    the selected dataset instead of from a fixed list of variable names.

    Parameters
    ----------
    ds : dataset-like
        Object supporting ``ds[[var]]``, ``.dims`` and ``.rename(mapping)``
        (an ``xarray.Dataset`` when called from ``open_era5_range``).
    var : str
        Name of the variable to keep.

    Returns
    -------
    dataset-like
        The single-variable dataset with ``time0``/``time1`` renamed to
        ``time``, ``lat`` to ``latitude`` and ``lon`` to ``longitude``.  If
        the time dimension cannot be determined, a warning is printed and the
        single-variable dataset is returned without any renaming.
    """
    # Select variable of interest (drops dims that are not linked to it).
    ds = ds[[var]]

    # Exactly one of the two known time axes is expected after the selection.
    time_dims = [name for name in ('time0', 'time1') if name in ds.dims]
    if len(time_dims) != 1:
        print("Warning, Haven't seen {var} varible yet! Time renaming might not work.".format(var=var))
        return ds

    # Only rename what is actually present: rename() rejects unknown names.
    renames = {time_dims[0]: 'time'}
    for old, new in (('lat', 'latitude'), ('lon', 'longitude')):
        if old in ds.dims:
            renames[old] = new

    return ds.rename(renames)

@dask.delayed
def s3open(path):
    """Create a key-value mapping onto ``path`` in S3 using anonymous access.

    The function is wrapped in ``dask.delayed``, so calling it returns a
    delayed object; the filesystem and mapping are only built when that
    object is computed.
    """
    pool_settings = {'max_pool_connections': 20}
    anonymous_fs = s3fs.S3FileSystem(
        anon=True,
        default_fill_cache=False,
        config_kwargs=pool_settings,
    )
    store = s3fs.S3Map(path, s3=anonymous_fs)
    return store


def open_era5_range(start_year, end_year, variables):
    """Lazily open monthly ERA5 Zarr stores in S3 and merge the variables.

    For every variable, all twelve months of every year from ``start_year``
    to ``end_year`` (inclusive) are opened and concatenated along that
    variable's time dimension, then the dimensions are renamed with
    ``fix_accum_var_dims`` and all variables are merged into one dataset.

    Parameters
    ----------
    start_year : int
        First year to load.
    end_year : int
        Last year to load (inclusive).
    variables : list of str
        Variable names as they appear in the ``era5-pds`` Zarr store.

    Returns
    -------
    xarray.Dataset
        Merged dataset containing the requested variables.
    """
    file_pattern = 'era5-pds/zarr/{year}/{month}/data/{var}.zarr/'

    # Variables whose time axis is stored as 'time1' rather than 'time0'.
    # Kept in agreement with the 'time1' branch of fix_accum_var_dims;
    # concatenating such a variable along 'time0' would add a spurious
    # dimension instead of joining the months.
    # NOTE(review): other variables in the store may also use 'time1' - confirm.
    time1_vars = (
        'precipitation_amount_1hour_Accumulation',
        'integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation',
    )

    years = range(start_year, end_year + 1)
    months = ['{:02d}'.format(month) for month in range(1, 13)]

    datasets = []
    for var in variables:
        print(var)

        # One delayed S3 mapping per (year, month) store, in time order.
        files_mapper = [
            s3open(file_pattern.format(year=year, month=month, var=var))
            for year in years
            for month in months
        ]

        # Look up the correct time dimension by variable name.
        concat_dim = 'time1' if var in time1_vars else 'time0'

        # Lazy load: nothing is read beyond metadata until computation.
        ds = xr.open_mfdataset(files_mapper, engine='zarr',
                               concat_dim=concat_dim, combine='nested',
                               coords='minimal', compat='override', parallel=True)

        # Fix dimension names so the variables can be merged.
        datasets.append(fix_accum_var_dims(ds, var))

    return xr.merge(datasets)


In [None]:
# Anonymous S3 client with the AWS region set for accessing the ERA5 data
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name':'us-east-1'})
data_var = s3.ls('era5-pds/zarr/2021/04/data/')

# Slice that drops the first 27 characters (the length of the listed prefix)
# and the last 5 characters (the length of '.zarr') from each listing entry.
name_span = slice(27, -5)
for dvar in data_var:
    print(dvar[name_span])

In [None]:
%%time
# For every entry listed in data_var: open 1979-2020 lazily, reduce to monthly
# means, coarsen spatially and write one NetCDF file per variable.
for dvar in data_var:
    # Drop the first 27 characters (length of 'era5-pds/zarr/2021/04/data/')
    # and the last 5 (length of '.zarr') to get the bare variable name.
    var = dvar[27:-5]
    ds = open_era5_range(1979, 2020, [var])
    # Resample to monthly means.
    # NOTE(review): relies on open_era5_range returning 'time', 'latitude'
    # and 'longitude' dimensions for this variable - confirm for every entry.
    ds_month  = ds.resample(time='1M').mean(keep_attrs=True)
    # Average 4x4 blocks of grid cells, trimming any remainder at the edges
    # (described as 1 deg by the original author; presumably a 0.25 deg grid).
    ds_month_deg = ds_month.coarsen(latitude=4,longitude=4,boundary="trim").mean(keep_attrs=True)
    # NOTE(review): there is no separator between '1deg' and the variable
    # name in the output file name - confirm this is intended.
    ds_month_deg.to_netcdf('./../../data/era5/era5_monthly_1deg'+var+'.nc')

In [None]:
# Report the size of the dataset currently bound to `ds` (in GB).
print('ds size in GB {:0.2f}\n'.format(ds.nbytes / 1e9))
# Call info() so the summary is actually printed; the bare attribute
# `ds.info` only references the bound method without running it.
ds.info()

In [None]:
# Anonymous S3 client with the AWS region set for accessing the ERA5 data
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name':'us-east-1'})
data_var = s3.ls('era5-pds/zarr/2021/04/data/')

# Show the first entry with its first 27 and last 5 characters removed
# (the lengths of the listed prefix and of '.zarr').
first_entry = data_var[0]
first_entry[27:-5]