# AWS ERA5 data

- Rebuilding a monthly-mean, spatially coarsened copy of ERA5 from the AWS Zarr store


In [None]:
# Notebook based on zflamig's: https://nbviewer.jupyter.org/github/zflamig/dask-era5/blob/main/notebook/era5_fargate_dask.ipynb
import xarray as xr
import fsspec
import dask
import s3fs
import os

import numpy as np
xr.set_options(display_style="html")  # use the HTML display style when showing xarray objects

- Open the cluster in the dashboard and, once you know its IP address, connect to it directly.
- Not sure why, but the Dask gateway did not work on this hub.

In [None]:
# Dask Gateway route, kept for reference but disabled:
#from dask_gateway import Gateway
#from dask.distributed import Client
#gateway = Gateway()
#cluster = gateway.new_cluster()
#cluster.adapt(minimum=1, maximum=70)
#cluster

# Connect straight to an already-running scheduler instead.
from dask.distributed import Client

scheduler_address = "tcp://127.0.0.1:34163"
client = Client(scheduler_address)

In [None]:
def fix_accum_var_dims(ds, var):
    """Reduce ``ds`` to the single variable ``var`` and standardise its dimension names.

    Some variables (e.g. precipitation) come with extra time-bounds variables;
    selecting only ``var`` drops everything not linked to it, which allows the
    result to be merged with other variables.

    The time axis is stored as either ``time0`` or ``time1`` depending on the
    variable. Rather than looking the variable up in a hard-coded list (which
    left any unlisted variable un-renamed), the renames are derived from the
    dimensions that are actually present after the selection.

    Parameters
    ----------
    ds : dataset-like
        Object supporting ``ds[[var]]``, ``.dims`` and ``.rename(mapping)``
        (an ``xarray.Dataset`` in this notebook).
    var : str
        Name of the data variable to keep.

    Returns
    -------
    The single-variable dataset with ``time0``/``time1`` renamed to ``time``,
    ``lat`` to ``latitude`` and ``lon`` to ``longitude`` wherever those
    dimensions exist.
    """
    # Select variable of interest (drops dims that are not linked to current variable)
    ds = ds[[var]]

    standard_names = {'time0': 'time', 'time1': 'time',
                      'lat': 'latitude', 'lon': 'longitude'}
    present = set(ds.dims)
    renames = {old: new for old, new in standard_names.items() if old in present}

    if 'time0' not in renames and 'time1' not in renames:
        # Nothing recognisable as the time axis: tell the user instead of failing later in silence.
        print("Warning, no 'time0' or 'time1' dimension found for {var}; "
              "time axis was not renamed.".format(var=var))

    return ds.rename(renames)

@dask.delayed
def s3open(path):
    """Build a mapping over the S3 location ``path`` using anonymous access.

    Decorated with ``dask.delayed``, so calling it returns a delayed object
    rather than running the body immediately.
    """
    anonymous_fs = s3fs.S3FileSystem(
        anon=True,
        default_fill_cache=False,
        config_kwargs={'max_pool_connections': 20},
    )
    store = s3fs.S3Map(path, s3=anonymous_fs)
    return store


def open_era5_range(start_year, end_year, variables):
    """Open the monthly ERA5 Zarr stores on S3 for a span of years and merge the variables.

    Parameters
    ----------
    start_year, end_year : int
        First and last year to open (inclusive); all twelve months of each
        year are requested.
    variables : list of str
        Names of the per-variable Zarr stores to open.

    Returns
    -------
    The result of ``xr.merge`` over one dataset per variable, each with its
    dimension names fixed by ``fix_accum_var_dims``.
    """
    file_pattern = 'era5-pds/zarr/{year}/{month}/data/{var}.zarr/'

    # Variables concatenated along 'time1'; every other variable uses 'time0'.
    time1_variables = (
        'precipitation_amount_1hour_Accumulation',
        'integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation',
        'air_temperature_at_2_metres_1hour_Maximum',
        'air_temperature_at_2_metres_1hour_Minimum',
    )

    months = ['{:02d}'.format(month_number) for month_number in range(1, 13)]
    years = np.arange(start_year, end_year + 1)

    per_variable = []
    for var in variables:
        print('opening',var)

        # One delayed S3 mapping per (year, month), in chronological order
        stores = []
        for year in years:
            for month in months:
                stores.append(s3open(file_pattern.format(year=year, month=month, var=var)))

        concat_dim = 'time1' if var in time1_variables else 'time0'

        # Lazy load
        opened = xr.open_mfdataset(
            stores,
            engine='zarr',
            concat_dim=concat_dim,
            combine='nested',
            coords='minimal',
            compat='override',
            parallel=True,
        )

        # Fix dimension names
        per_variable.append(fix_accum_var_dims(opened, var))

    return xr.merge(per_variable)


In [None]:
# List what is stored under one sample month of the ERA5 bucket, reading
# anonymously with the client region set to us-east-1.
aws_client_kwargs = {'region_name':'us-east-1'}
s3 = s3fs.S3FileSystem(anon=True, client_kwargs=aws_client_kwargs)
data_var = s3.ls('era5-pds/zarr/2021/04/data/')

In [None]:
# Short variable names: drop the first 27 characters (the length of
# 'era5-pds/zarr/2021/04/data/') and the last 5 (the length of '.zarr').
# Assumes every listing entry has exactly that prefix and suffix.
ddvar = []
for dvar in data_var:
    short_name = dvar[27:-5]
    print(short_name)
    ddvar.append(short_name)

- It took me a while to get this to run.
- The cluster kept crashing until I broke the work down into smaller and smaller pieces.
- Subsetting by year and month finally did the trick, with ~30 workers and 200 GB of memory being used to compute the mean for each month.

In [None]:
%%time

@dask.delayed
def downsample(ds):
    """Average ``ds`` over '1M' time bins, then over 8x8 latitude/longitude blocks.

    Attributes are kept through both means, and ``boundary="trim"`` is passed
    to the coarsening step. Decorated with ``dask.delayed``, so calling it
    returns a delayed object rather than running the body immediately.
    """
    monthly = ds.resample(time='1M').mean(keep_attrs=True)
    blocks = monthly.coarsen(latitude=8, longitude=8, boundary="trim")
    return blocks.mean(keep_attrs=True)

# For every variable: compute monthly means coarsened by 8x8 grid cells, one
# year at a time, and save the result as one netCDF file per variable.
#
# Progress is checkpointed after each year to '<name>.partial.nc' and only
# renamed to the final '<name>.nc' once all years are done. Writing the
# checkpoints straight to the final name would let a crash leave an
# incomplete file that the exists() check below then skips on the next run.
for dvar in data_var:
    var = dvar[27:-5]  # strip the 'era5-pds/zarr/2021/04/data/' prefix and '.zarr' suffix
    fout = './../../data/era5/era5_monthly_2deg'+var+'.nc'
    if os.path.exists(fout):
        # Variable already fully processed.
        continue
    fpartial = fout[:-len('.nc')] + '.partial.nc'

    tem = []  # one 12-month dataset per processed year
    for lyr in range(1979,2021):
        print(lyr,var)
        ds = open_era5_range(lyr,lyr, [var])
        mn = []
        for i in range(12):
            # All time steps belonging to this calendar month
            ds_sel = ds.sel(time=str(lyr)+'-'+str(i+1).zfill(2))
            ds_month = ds_sel.mean('time',keep_attrs=True)
            # Label the monthly mean with the mean timestamp of the month
            ds_month = ds_month.assign_coords({'time':ds_sel.time.mean().data})
            ds_month_deg = ds_month.coarsen(latitude=8,longitude=8,boundary="trim").mean(keep_attrs=True)
            # Compute now, so only the small coarsened field is kept in memory
            ds_month_deg = ds_month_deg.load()
            mn.append(ds_month_deg)
        mn = xr.concat(mn,dim='time')
        tem.append(mn)
        tem2 = xr.concat(tem,dim='time')
        tem2 = tem2.sortby(tem2.latitude)
        # Checkpoint everything processed so far
        tem2.to_netcdf(fpartial)
        print('wrote:', lyr)

    # All years done: publish the finished file under its final name.
    os.replace(fpartial, fout)

In [None]:
# Print the last and first time stamp of every per-variable output file that
# exists, so the time span each file covers can be checked by eye.
for idvar,dvar in enumerate(data_var):
    var = dvar[27:-5]
    fout = './../../data/era5/era5_monthly_2deg'+var+'.nc'
    if not os.path.exists(fout):
        continue
    # Context manager closes the file handle once the values are printed
    with xr.open_dataset(fout) as ds:
        print(ds.time[-1].data,idvar,var,ds.time[0].data)

- Now take all the individual per-variable files and merge them into a single dataset.

In [None]:
# Merge the per-variable files into one dataset and save it.
ds_all = None
for idvar,dvar in enumerate(data_var):
    var = dvar[27:-5]
    file = './../../data/era5/era5_monthly_2deg'+var+'.nc'
    # Test the file about to be opened (not a path left over from another cell)
    if not os.path.exists(file):
        continue
    ds = xr.open_dataset(file)
    if ds_all is None:
        # The first file that exists seeds the merged dataset, whatever its index
        ds_all = ds
    else:
        for name in ds:
            ds_all[name] = ds[name]

if ds_all is None:
    raise FileNotFoundError('no ./../../data/era5/era5_monthly_2deg<variable>.nc files found to merge')

ds_all.to_netcdf('./../../data/era5/era5_monthly_2deg.nc')
ds_all

In [None]:
# Report the size of the merged dataset in GB (bytes / 1e9)
size_gb = ds_all.nbytes / 1e9
print(f'ds size in GB {size_gb:0.2f}\n')
#ds_all.info

In [None]:
import matplotlib.pyplot as plt

# Plot, for every variable, the slice at the last index of the first axis
# (presumably time). input() waits for user input before the next figure.
for var in ds_all:
    last_slice = ds_all[var][-1,:,:]
    last_slice.plot()
    plt.show()
    input()

In [None]:
import matplotlib.pyplot as plt

# Plot, for every variable, its mean over latitude and longitude.
# input() waits for user input before the next figure.
for var in ds_all:
    area_mean = ds_all[var].mean({'latitude','longitude'})
    area_mean.plot()
    plt.show()
    input()
    
    

In [None]:
# Latitude/longitude mean of 2 m air temperature
t2m_area_mean = ds_all['air_temperature_at_2_metres'].mean({'latitude','longitude'})
t2m_area_mean.plot()