In [1]:
import pandas as pd
import numpy as np
import xarray as xr

In [67]:
def get_redata_time_avg(dataset, filepath, variable, subset=(-120.0, 30.0, -80.0, 20.0), resolution='M'):
    '''
    Read a reanalysis variable (MERRA-2 or ERA5) for a lat-lon subset and,
    where needed, average it in time.

    Processing depends on the dataset/variable combination:

    MERRA-2 (dataset == 'MERRA-2'):
        The variable is subset using the 'lat'/'lon' coordinates. If the
        variable's encoding carries a '_FillValue', values equal to it are
        set to NaN. Daily input is expected; when resolution == 'M' the data
        are resampled along 'time' to monthly means. For any other
        resolution no resampling is done.

    ERA5 soil moisture (dataset == 'ERA5' and variable == 'swvl'):
        Layers 'swvl1', 'swvl2' and 'swvl3' are read, subset using the
        'latitude'/'longitude' coordinates, and combined into a
        thickness-weighted mean (layer thicknesses 0.07, 0.21 and 0.72)
        to approximate the root-zone mean. Values not greater than 0.001
        are set to NaN; this arbitrary threshold is used instead of the
        file's _FillValue to avoid NetCDF scaling issues, and is intended
        to mask the ocean.

    Anything else (treated as a generic ERA5 variable):
        The variable is subset using 'latitude'/'longitude' and returned
        as is. No explicit missing-value masking is applied here. ERA5
        input is expected to be monthly already, so no resampling is done.

    Parameters:
    -dataset: String. Possible values 'MERRA-2' or 'ERA5'. Reanalysis dataset to be read.
                Any value other than 'MERRA-2' is handled by the ERA5 code paths.
    -filepath: String. Full path to file to be read.
    -variable: String. MERRA-2 or ERA5 variable name. Use 'swvl' with ERA5 to get the
                weighted soil moisture mean.
    -subset: Sequence of four floats ([lon1, lon2, lat1, lat2]). Desired spatial domain
                defined by lats/lons for which data will be extracted. lat1 is expected
                to be the southern bound and lat2 the northern bound; the ERA5 paths
                slice latitude from lat2 to lat1 (presumably because ERA5 latitudes are
                stored north-to-south -- confirm against the input files).
    -resolution: String. Default is 'M' for monthly and currently only this resolution
                is supported. Refers to temporal resolution for output.

    Returns:
    An xarray DataArray containing the desired data.

    Author: Carolina Bieri (bieri2@illinois.edu)
    '''

    # Tell user which file is being read
    print('Reading from: \n' + filepath)

    # Open file.
    # NOTE(review): the dataset is not closed here because the returned
    # DataArray may still read from it lazily -- confirm before adding a close.
    ds = xr.open_dataset(filepath)

    # Longitude bounds are used in the same order by every branch
    lon_bounds = slice(subset[0], subset[1])

    if dataset == 'MERRA-2':
        # Read in desired variable and subset by lats and lons
        ds_var = ds[variable].sel(lat=slice(subset[2], subset[3]), lon=lon_bounds)

        # Set missing values to NaN. Skip the step when the file declares no
        # _FillValue instead of failing with a KeyError.
        fill_value = ds_var.encoding.get('_FillValue')
        if fill_value is not None:
            ds_var = ds_var.where(ds_var != fill_value)

        # Daily input -> monthly means (only supported output resolution)
        if resolution == 'M':
            ds_var = ds_var.resample(time=resolution).mean(dim='time')

        return ds_var

    # Both ERA5 branches slice latitude from subset[3] down to subset[2]
    lat_bounds = slice(subset[3], subset[2])

    if dataset == 'ERA5' and variable == 'swvl':
        # Thicknesses of the first three ERA5 soil layers, used as weights
        thick = np.array([0.07, 0.21, 0.72])

        # Read in soil moisture data for first three layers and subset by lats and lons
        ds_layers = ds[['swvl1', 'swvl2', 'swvl3']].sel(latitude=lat_bounds, longitude=lon_bounds)

        # Thickness-weighted mean: the whole weighted sum is divided by the
        # sum of the weights. (Dividing by the number of layers, or dividing
        # only the last term, does not yield a weighted mean.)
        weighted_sum = (
            ds_layers['swvl1'] * thick[0]
            + ds_layers['swvl2'] * thick[1]
            + ds_layers['swvl3'] * thick[2]
        )
        ds_wgt = weighted_sum / thick.sum()

        # Exclude invalid values (set to NaN)
        return ds_wgt.where(ds_wgt > 0.001)

    # Any other ERA5 variable: subset only.
    # NOTE: no explicit _FillValue masking is done for these variables. A
    # manual comparison against the packed fill value
    # (_FillValue * scale_factor + add_offset) was previously tried and
    # disabled because of NetCDF scaling issues.
    return ds[variable].sel(latitude=lat_bounds, longitude=lon_bounds)