# Batch analysis for sampling any 2d or 3d field as a function of cloud controlling factors

Note: we use the numpy implementation of digitize, which has proven easy to use and sufficiently fast.

Aiko Voigt, KIT, 15 Oct 2020

Preamble

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import zarr
import shutil

In [2]:
import dask
# Set dask's "array.slicing.split_large_chunks" option for the whole session
# (presumably so that slicing splits oversized chunks instead of only warning
# about them -- confirm against the dask configuration reference)
dask.config.set({"array.slicing.split_large_chunks": True})

from dask.distributed import Client
# Start a dask.distributed client with default arguments; the cell output
# below reports the scheduler and dashboard addresses that were actually used
client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39057 instead


In [3]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:41506  Dashboard: http://127.0.0.1:39057/status,Cluster  Workers: 8  Cores: 48  Memory: 134.22 GB


In [4]:
import sys
# extend the module search path; presumably this directory holds the helper
# modules dict_nawdexsims and nawdexutils imported below
sys.path.append('/pf/b/b380459/nawdex-hackathon/')

import dict_nawdexsims
# simulation dictionary: the batch loops further down iterate over its keys
# and read the grid resolution of each simulation from its 'res' entry
simdict = dict_nawdexsims.simdictionary()

# nawut provides drop_first_day, which the batch loops apply to the loaded data
import nawdexutils as nawut

### Ancillary functions for loading data

In [5]:
def load_griddata(gridres):
    """Return the 'cell_area' variable of the grid file for resolution gridres.

    gridres must be one of the keys of the table below ('80km', '40km',
    '20km', '10km', '5km', '2km'); any other value raises a KeyError.
    The 'cell' dimension of the returned variable is renamed to 'ncells'.
    """
    # tag used in the grid file names for each resolution label
    restag = {'80km': 'R80000m', '40km': 'R40000m', '20km': 'R20000m',
              '10km': 'R10000m', '5km': 'R5000m', '2km': 'R2500m'}[gridres]
    basedir = '/scratch/b/b380459/icon_4_hackathon/'
    gridfile = ''.join((basedir, '/grids/icon-grid_nawdex_78w40e23n80n_', restag, '.nc'))
    cell_area = xr.open_dataset(gridfile)['cell_area']
    return cell_area.rename({'cell': 'ncells'})

def load_openoceanmask(expid):
    """Return the 'mask_openocean' variable from the mask file of simulation expid."""
    basedir = '/scratch/b/b380459/icon_4_hackathon/'
    maskfile = ''.join((basedir, '/openocean_masks/', expid, '_openoceanmask.nc'))
    ds_mask = xr.open_dataset(maskfile)
    return ds_mask['mask_openocean']

def load_ccf(expid, ccf):
    """Load the cloud controlling factor ccf of simulation expid on an hourly time axis.

    ccf selects the output stream through the table below ('omega' or 't_g';
    any other value raises a KeyError). All files matching the stream pattern
    are opened as one dataset with one time step per chunk. The field is then
    resampled to hourly values by taking the nearest sample.
    For 'omega', only the level lev=500e2 is returned.
    """
    stream = {'omega': 'rh_omega_DOM01_PL', 't_g': '2d_30min_DOM01_ML'}[ccf]
    basedir = '/scratch/b/b380459/icon_4_hackathon/'
    pattern = ''.join((basedir, '/', expid, '/', expid, '_2016*_', stream, '_0*.nc'))
    ds_all = xr.open_mfdataset(pattern, combine='by_coords', parallel=True,
                               engine='h5netcdf', chunks={'time': 1})
    # NOTE(review): tolerance="5M" is presumably meant as 5 minutes -- confirm
    # how the installed pandas version parses this string
    da_hourly = ds_all[ccf].resample(time="1H").nearest(tolerance="5M")
    if ccf != 'omega':
        return da_hourly
    # omega comes from the '_PL' stream and has a 'lev' dimension; keep one level
    return da_hourly.sel(lev=500e2)

def load_var(expid, var):
    """Load the variable var of simulation expid on an hourly time axis.

    var selects the output stream through the table below ('clct' or 'clc';
    any other value raises a KeyError). All files matching the stream pattern
    are opened as one dataset with one time step per chunk, and the variable
    is resampled to hourly values by taking the nearest sample.
    """
    stream = {'clct': '2d_30min', 'clc': '3dcloud'}[var]
    basedir = '/scratch/b/b380459/icon_4_hackathon/'
    pattern = ''.join((basedir, '/', expid, '/', expid, '_2016*_', stream, '_DOM01_ML_0*.nc'))
    ds_all = xr.open_mfdataset(pattern, combine='by_coords', parallel=True,
                               engine='h5netcdf', chunks={'time': 1})
    # NOTE(review): tolerance="5M" is presumably meant as 5 minutes -- confirm
    # how the installed pandas version parses this string
    return ds_all[var].resample(time="1H").nearest(tolerance="5M")

def load_var_ddt_temp_rad_fromflux(expid, var):
    """Load a previously calculated radiative heating rate from the zarr store of expid.

    The variable var is read from the '<expid>_ddttemp_rad-from-fluxes_DOM01_ML.zarr'
    store of the simulation and resampled to hourly values by taking the
    nearest sample.
    """
    basedir = '/scratch/b/b380459/icon_4_hackathon/'
    store = ''.join((basedir, expid, '/', expid, '_ddttemp_rad-from-fluxes_DOM01_ML.zarr'))
    ds_heating = xr.open_zarr(store)
    # NOTE(review): tolerance="5M" is presumably meant as 5 minutes -- confirm
    # how the installed pandas version parses this string
    return ds_heating[var].resample(time="1H").nearest(tolerance="5M")

# Load cloud controlling factor, which is a 2d field (time x ncells)
def prepare_ccf_dataset(expid, resolution, ccf):
    """Merge the grid cell areas and the cloud controlling factor over open ocean.

    expid is the simulation id, resolution the grid resolution label passed to
    load_griddata, and ccf the name of the cloud controlling factor passed to
    load_ccf. Both fields are restricted to the cells where the open ocean
    mask of the simulation equals 1 before they are merged.
    """
    # positions of the cells flagged as open ocean
    ocean_cells = np.where(load_openoceanmask(expid) == 1)[0]

    # grid cell areas first, then the cloud controlling factor
    fields = [load_griddata(resolution), load_ccf(expid, ccf)]

    return xr.merge([field.isel(ncells=ocean_cells) for field in fields])

# Load the variable that will be sampled on the cloud controlling factor. This
# can be a 2d field (time x ncells) or a 3d field (time x height x ncells),
# and it can also be one of the radiative heating rates provided in zarr stores
def prepare_var_dataset(expid, resolution, var):
    """Merge the grid cell areas and the variable var over open ocean.

    expid is the simulation id, resolution the grid resolution label passed to
    load_griddata, and var the name of the variable. Both fields are restricted
    to the cells where the open ocean mask of the simulation equals 1.
    """
    # positions of the cells flagged as open ocean
    ocean_cells = np.where(load_openoceanmask(expid) == 1)[0]

    da_area = load_griddata(resolution).isel(ncells=ocean_cells)

    # the diagnosed radiative heating rates live in zarr stores and need their
    # own loader; everything else is read from the regular output files
    zarr_vars = ('ddt_temp_radlw_fromflux', 'ddt_temp_radlwclr_fromflux',
                 'ddt_temp_radsw_fromflux', 'ddt_temp_radswclr_fromflux')
    loader = load_var_ddt_temp_rad_fromflux if var in zarr_vars else load_var
    da_var = loader(expid, var).isel(ncells=ocean_cells)

    return xr.merge([da_area, da_var])

### Ancillary functions for indexing ccf data and for sampling variable on ccf

Function to index grid cells according to the bin of the cloud controlling factor (ccf) they belong to

In [6]:
def make_ccf_binindexing(ds_in, nbins, binrange):
    """Assign each (time, cell) value of the cloud controlling factor to a bin.

    Parameters
    ----------
    ds_in : dataset that contains the cloud controlling factor under the name
        'ccf' (time x ncells, with a 'units' attribute), the surface area of
        the grid cells ('cell_area') and the coordinates time, clon and clat
    nbins : number of equally wide bins
    binrange : pair (first edge, last edge) spanned by the bins

    Returns
    -------
    Dataset with
      ccf_indices (time x ncells): index n of the bin
          [bins_edges[n], bins_edges[n+1]) that the ccf value falls into,
          i.e. the bin whose centre is bins[n]. Values outside binrange get
          -1 (below) or nbins (above); NaN values also get nbins. These
          entries therefore match none of the bins 0 ... nbins-1.
      weights (time x ncells): cell_area broadcast to the shape of the ccf
      bins: bin centres, bins_edges: the nbins+1 bin edges
    """
    # define surface area weights, which we add to output dataset for later use
    weights = np.broadcast_to(ds_in['cell_area'], ds_in['ccf'].shape)
    # define binning edges based on the requested range and number of bins
    bins_edges = np.linspace(binrange[0], binrange[1], nbins+1)
    # centre of each bin is the midpoint of its two edges
    bins = 0.5 * (bins_edges[:-1] + bins_edges[1:])
    # digitize on the bin EDGES (not the centres): np.digitize returns i with
    # bins_edges[i-1] <= x < bins_edges[i], so i-1 is the index of the bin
    # centred on bins[i-1]
    ccf_indices = np.digitize(ds_in['ccf'], bins_edges) - 1

    # make an output dataset that contains ccf_indices, weights, bins and bins_edges
    ds_out = ( xr.Dataset(
               {'ccf_indices': (['time','ncells'], ccf_indices),
                'weights': (['time','ncells'], weights),},
                coords={'time': (['time'], ds_in.time),
                        'bins': (['bins'], bins),
                        'bins_edges': (['bins_edges'], bins_edges),
                        'clon': (['ncells'], ds_in.clon),
                        'clat': (['ncells'], ds_in.clat)},
    ) )
    # carry over the time metadata and label the bins with the units of the ccf
    ds_out['time'].attrs = ds_in['time'].attrs
    ds_out['bins'].attrs['units'] = ds_in['ccf'].attrs['units']
    ds_out['bins_edges'].attrs['units'] = ds_in['ccf'].attrs['units']

    # return dataset
    return ds_out

Functions to sample a variable based on the cloud controlling factor

In [7]:
def make_var_ccfsampled_2dvar(ds_ccfindex, ds_var, varname):
    """Area-weighted mean of a time x ncells variable in each ccf bin.

    Parameters
    ----------
    ds_ccfindex : provides 'bins' (one entry per bin), 'ccf_indices' and
        'weights'; the latter two must have the same shape as the variable
    ds_var : provides the variable under the key varname
    varname : name of the variable to be sampled

    Returns
    -------
    numpy array of length nbins. Entry n is the weighted mean over all samples
    whose ccf index equals n. Samples that are NaN are left out of both the
    weighted sum and the sum of weights. Bins without any valid sample (or
    with zero total weight) are NaN.
    """
    # number of bins
    nbins = ds_ccfindex['bins'].size
    var_sampled = np.full(nbins, np.nan)

    # convert to numpy arrays
    indices = ds_ccfindex['ccf_indices'].values
    weights = ds_ccfindex['weights'].values
    values = ds_var[varname].values

    for n in range(nbins):
        in_bin = indices == n
        w_n = weights[in_bin]
        prod = values[in_bin] * w_n
        # a NaN sample must not contribute its weight to the normalisation,
        # otherwise the mean is biased towards zero
        valid = ~np.isnan(prod)
        wsum = w_n[valid].sum()
        if wsum != 0:
            var_sampled[n] = prod[valid].sum() / wsum

    return var_sampled

def make_var_ccfsampled_3dvar(ds_ccfindex, ds_var, varname):
    """Area-weighted mean of a time x height x ncells variable in each ccf bin.

    Parameters
    ----------
    ds_ccfindex : provides 'bins' (one entry per bin), 'ccf_indices' and
        'weights', both of shape time x ncells
    ds_var : provides the variable under the key varname and the attribute
        height, whose size gives the number of levels
    varname : name of the variable to be sampled

    Returns
    -------
    numpy array of shape nbins x height. Entry [n, k] is the weighted mean at
    level k over all (time, cell) samples whose ccf index equals n. Samples
    that are NaN at a level are left out of both the weighted sum and the sum
    of weights of that level. Entries without any valid sample (or with zero
    total weight) are NaN.
    """
    # number of bins
    nbins = ds_ccfindex['bins'].size
    var_sampled = np.full((nbins, ds_var.height.size), np.nan)

    # convert to numpy arrays
    indices = ds_ccfindex['ccf_indices'].values
    weights = ds_ccfindex['weights'].values
    # var needs to be in shape time x ncells x height so that the
    # time x ncells mask below selects complete columns
    var = np.transpose(ds_var[varname].values, axes=[0,2,1])

    for n in range(nbins):
        in_bin = indices == n
        w_n = weights[in_bin][:, np.newaxis]   # nsamples x 1
        prod = var[in_bin] * w_n               # nsamples x height
        # a NaN sample must not contribute its weight to the normalisation
        # of its level, otherwise the mean is biased towards zero
        valid = ~np.isnan(prod)
        wsum = np.where(valid, w_n, 0.0).sum(axis=0)
        num = np.nansum(prod, axis=0)
        has_weight = wsum != 0
        var_sampled[n, has_weight] = num[has_weight] / wsum[has_weight]

    return var_sampled

def make_var_ccfsampled(ds_ccfindex, ds_var, varname):
    """Sample ds_var[varname] on the ccf bins, choosing the routine by rank.

    Two-dimensional variables are handled by make_var_ccfsampled_2dvar and
    three-dimensional variables by make_var_ccfsampled_3dvar; the result of
    the chosen routine is returned unchanged.

    Raises
    ------
    ValueError if the variable is neither 2- nor 3-dimensional.
    """
    ndim = ds_var[varname].ndim
    if ndim == 2:    # 2d data
        return make_var_ccfsampled_2dvar(ds_ccfindex, ds_var, varname)
    if ndim == 3:    # 3d data
        return make_var_ccfsampled_3dvar(ds_ccfindex, ds_var, varname)
    # fail with a clear message instead of an UnboundLocalError further down
    raise ValueError(
        "cannot sample '{}': expected a 2d or 3d variable, got {} dimensions"
        .format(varname, ndim))

### Batch analysis over simulations, different cloud controlling factors and different variables

Define cloud controlling factor and to-be-sampled variable

First do the indexing of the grid cells according to the cloud controlling factor

In [41]:
# cloud controlling factor that is indexed in this run
ccf='t_g'

# number of bins and value range used for each cloud controlling factor
dict_binning={'omega':{'nbins':50, 'binrange': [-1.0, 1.0]},
              't_g'  :{'nbins':74, 'binrange': [273.0, 310.0]}}

for sim in simdict:
    gridres = simdict[sim]['res']
    # the 5km and 2km simulations are left out of this batch
    if gridres in ('5km', '2km'):
        continue
    print('Working on ', sim, 'with resolution', gridres)
    ds_ccf = nawut.drop_first_day(prepare_ccf_dataset(sim, gridres, ccf))
    # hand over the ccf under the generic name 'ccf' together with the cell
    # areas; nbins and binrange come from the binning table
    ds_ccfindex = make_ccf_binindexing(
        ds_ccf[[ccf, 'cell_area']].rename({ccf: 'ccf'}), **dict_binning[ccf])
    ds_ccfindex.attrs.update({'ccf': ccf, 'simulation': sim})
    # the indexing is written to one zarr store per simulation and ccf
    zarr_store = ''.join(('/scratch/b/b380459/icon_4_hackathon/ccf/', sim,
                          '_ccf_indexing_', ccf, '.zarr'))
    # a store of the same name left over from an earlier run is removed first
    shutil.rmtree(zarr_store, ignore_errors=True)
    ds_ccfindex.to_zarr(zarr_store)
    del ds_ccfindex

Working on  nawdexnwp-10km-mis-0011 with resolution 10km
Working on  nawdexnwp-10km-mis-0012 with resolution 10km


Now sample the variable on the cloud controlling factor using the previously calculated grid cell indices

In [17]:
# cloud controlling factor to sample on ('t_g' is the alternative)
ccf='omega'

# variable to be sampled (a radiative heating rate such as
# 'ddt_temp_radlwclr_fromflux' is the alternative)
var='clc'

# only a single simulation is processed here; loop over the keys of simdict
# to process all simulations
for sim in ['nawdexnwp-10km-mis-0002']:
    gridres = simdict[sim]['res']
    # restrict this batch to the 10km simulations
    if gridres != '10km':
        continue
    print('Working on ', sim, 'with resolution', gridres)
    ds_var = nawut.drop_first_day(prepare_var_dataset(sim, gridres, var))
    # open previously calculated ccf indexing from zarr store
    zarr_store = ''.join(('/scratch/b/b380459/icon_4_hackathon/ccf/', sim,
                          '_ccf_indexing_', ccf, '.zarr'))
    ds_ccfindex = xr.open_zarr(zarr_store)
    var_ccfsampled = make_var_ccfsampled(ds_ccfindex, ds_var, var)
    # wrap the numpy result into a labelled DataArray
    if var_ccfsampled.ndim == 1:    # 2-d field: bins only
        var_ccfsampled = xr.DataArray(var_ccfsampled, dims=['bins'],
                                      coords={'bins': ds_ccfindex.bins})
    elif var_ccfsampled.ndim == 2:  # 3-d field: bins x height
        var_ccfsampled = xr.DataArray(var_ccfsampled, dims=['bins', 'height'],
                                      coords={'bins': ds_ccfindex.bins,
                                              'height': ds_var.height})
    var_ccfsampled.name = var+'_ccfsampled'
    var_ccfsampled.attrs.update({
        'ccf': ccf,
        'description': var+' sampled on '+ccf+', only over open ocean',
        'simulation': sim,
    })
    # store to netcdf file
    var_ccfsampled.to_netcdf(
        ''.join(('/scratch/b/b380459/icon_4_hackathon/ccf/', sim, '_', var,
                 '_sampled_on_', ccf, '.nc')),
        mode='w')

Working on  nawdexnwp-10km-mis-0002 with resolution 10km


## Cleaning up before leaving

In [None]:
# release the dask resources started in the preamble: shut down what the
# client is connected to (presumably scheduler and workers -- see the
# dask.distributed Client documentation), then close the client itself
client.shutdown()
client.close()