###### Benchmarking xarray NetCDF vs. xarray Zarr on a Dask cluster (SLURM or local).
Here we experiment with the number of workers.

In this preliminary look, Zarr performance improves as workers are added, while NetCDF performance stays flat or deteriorates slightly.

For general info on GFDL-specific Python/Jupyter/Anaconda usage, please check out http://wiki.gfdl.noaa.gov/Python. It also has information on port for
Data citation

Zhao, Ming; Blanton, Chris; John, Jasmin G; Radhakrishnan, Aparna; Zadeh, Niki T.; McHugh, Colleen; Rand, Kristopher; Vahlenkamp, Hans; Wilson, Chandin; Ginoux, Paul; Malyshev, Sergey; Wyman, Bruce; Guo, Huan; Balaji, V; Held, Isaac M; Dunne, John P.; Winton, Michael; Adcroft, Alistair; Milly, P.C.D; Shevliakova, Elena; Knutson, Thomas; Ploshay, Jeffrey; Zeng, Yujin (2018). NOAA-GFDL GFDL-CM4C192 model output prepared for CMIP6 HighResMIP. Version 20180701 [1].Earth System Grid Federation. https://doi.org/10.22033/ESGF/CMIP6.2262


The datasets used in this analysis amount to 46 GB.

Computation time in seconds

Workers	Zarr	NetCDF
    10	    45	80
    20	31.6	72
    30	23.3	71
    40	23	73
    50	21.7	73
    60	20	74
    ..
    100	21.9	76
    150	24.1	78
    
Zarr chunk structure

Array	Chunk
Bytes	87.25 GB	8.29 MB
Shape	(105192, 360, 576)	(10, 360, 576)
Count	10521 Tasks	10520 Chunks
Type	float32	numpy.ndarray

User input: Dask analysis template

In [None]:
import os

# ---- User-tunable settings for this benchmark run ----
clusterType = "SLURM"   # "SLURM" or "local" (the cluster cell compares against "local")
platform = "PPAN"       # selects which log / scratch directories are used below
numCores = 6            # passed as cores= to SLURMCluster
numWorkers = 10         # initial worker count requested through cluster.scale()
mem = '48GB'            # passed as memory= to SLURMCluster
varname = 'tas'         # variable read from both the Zarr and the NetCDF dataset
dashPort = ':1984'      # Dask dashboard address

# Choose the log directory and the worker scratch directory for the platform.
if platform != "PPAN":
    logdir = localdir = '/local2/home/a1r/logs/'
else:
    # localdir is left empty here; the SLURM branch of the cluster cell
    # passes $TMPDIR as the local directory instead.
    logdir, localdir = '/work/a1r/dask/logs/', ''
print("Logs:", logdir)

# Meant to hold results as {settings: time}; none of the visible cells fills them.
res_nc, res_zarr = {}, {}

In [None]:
# get the plot in notebook
%matplotlib inline
import xarray as xr 

Use the dask.distributed task scheduler and launch Dask with SLURMCluster (or LocalCluster when clusterType is "local").

In [None]:
from dask.distributed import Client

# Instantiate the Dask cluster and connect a client to it.
# The cluster type is compared case-insensitively so that "Local", "LOCAL" and
# "local" all select the local cluster instead of silently falling through to
# the SLURM branch.
if clusterType.lower() == "local":
    from dask.distributed import LocalCluster
    cluster = LocalCluster(dashboard_address=dashPort, local_directory=localdir)
else:
    from dask_jobqueue import SLURMCluster
    scheduler_options = {"dashboard_address": dashPort}
    # NOTE(review): walltime '2:60:00' has 60 in the minutes field; presumably
    # 3 hours is intended — confirm the batch system accepts this form.
    # NOTE(review): os.getenv('TMPDIR') is None when TMPDIR is unset — confirm
    # that is acceptable on this platform.
    cluster = SLURMCluster(queue='batch', memory=mem, project='gfdl_f', cores=numCores, walltime='2:60:00',
                           scheduler_options=scheduler_options, log_directory=logdir,
                           local_directory=os.getenv('TMPDIR'))

# Request the initial number of workers, then attach the client.
cluster.scale(numWorkers)
client = Client(cluster)
client

In [None]:
# Resize the cluster to 40 workers.
target_workers = 40
cluster.scale(target_workers)
# scale() only submits the request and returns immediately; block until at
# least target_workers have connected so that the timing cells that follow are
# not run on a partially started cluster.
client.wait_for_workers(target_workers)

In [None]:
# Location of the Zarr store read by the Zarr benchmark.
datadir = '/work/a1r/enes20/out/tas_3hr_2015-2051.zarr/'

In [None]:
%time ds4 = xr.open_zarr(datadir) 
temp_mean = ds4[varname].mean(dim=['time'])
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') 

In [None]:
# Show the dask array behind the benchmarked Zarr variable (shape / chunk layout).
# Uses the configured varname rather than a hard-coded 'tas' so it follows the setting.
ds4[varname].data

Array	Chunk
Bytes	87.25 GB	8.29 MB
Shape	(105192, 360, 576)	(10, 360, 576)
Count	10521 Tasks	10520 Chunks
Type	float32	numpy.ndarray


In [None]:
datadir = '/work/a1r/enes20/CMIP6/HighResMIP/NOAA-GFDL/GFDL-CM4C192/highresSST-future/r1i1p1f1/3hr/'+varname+'/gr3/v20180701/'
%time ds5 = xr.open_mfdataset(datadir+"*.nc", chunks={'time': 10}, combine='by_coords',parallel=True,decode_cf=False)
temp_mean = ds5[varname].mean(dim=['time'])
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') 

In [None]:
# Show the dask array behind the benchmarked NetCDF variable (shape / chunk layout).
# Uses the configured varname rather than a hard-coded 'tas' so it follows the setting.
ds5[varname].data

In [None]:
# Resize the cluster to 45 workers.
target_workers = 45
cluster.scale(target_workers)
# scale() only submits the request and returns immediately; block until at
# least target_workers have connected before any timing cell is run.
client.wait_for_workers(target_workers)

In [None]:
temp_mean = ds5[varname].mean(dim=['time'])
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') 

In [None]:
temp_mean4 = ds4[varname].mean(dim=['time'])
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') 

In [None]:
# Resize the cluster to 40 workers.
target_workers = 40
cluster.scale(target_workers)
# scale() only submits the request and returns immediately; block until at
# least target_workers are connected before any timing cell is run.
# NOTE(review): this is only a lower bound. If the cluster was larger before
# this cell, surplus workers may still be attached for a while — check the
# dashboard before trusting the timings.
client.wait_for_workers(target_workers)

In [None]:
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF

In [None]:
cluster.scale(50)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
cluster.scale(20)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
cluster.scale(30)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
cluster.scale(10)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
# Tear down: close the client first, then the cluster.
# NOTE(review): the cells after this one still call cluster.scale() and run
# timings, so this cell presumably belongs at the very end of the notebook;
# running top to bottom would hit a closed cluster.
for _resource in (client, cluster):
    _resource.close()

In [None]:
# Shrink the cluster to 2 workers (the request returns without waiting for
# the workers to actually be removed).
cluster.scale(n=2)

In [None]:
cluster.scale(20)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(40)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr


In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(50)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr


In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(45)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr


In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(20)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr


In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(60)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr


In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(50)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr


In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(60)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
cluster.scale(2)#let's just say I don't know how else to test if its scaling down,easily
cluster.scale(90)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
cluster.scale(2)
cluster.scale(100)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr

In [None]:
cluster.scale(2)
cluster.scale(150)
%timeit temp_mean.plot(figsize=[10, 8],cmap='gist_ncar') #NetCDF
%timeit temp_mean4.plot(figsize=[10, 8],cmap='gist_ncar') #Zarr