# Benchmark Stratus Data Rates

We want to measure how long it takes to create dynamically generated slices from Zarr stores held on different storage systems: Glade, Stratus, and AWS S3.

The slices might be used for producing plots in interactive web applications run from NCAR web servers.

In [1]:
import xarray as xr
import dask

import fsspec
import s3fs

#from ncar_jobqueue import NCARCluster
from dask_jobqueue import PBSCluster
from distributed import Client

## Create and Connect to a Dask Distributed Cluster

### Casper Cluster Configuration

In [2]:
# Number of PBS jobs to request from the scheduler (earlier runs used 30, 10, 2).
num_jobs = 35
walltime = "0:20:00"
# NOTE(review): Dask is given memory='5GB' while the PBS resource_spec below asks
# for mem=10GB -- confirm the mismatch is intentional.
memory = '5GB'

# Alternative kept for reference:
#   cluster = NCARCluster(walltime=walltime, memory=memory)
cluster = PBSCluster(
    cores=1,
    processes=1,
    walltime=walltime,
    memory=memory,
    queue='casper',
    resource_spec='select=1:ncpus=1:mem=10GB',
)
cluster.scale(jobs=num_jobs)

# Connect this notebook session to the cluster.
client = Client(cluster)
# Last expression of the cell, so the notebook renders the cluster widget.
cluster

Tab(children=(HTML(value='\n            <div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-Ou…

In [None]:
# Use this to bail out if needed: run this cell by hand to close the cluster
# early (the benchmark cells below need the cluster objects created above).

cluster.close()

### Starting Point:  Load max daily temperatures from CESM LENS Data

#### Stratus as data source

In [3]:
# URL of the Zarr store; the AWS cell further down reuses this same `s3_path`.
s3_path = 's3://ncar-cesm-lens/atm/daily/cesmLE-RCP85-TREFHTMX.zarr' 
# NOTE(review): no endpoint_url is passed here, so this anonymous s3:// mapper
# presumably resolves against the default AWS S3 endpoint rather than Stratus,
# which would make `ds_stratus` read from the same place as `ds_aws` -- confirm,
# and point this at the Stratus endpoint/bucket if so.
ds_stratus = xr.open_zarr(fsspec.get_mapper(s3_path, anon=True),
                  consolidated=True)

# Last expression of the cell, so the notebook displays the dataset summary.
ds_stratus

Unnamed: 0,Array,Chunk
Bytes,541.80 kiB,270.91 kiB
Shape,"(34675, 2)","(17338, 2)"
Count,3 Tasks,2 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 541.80 kiB 270.91 kiB Shape (34675, 2) (17338, 2) Count 3 Tasks 2 Chunks Type object numpy.ndarray",2  34675,

Unnamed: 0,Array,Chunk
Bytes,541.80 kiB,270.91 kiB
Shape,"(34675, 2)","(17338, 2)"
Count,3 Tasks,2 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.71 GiB,121.50 MiB
Shape,"(40, 34675, 192, 288)","(1, 576, 192, 288)"
Count,2441 Tasks,2440 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 285.71 GiB 121.50 MiB Shape (40, 34675, 192, 288) (1, 576, 192, 288) Count 2441 Tasks 2440 Chunks Type float32 numpy.ndarray",40  1  288  192  34675,

Unnamed: 0,Array,Chunk
Bytes,285.71 GiB,121.50 MiB
Shape,"(40, 34675, 192, 288)","(1, 576, 192, 288)"
Count,2441 Tasks,2440 Chunks
Type,float32,numpy.ndarray


#### AWS as data source

In [4]:
# Connect to AWS S3 storage, requesting anonymous access (anon=True)
fs = s3fs.S3FileSystem(anon=True)

# create a MutableMapping from a store URL; `s3_path` is the same URL that the
# Stratus cell above opened
mapper = fs.get_mapper(s3_path)

# make sure to specify that metadata is consolidated
ds_aws = xr.open_zarr(mapper, consolidated=True)
# last expression of the cell, so the notebook displays the dataset summary
ds_aws

Unnamed: 0,Array,Chunk
Bytes,541.80 kiB,270.91 kiB
Shape,"(34675, 2)","(17338, 2)"
Count,3 Tasks,2 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 541.80 kiB 270.91 kiB Shape (34675, 2) (17338, 2) Count 3 Tasks 2 Chunks Type object numpy.ndarray",2  34675,

Unnamed: 0,Array,Chunk
Bytes,541.80 kiB,270.91 kiB
Shape,"(34675, 2)","(17338, 2)"
Count,3 Tasks,2 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.71 GiB,121.50 MiB
Shape,"(40, 34675, 192, 288)","(1, 576, 192, 288)"
Count,2441 Tasks,2440 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 285.71 GiB 121.50 MiB Shape (40, 34675, 192, 288) (1, 576, 192, 288) Count 2441 Tasks 2440 Chunks Type float32 numpy.ndarray",40  1  288  192  34675,

Unnamed: 0,Array,Chunk
Bytes,285.71 GiB,121.50 MiB
Shape,"(40, 34675, 192, 288)","(1, 576, 192, 288)"
Count,2441 Tasks,2440 Chunks
Type,float32,numpy.ndarray


#### Glade as data source

In [5]:
# File-system path on Glade scratch to a Zarr store with the same name as the
# S3 one (presumably a local copy of it -- confirm).
zarr_folder = '/glade/scratch/bonnland/stratus-test/cesmLE-RCP85-TREFHTMX.zarr'

# Opened straight from the path; no fsspec/s3fs mapper is involved here.
ds_glade = xr.open_zarr(zarr_folder, consolidated=True)
ds_glade

Unnamed: 0,Array,Chunk
Bytes,541.80 kiB,270.91 kiB
Shape,"(34675, 2)","(17338, 2)"
Count,3 Tasks,2 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 541.80 kiB 270.91 kiB Shape (34675, 2) (17338, 2) Count 3 Tasks 2 Chunks Type object numpy.ndarray",2  34675,

Unnamed: 0,Array,Chunk
Bytes,541.80 kiB,270.91 kiB
Shape,"(34675, 2)","(17338, 2)"
Count,3 Tasks,2 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.71 GiB,121.50 MiB
Shape,"(40, 34675, 192, 288)","(1, 576, 192, 288)"
Count,2441 Tasks,2440 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 285.71 GiB 121.50 MiB Shape (40, 34675, 192, 288) (1, 576, 192, 288) Count 2441 Tasks 2440 Chunks Type float32 numpy.ndarray",40  1  288  192  34675,

Unnamed: 0,Array,Chunk
Bytes,285.71 GiB,121.50 MiB
Shape,"(40, 34675, 192, 288)","(1, 576, 192, 288)"
Count,2441 Tasks,2440 Chunks
Type,float32,numpy.ndarray


In [6]:
def saveSlice(ds, start_year, end_year, csv_path='Boulder_2010.csv'):
    '''Create and save time series for grid cell containing Boulder, Colorado.

    Selects the grid cell nearest to Boulder, restricts it to the requested
    years, converts the TREFHTMX variable to a DataFrame and writes it as CSV.

    Parameters
    ----------
    ds : dataset-like
        Object providing ``sel(...)`` on ``lat``/``lon``/``time`` and a
        ``TREFHTMX`` variable with ``to_dataframe()`` (e.g. the xarray
        datasets opened in this notebook).
    start_year, end_year : int or str
        First and last year of the slice; the slice runs from January 1 of
        ``start_year`` through December 31 of ``end_year``.
    csv_path : str, optional
        Destination of the CSV file. Defaults to the historical file name
        ``'Boulder_2010.csv'`` so existing calls behave as before.

    Returns
    -------
    None
        The result is only written to ``csv_path``.
    '''
    # Boulder is at about 40.01 N, 105.27 W.  The longitude is expressed in
    # degrees east (360 - 105.27), which assumes the dataset's `lon` axis runs
    # 0-360 as the original value implied.  The previous value, 244.8, is
    # 115.2 W, about ten degrees west of Boulder.
    (lat, lon) = (40.01, 254.73)
    temps = ds.sel(lat=lat, lon=lon, method='nearest')
    temps = temps.sel(time=slice(f'{start_year}-01-01', f'{end_year}-12-31'))

    df = temps.TREFHTMX.to_dataframe()
    df.to_csv(csv_path, index=True)

### Choose slice parameters

In [7]:
# Year range handed to saveSlice by the benchmark cells below.
start_year = 2007
# Shorter ranges tried previously:
#end_year = 2007
#end_year = 2017
end_year = 2057

###  Perform Benchmark Comparisons

In [8]:
%%time

# Test Glade performance: one saveSlice run against the store opened from the
# Glade file-system path.
saveSlice(ds_glade, start_year, end_year)

CPU times: user 6.02 s, sys: 206 ms, total: 6.22 s
Wall time: 15.1 s


In [9]:
%%time

# Test Stratus performance
# NOTE(review): ds_stratus was opened from the same s3:// URL as ds_aws with no
# custom endpoint, so this may be timing AWS S3 rather than Stratus -- confirm.
saveSlice(ds_stratus, start_year, end_year)

CPU times: user 8.24 s, sys: 315 ms, total: 8.55 s
Wall time: 58.3 s


In [10]:
%%time

# Test AWS S3 performance: one saveSlice run against the store opened through
# the anonymous s3fs mapper.
saveSlice(ds_aws, start_year, end_year)

CPU times: user 7.74 s, sys: 322 ms, total: 8.06 s
Wall time: 48.1 s


In [None]:
# Close the Dask cluster now that the benchmark cells have run.
cluster.close()