In [1]:
import glob
import json
import time
from datetime import datetime

import numcodecs
import numpy as np
import pandas as pd
import xarray as xr
import zarr
from dask.distributed import Client, progress, LocalCluster

In [2]:
# Start a Dask cluster on this machine with default settings and attach a
# client to it; the dask-backed xarray computations in later cells run on it.
cluster = LocalCluster()
client = Client(cluster)
# Bare expression so the notebook renders the client summary
# (scheduler address, dashboard link, worker/core/memory totals).
client

0,1
Client  Scheduler: tcp://127.0.0.1:46759  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 32  Memory: 132.24 GB


In [None]:
#xr.open_zarr('/data/mursst_netcdf/zarr/2x3000x7000')
# xr.open_mfdataset(
#     ['/data/mursst_netcdf/v4.1/2015/152/20150601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'],
#     combine='by_coords')

In [3]:
def generate_file_list(start_doy, end_doy, base_dir=None, data_year=None):
    """
    Build the list of daily NetCDF files for a range of days of the year.

    One file is returned per day, looked up with the glob pattern
    ``{prefix}/{year}/{doy}/*.nc``.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year at which to stop. Exclusive: the days used are
        ``range(start_doy, end_doy)``.
    base_dir : str, optional
        Root directory of the NetCDF tree. Must be a path ``glob`` can
        resolve, i.e. a local (or mounted) directory, not an http url.
        Defaults to the notebook-level ``prefix`` variable, which must then
        already be defined.
    data_year : int or str, optional
        4 digit year. Defaults to the notebook-level ``year`` variable,
        which must then already be defined.

    Returns
    -------
    list of str
        One path per day, in day order.

    Raises
    ------
    IndexError
        If a day's directory contains no ``.nc`` file. The exception type
        matches what the bare ``[0]`` lookup used to raise; the message now
        names the day and the pattern that failed.
    """
    # Only touch the notebook globals when the caller did not pass a value.
    root = prefix if base_dir is None else base_dir
    yr = year if data_year is None else data_year

    fileObjs = []
    for doy in range(start_doy, end_doy):
        # NOTE(review): doy is interpolated without zero padding; confirm the
        # directory names for days below 100 match (e.g. "60" vs "060").
        pattern = f"{root}/{yr}/{doy}/*.nc"
        # glob returns matches in arbitrary order; sort so the same file is
        # picked on every run if a directory ever holds more than one.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise IndexError(f"no NetCDF file found for day {doy}: {pattern}")
        fileObjs.append(matches[0])
    return fileObjs

In [4]:
# Benchmark configuration shared by the cells below.
year = 2006
# Day-of-year window. end_doy is exclusive because generate_file_list iterates
# range(start_doy, end_doy), so 152..251 gives 100 daily files.
start_doy = 152
end_doy = 252
# Root of the NetCDF files, laid out as <prefix>/<year>/<doy>/*.nc.
prefix = '/data/mursst_netcdf/v4.1'
# Directory holding the Zarr stores. The trailing slash matters: the store
# name is appended to this string by plain concatenation.
zarr_prefix = '/data/mursst_netcdf/zarr/'
fileObjs = generate_file_list(start_doy, end_doy)

In [11]:
# Benchmark: for each chunk shape, time (a) opening the data, (b) a global
# spatial mean over a 5 day window and (c) a spatial mean over a lat/lon box
# for every time step, once against the NetCDF files and once against the
# Zarr store named after the chunk shape.
chunk_opts = [
    {'time': 1, 'lat': 4000, 'lon': 8000},
    {'time': 2, 'lat': 3000, 'lon': 7000},
    {'time': 5, 'lat': 1800, 'lon': 3600},
    {'time': 10, 'lat': 1400, 'lon': 2800}
]
test_results = {
    'netcdf': {},
    'zarr': {}
}

# Label-based slices include both end points. The window runs to the END of
# June 5 so that the fifth time step is selected: the source file names carry
# a 09:00:00 stamp, so a slice ending at June 5 00:00 would presumably stop
# after four steps (TODO confirm against the dataset's time coordinate).
time_slice = slice(datetime(int(year), 6, 1), datetime(int(year), 6, 5, 23, 59, 59))
lat_slice = slice(10, 50)
lon_slice = slice(-150, -100)
spatial_dims = ['lat', 'lon']


def _timed(action):
    """Call ``action()`` and return ``(result, elapsed_seconds)``.

    Uses the monotonic ``time.perf_counter`` clock so the measurement is not
    disturbed by system clock adjustments.
    """
    started = time.perf_counter()
    result = action()
    return result, time.perf_counter() - started


def _time_queries(ds, chunks_str, results):
    """Time the two mean computations on ``ds`` and record them in ``results``.

    ``ds`` must expose an ``analysed_sst`` variable with ``time``, ``lat`` and
    ``lon`` dimensions. ``.values`` forces the lazy computation to run so the
    full read + reduce cost is measured. ``results`` is the per-chunking dict
    that receives the 'global 5 day slice' and 'bbox 100 day' durations.
    """
    # Global spatial mean over the 5 day time window.
    _, test_duration = _timed(
        lambda: ds.analysed_sst.sel(time=time_slice).mean(spatial_dims).values)
    print(f"time to calculate mean for 5 day time slice for chunking {chunks_str}: {test_duration} seconds")
    results['global 5 day slice'] = test_duration

    # Spatial mean over the lat/lon bounding box for every time step.
    _, test_duration = _timed(
        lambda: ds.analysed_sst.sel(lat=lat_slice, lon=lon_slice).mean(spatial_dims).values)
    print(f"time to calculate mean for lat/lon slice for chunking {chunks_str}: {test_duration} seconds")
    results['bbox 100 day'] = test_duration


for chunks in chunk_opts:
    # e.g. {'time': 1, 'lat': 4000, 'lon': 8000} -> '1x4000x8000'; this is
    # also the name of the matching Zarr store under zarr_prefix.
    chunks_str = 'x'.join(map(str, chunks.values()))
    print(chunks_str)
    test_results['netcdf'][chunks_str] = {}
    test_results['zarr'][chunks_str] = {}

    # ---- NetCDF: open the daily files as one dataset with this chunking ----
    ds_netcdf, test_duration = _timed(
        lambda: xr.open_mfdataset(fileObjs, chunks=chunks, combine='by_coords', parallel=True))
    print(f"time to open files for chunking {chunks_str}: {test_duration} seconds")
    test_results['netcdf'][chunks_str]['open'] = test_duration
    _time_queries(ds_netcdf, chunks_str, test_results['netcdf'][chunks_str])

    # ---- Zarr: open the store written with this chunking ----
    # The path is printed before the timer starts so console I/O is not
    # counted as part of the open time.
    print(f"{zarr_prefix}{chunks_str}")
    ds_zarr, test_duration = _timed(
        lambda: xr.open_zarr(f"{zarr_prefix}{chunks_str}"))
    print(f"time to open files for chunking {chunks_str}: {test_duration} seconds")
    test_results['zarr'][chunks_str]['open'] = test_duration
    _time_queries(ds_zarr, chunks_str, test_results['zarr'][chunks_str])
    

1x4000x8000
time to open files for chunking 1x4000x8000: 0.781994104385376 seconds
time to calculate mean for 5 day time slice for chunking 1x4000x8000: 4.7349512577056885 seconds
time to calculate mean for lat/lon slice for chunking 1x4000x8000: 4.255879640579224 seconds
/data/mursst_netcdf/zarr/1x4000x8000
time to open files for chunking 1x4000x8000: 0.2458817958831787 seconds
time to calculate mean for 5 day time slice for chunking 1x4000x8000: 2.9352259635925293 seconds
time to calculate mean for lat/lon slice for chunking 1x4000x8000: 2.650949001312256 seconds
2x3000x7000
time to open files for chunking 2x3000x7000: 0.9736356735229492 seconds
time to calculate mean for 5 day time slice for chunking 2x3000x7000: 4.60493278503418 seconds
time to calculate mean for lat/lon slice for chunking 2x3000x7000: 5.234242916107178 seconds
/data/mursst_netcdf/zarr/2x3000x7000
time to open files for chunking 2x3000x7000: 0.04452872276306152 seconds
time to calculate mean for 5 day time slice fo

In [16]:
# Dump the collected timings (seconds), keyed by format -> chunk shape -> test.
print(json.dumps(test_results, indent=2))

{
  "netcdf": {
    "1x4000x8000": {
      "open": 0.781994104385376,
      "global 5 day slice": 4.7349512577056885,
      "bbox 100 day": 4.255879640579224
    },
    "2x3000x7000": {
      "open": 0.9736356735229492,
      "global 5 day slice": 4.60493278503418,
      "bbox 100 day": 5.234242916107178
    },
    "5x1800x3600": {
      "open": 1.2085046768188477,
      "global 5 day slice": 5.894567489624023,
      "bbox 100 day": 7.596440553665161
    },
    "10x1400x2800": {
      "open": 1.5857770442962646,
      "global 5 day slice": 6.7964417934417725,
      "bbox 100 day": 6.2446510791778564
    }
  },
  "zarr": {
    "1x4000x8000": {
      "open": 0.2458817958831787,
      "global 5 day slice": 2.9352259635925293,
      "bbox 100 day": 2.650949001312256
    },
    "2x3000x7000": {
      "open": 0.04452872276306152,
      "global 5 day slice": 3.150022506713867,
      "bbox 100 day": 3.561424732208252
    },
    "5x1800x3600": {
      "open": 0.3231527805328369,
      "global 5