In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob

In [2]:
# Start a Dask LocalCluster with default settings and connect a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Bare last expression: the notebook displays the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:46455  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 32  Memory: 132.24 GB


In [3]:
# List what is present under the Zarr location; the benchmark cell further
# down opens one entry per chunking option by name.
!ls /data/mursst_netcdf/zarr

1000x1000x5  10x1800x3600  1x1000x1000	1x4000x8000  5x1800x3600


In [5]:
#xr.open_zarr('/data/mursst_netcdf/zarr/1x1000x1000')

In [6]:
def generate_file_list(start_doy, end_doy, prefix=None, year=None):
    """
    Build the list of NetCDF file paths for a range of days of the year.

    For every day in ``range(start_doy, end_doy)`` the directory
    ``{prefix}/{year}/{doy}`` (``doy`` zero-padded to three digits) is
    searched for ``*.nc`` files and one match is added to the result.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year to stop at. Exclusive: the last day listed is
        ``end_doy - 1`` (pass 366 to cover a 365-day year).
    prefix : str, optional
        Local directory or URL path that contains the per-year directories.
        When omitted, the module-level ``prefix`` variable is used.
    year : int or str, optional
        Four digit year. When omitted, the module-level ``year`` variable
        is used.

    Returns
    -------
    list of str
        One file path per day, in day order.

    Raises
    ------
    NameError
        If ``prefix`` / ``year`` is not passed and no module-level variable
        of that name exists.
    IndexError
        If a day directory contains no ``*.nc`` file. The message names the
        pattern that produced no match.
    """
    days = range(start_doy, end_doy)
    if not days:
        # Nothing to look up, so the prefix/year settings are never needed.
        return []

    # Fall back to the notebook-level settings when not passed explicitly.
    fallbacks = globals()
    try:
        if prefix is None:
            prefix = fallbacks['prefix']
        if year is None:
            year = fallbacks['year']
    except KeyError as missing:
        raise NameError(f"name {missing.args[0]!r} is not defined") from None

    fileObjs = []
    for doy in days:
        pattern = f"{prefix}/{year}/{doy:03d}/*.nc"
        # glob returns matches in arbitrary order; sort so the chosen file is
        # deterministic when a directory holds more than one.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise IndexError(f"no NetCDF file found matching {pattern}")
        fileObjs.append(matches[0])
    return fileObjs

In [7]:
# Benchmark configuration. `year` and `prefix` are read as globals by
# generate_file_list().
year = 2006
start_doy = 1
# Used as the stop value of range() inside generate_file_list(), so the last
# day actually listed is end_doy - 1.
end_doy = 366
prefix = '/data/mursst_netcdf/v4.1'
# Concatenated directly with the chunk string when opening a Zarr store, hence
# the trailing slash.
zarr_prefix = '/data/mursst_netcdf/zarr/'
fileObjs = generate_file_list(start_doy, end_doy)

In [19]:
import time
from datetime import datetime

# Chunking schemes to benchmark. The dict *insertion order* matters: the values
# are joined with 'x' to build both the result key and the name of the Zarr
# store to open (e.g. the third entry yields '1000x1000x5').
chunk_opts = [
    {'time': 1, 'lat': 4000, 'lon': 8000},
    {'time': 1, 'lat': 1000, 'lon': 1000},
    {'lat': 1000, 'lon': 1000, 'time': 5},
    {'time': 5, 'lat': 1800, 'lon': 3600},
    {'time': 10, 'lat': 1800, 'lon': 3600},
]

# test_results[<format>][<chunks_str>][<test name>] -> duration in seconds
test_results = {
    'netcdf': {},
    'zarr': {}
}

# NOTE(review): label-based slices include both endpoints, but the end bound
# here is midnight of 06-05. If the dataset's timestamps are not at 00:00 this
# selects fewer than 5 days -- confirm against the time coordinate.
time_slice = slice(datetime.strptime(f"{year}-06-01", '%Y-%m-%d'), datetime.strptime(f"{year}-06-05", '%Y-%m-%d'))
lat_slice = slice(10, 50)
lon_slice = slice(-150, -100)


def _timed(action):
    """Call ``action()`` and return ``(result, elapsed_seconds)``.

    Uses ``time.perf_counter`` (monotonic, highest available resolution) so
    the measurement is not affected by system clock adjustments.
    """
    started = time.perf_counter()
    result = action()
    return result, time.perf_counter() - started


def _benchmark_store(open_store, chunks_str, results):
    """Run the three timed tests against one store and record the durations.

    Parameters
    ----------
    open_store : callable
        Zero-argument callable that opens and returns the dataset.
    chunks_str : str
        Label of the chunking scheme, used in the printed messages.
    results : dict
        Filled in place with the keys 'open', 'global 5 day slice' and
        'bbox 1 year', each as soon as its test finishes.

    Returns
    -------
    The dataset returned by ``open_store``.
    """
    # Test opening the store
    ds, test_duration = _timed(open_store)
    print(f"time to open files for chunking {chunks_str}: {test_duration} seconds")
    results['open'] = test_duration

    # Test averaging globally for a 5 day time slice.
    # Accessing .values is what forces the lazy computation to run.
    _, test_duration = _timed(
        lambda: ds.analysed_sst.sel(time=time_slice).mean({'lat', 'lon'}).values
    )
    print(f"time to calculate mean for 5 day time slice for chunking {chunks_str}: {test_duration} seconds")
    results['global 5 day slice'] = test_duration

    # Test averaging for 1 year for a lat/lon bbox
    _, test_duration = _timed(
        lambda: ds.analysed_sst.sel(lat=lat_slice, lon=lon_slice).mean({'lat', 'lon'}).values
    )
    print(f"time to calculate mean for lat/lon slice for chunking {chunks_str}: {test_duration} seconds")
    results['bbox 1 year'] = test_duration

    return ds


for chunks in chunk_opts:
    chunks_str = 'x'.join(map(str, chunks.values()))
    test_results['netcdf'][chunks_str] = {}
    test_results['zarr'][chunks_str] = {}

    # NetCDF: open all daily files as one dataset with the requested chunking.
    print("\nNETCDF TESTS:")
    ds_netcdf = _benchmark_store(
        lambda: xr.open_mfdataset(fileObjs, chunks=chunks, combine='by_coords', parallel=True),
        chunks_str,
        test_results['netcdf'][chunks_str],
    )

    # Zarr: open the store whose name matches this chunking scheme.
    print("\nZARR TESTS:")
    ds_zarr = _benchmark_store(
        lambda: xr.open_zarr(f"{zarr_prefix}{chunks_str}"),
        chunks_str,
        test_results['zarr'][chunks_str],
    )
    


NETCDF TESTS:
time to open files for chunking 1x4000x8000: 2.6882407665252686 seconds
time to calculate mean for 5 day time slice for chunking 1x4000x8000: 4.6291892528533936 seconds
time to calculate mean for lat/lon slice for chunking 1x4000x8000: 15.776287078857422 seconds

ZARR TESTS:
time to open files for chunking 1x4000x8000: 0.12767696380615234 seconds
time to calculate mean for 5 day time slice for chunking 1x4000x8000: 3.0347349643707275 seconds
time to calculate mean for lat/lon slice for chunking 1x4000x8000: 9.877639055252075 seconds

NETCDF TESTS:
time to open files for chunking 1x1000x1000: 12.971092700958252 seconds
time to calculate mean for 5 day time slice for chunking 1x1000x1000: 13.798918724060059 seconds
time to calculate mean for lat/lon slice for chunking 1x1000x1000: 59.00486469268799 seconds

ZARR TESTS:
time to open files for chunking 1x1000x1000: 5.618469953536987 seconds
time to calculate mean for 5 day time slice for chunking 1x1000x1000: 8.0393409729003

In [18]:
import json
# Pretty-print the collected timings (seconds) as indented JSON.
print(json.dumps(test_results, indent=2))

{
  "netcdf": {
    "1x4000x8000": {
      "open": 2.6454591751098633,
      "global 5 day slice": 4.864861726760864,
      "bbox 1 year": 15.742761373519897
    },
    "1x1000x1000": {
      "open": 12.08592963218689,
      "global 5 day slice": 13.844393730163574,
      "bbox 1 year": 59.879070520401
    },
    "1000x1000x5": {
      "open": 17.32360577583313,
      "global 5 day slice": 13.869495391845703,
      "bbox 1 year": 56.50943946838379
    },
    "5x1800x3600": {
      "open": 3.7167274951934814,
      "global 5 day slice": 5.786425590515137,
      "bbox 1 year": 27.096174478530884
    },
    "10x1800x3600": {
      "open": 3.4323573112487793,
      "global 5 day slice": 5.743644952774048,
      "bbox 1 year": 26.81326150894165
    }
  },
  "zarr": {
    "1x4000x8000": {
      "open": 0.11932897567749023,
      "global 5 day slice": 3.0444843769073486,
      "bbox 1 year": 9.824787616729736
    },
    "1x1000x1000": {
      "open": 9.184297561645508,
      "global 5 day sli