# Loop and Append to Zarr Store

In [1]:
import glob
import time

import numcodecs
import numpy as np
import pandas as pd
import xarray as xr
import zarr
from dask.distributed import Client, progress, LocalCluster

In [2]:
# Start a Dask cluster on the local machine and attach a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Bare last expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:46815  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 32  Memory: 132.24 GB


In [3]:
# Tell blosc not to use threads since we are using dask to parallelize.
# This is a module-level switch on numcodecs.blosc, set once for the session.
numcodecs.blosc.use_threads = False

In [4]:
# Invariants: values every later cell reads and none of them reassigns.
year = 2006
prefix = '/data/mursst_netcdf/v4.1'
chunks = {'time': 10, 'lat': 1800, 'lon': 3600}

# The store directory is named after the chunk sizes joined with "x",
# in the order the dimensions appear in `chunks`.
path = 'x'.join(f"{size}" for size in chunks.values())
store_dir = f"/data/mursst_netcdf/zarr/{path}"
print(f"zarr store directory: {store_dir}")

zarr store directory: /data/mursst_netcdf/zarr/10x1800x3600


In [5]:
def generate_file_list(start_doy, end_doy, data_prefix=None, data_year=None):
    """
    Return one NetCDF file path per day of year in ``[start_doy, end_doy)``.

    Each day is looked up with the glob pattern ``{prefix}/{year}/{DDD}/*.nc``,
    where ``DDD`` is the day of year zero-padded to three digits. When a day
    directory holds several matches, the alphabetically first one is used so
    the result is deterministic.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year to stop at. Exclusive: ``generate_file_list(1, 366)``
        covers days 1 through 365.
    data_prefix : str, optional
        Root directory of the data. Defaults to the notebook-level ``prefix``
        variable, which must then already be defined.
    data_year : int or str, optional
        Four-digit year directory. Defaults to the notebook-level ``year``
        variable, which must then already be defined.

    Returns
    -------
    list of str
        One file path per day, in day order. Empty when
        ``start_doy >= end_doy``.

    Raises
    ------
    FileNotFoundError
        If no ``*.nc`` file matches for one of the requested days.
    """
    # Fall back to the notebook globals only when no override was passed.
    root = prefix if data_prefix is None else data_prefix
    yr = year if data_year is None else data_year

    fileObjs = []
    for doy in range(start_doy, end_doy):
        pattern = f"{root}/{yr}/{doy:03d}/*.nc"
        # glob returns matches in arbitrary order; sort before picking one.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(
                f"no NetCDF file found for day of year {doy:03d} (pattern: {pattern})"
            )
        fileObjs.append(matches[0])
    return fileObjs

## Loop and Append

In [None]:
%%time
# Append the daily NetCDF files to the Zarr store, `batch_size` days at a time.
start_doy = 11
number_batches_to_append = 36
batch_size = 10
# Exclusive upper bound on day of year, so 365 is the last day read.
last_doy_exclusive = 366
# Clamp the overall stop to that bound. Unclamped it would be 371, and the
# loop would run once more with start_doy == end_doy == 366: an empty file
# list, then an IndexError on fileObjs[0].
final_end_doy = min(last_doy_exclusive, start_doy + (number_batches_to_append * batch_size))

while start_doy < final_end_doy:
    # end_doy is exclusive; the final batch may be shorter than batch_size.
    end_doy = min(final_end_doy, start_doy + batch_size)
    fileObjs = generate_file_list(start_doy, end_doy)
    print(f"start doy: {start_doy}, file: {fileObjs[0].split('/')[-1]}")
    print(f"end doy: {end_doy}, file: {fileObjs[-1].split('/')[-1]}")
    # Open the batch in a `with` block so its files are closed after the write.
    with xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords') as ds:
        ds_rechunk = ds.chunk(chunks=chunks)
        if start_doy == 1:
            # A batch starting at day 1 (re)creates the store.
            ds_rechunk.to_zarr(store_dir, consolidated=True, mode='w')
        else:
            # Every other batch is appended along the time dimension.
            ds_rechunk.to_zarr(store_dir, consolidated=True, mode='a', append_dim='time')
    start_doy = end_doy
    print("Done with this batch")
    print()

start doy: 11, file: 20060111090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 21, file: 20060120090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 21, file: 20060121090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 31, file: 20060130090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 31, file: 20060131090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 41, file: 20060209090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 41, file: 20060210090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 51, file: 20060219090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 51, file: 20060220090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 61, file: 20060301090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 61, file: 20060302090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.n

In [None]:
# Open the store written by the loop; as the cell's last expression, the
# resulting dataset is displayed by the notebook.
xr.open_zarr(store_dir)

In [None]:
# Add data test: the Zarr store must reproduce the source NetCDF files.
# Both datasets are reduced to a per-time-step mean of analysed_sst over the
# same lat/lon box, and the two series are then compared.
zarr_ds = xr.open_zarr(store_dir)
start_doy = 1
end_doy = 366  # exclusive, so days 1 through 365 are read
fileObjs = generate_file_list(start_doy, end_doy)
netcdf_ds = xr.open_mfdataset(fileObjs, parallel=True, chunks=chunks, combine='by_coords')
lat_slice = slice(10, 50)
lon_slice = slice(-150, -100)

# perf_counter is a monotonic clock meant for measuring intervals.
start_time = time.perf_counter()
netcdf_means = netcdf_ds.analysed_sst.sel(lat=lat_slice, lon=lon_slice).mean(['lat', 'lon']).values
print(f"time to analyze netcdf: {time.perf_counter()-start_time}")

start_time = time.perf_counter()
zarr_means = zarr_ds.analysed_sst.sel(lat=lat_slice, lon=lon_slice).mean(['lat', 'lon']).values
print(f"time to analyze zarr: {time.perf_counter()-start_time}")

# First/last values kept under their original names for any later cell.
netcdf_r0, netcdf_r1 = netcdf_means[0], netcdf_means[-1]
zarr_r0, zarr_r1 = zarr_means[0], zarr_means[-1]

# Compare every time step with a tolerance rather than exact float equality.
# A difference in the number of time steps fails here as a shape mismatch.
np.testing.assert_allclose(zarr_means, netcdf_means, rtol=1e-5, equal_nan=True)
print('tests passed')