# Loop and Append to Zarr Store

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob

In [2]:
cluster = LocalCluster()
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:46287  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 32  Memory: 132.24 GB


In [3]:
# Tell blosc not to use threads since we are using dask to parallelize
numcodecs.blosc.use_threads = False

In [23]:
!du -sh /data/mursst_netcdf/zarr*

64G	/data/mursst_netcdf/zarr_enc
80G	/data/mursst_netcdf/zarr_noenc


In [20]:
# Invariants
year = 2006
prefix = '/data/mursst_netcdf'
chunks = {'time': 5, 'lat': 1000, 'lon': 1000}
path = 'x'.join(map(str, chunks.values()))
do_encoding = True
store_dir = f"/data/mursst_netcdf/zarr_enc/{path}"
if not do_encoding:
    store_dir = f"/data/mursst_netcdf/zarr_noenc/{path}"
print(f"zarr store directory: {store_dir}")

zarr store directory: /data/mursst_netcdf/zarr_enc/5x1000x1000


In [21]:
def generate_file_list(start_doy, end_doy):   
    """
    Given a start day and end end day, generate a list of file locations.
    Assumes a 'prefix' and 'year' variables have already been defined.
    'Prefix' should be a local directory or http url and path.
    'Year' should be a 4 digit year.
    """
    days_of_year = list(range(start_doy, end_doy))
    fileObjs = []

    for doy in days_of_year:
        if doy < 10:
            doy = f"00{doy}"
        elif doy >= 10 and doy < 100:
            doy = f"0{doy}"            
        file = glob.glob(f"{prefix}/{year}/{doy}/*.nc")[0]
        fileObjs.append(file)
    return fileObjs

## Loop and Append

In [22]:
%%time
start_doy = 11
end_doy = start_doy
number_batches_to_append = 71
batch_size = 5
final_end_doy = start_doy + (number_batches_to_append * batch_size)

while start_doy < final_end_doy:
    end_doy = start_doy + batch_size
    end_doy = min(366, end_doy)
    fileObjs = generate_file_list(start_doy, end_doy)
    print(f"start doy: {start_doy}, file: {fileObjs[0].split('/')[-1]}")
    print(f"end doy: {end_doy}, file: {fileObjs[-1].split('/')[-1]}")          
    ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords')
    ds_rechunk = ds.chunk(chunks=chunks)
    args = {'consolidated': True}
    if start_doy == 1:
        args['mode'] = 'w'
        if do_encoding:
            compressor = zarr.Blosc(cname='zstd', clevel=5, shuffle=zarr.Blosc.AUTOSHUFFLE)
            encoding = {v: {'compressor': compressor, 'filters': [zarr.Delta(dtype=ds[v].dtype)]} for v in ds.data_vars}
            print(f"encoding is {encoding}")          
            args['encoding'] = encoding 
    else:
        args['mode'] = 'a'
        args['append_dim'] = 'time'
    ds_rechunk.to_zarr(store_dir, **args)
    start_doy = end_doy
    print(f"Done with this batch")
    print()

start doy: 11, file: 20060111090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 16, file: 20060115090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 16, file: 20060116090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 21, file: 20060120090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 21, file: 20060121090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 26, file: 20060125090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 26, file: 20060126090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 31, file: 20060130090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 31, file: 20060131090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 36, file: 20060204090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 36, file: 20060205090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.n



Done with this batch

start doy: 101, file: 20060411090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 106, file: 20060415090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc




Done with this batch

start doy: 106, file: 20060416090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 111, file: 20060420090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc




Done with this batch

start doy: 111, file: 20060421090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 116, file: 20060425090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc




Done with this batch

start doy: 116, file: 20060426090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 121, file: 20060430090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 121, file: 20060501090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 126, file: 20060505090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 126, file: 20060506090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 131, file: 20060510090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 131, file: 20060511090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 136, file: 20060515090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 136, file: 20060516090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 141, file: 20060520090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
Done with this batch

start doy: 141, file: 20060521090000-JPL-L4_GHRS



KilledWorker: ("('concatenate-open_dataset-rechunk-merge-b4bf63db46cc104725a393e9f9a1dd0d', 1, 0, 0)", <Worker 'tcp://127.0.0.1:43451', memory: 0, processing: 3>)

## Handling Failures

In cases where workers die in the middle of the loop, the code below can be used to re-run that chunk, using `if_exists='replace'`.

In [18]:
# start_doy = 160
# end_doy = 171 
# fileObjs = generate_file_list(start_doy, end_doy)
# print(f"start doy: {start_doy}, file: {fileObjs[0].split('/')[-1]}")
# print(f"end doy: {end_doy}, file: {fileObjs[-1].split('/')[-1]}")          
# ds = xr.open_mfdataset(fileObjs, parallel=True, combine='by_coords')
# ds_rechunk = ds.chunk(chunks=chunks)
# subset_source = '/Volumes/Elements/zarr/subset/'
# args = { 'consolidated': True, 'mode': 'w' }
# if do_encoding:
#       args['encoding'] = encoding
# ds_rechunk.to_zarr(subset_source, **args)

# existing_group = zarr.open(store=store_dir)
# subset_group = zarr.open(store=subset_source)
# zarr.copy(subset_group, existing_group, name='mursst', if_exists='replace')
# xr.open_zarr(store_dir)

start doy: 160, file: 20060609090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 171, file: 20060619090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc




KilledWorker: ("('concatenate-open_dataset-rechunk-merge-3da358605fca5bd3d9b70ecab0a1a971', 0, 0, 0)", <Worker 'tcp://127.0.0.1:45175', memory: 0, processing: 3>)



In [24]:
import time
# Add data test
zarr_ds = xr.open_zarr(store_dir)
start_doy = 1
end_doy = 171
fileObjs = generate_file_list(start_doy, end_doy)
netcdf_ds = xr.open_mfdataset(fileObjs, parallel=True, chunks=chunks, combine='by_coords')
lat_slice = slice(10, 50)
lon_slice = slice(-150, -100)

start_time = time.time()
result_n = netcdf_ds.analysed_sst.sel(lat=lat_slice, lon=lon_slice).mean({'lat', 'lon'}).values
netcdf_r0 = result_n[0]
netcdf_r1 = result_n[-1]
print(f"time to analyze netcdf: {time.time()-start_time}")

start_time = time.time()
result_z = list(zarr_ds.analysed_sst.sel(lat=lat_slice, lon=lon_slice).mean({'lat', 'lon'}).values)
zarr_r0 = result_z[0]
zarr_r1 = result_z[-1]
print(f"time to analyze zarr: {time.time()-start_time}")

assert(zarr_r0 == netcdf_r0)
assert(zarr_r1 == netcdf_r1)
print('tests passed')

time to analyze netcdf: 29.157246351242065
time to analyze zarr: 16.76199698448181


AssertionError: 

In [25]:
print(result_n)
result_z

[292.86786 292.81772 292.75568 292.6665  292.6309  292.65765 292.6813
 292.65872 292.61563 292.58444 292.56137 292.5526  292.53833 292.51093
 292.4789  292.4074  292.2918  292.201   292.17047 292.1562  292.12863
 292.05167 292.0264  292.03568 292.05746 292.0475  292.0221  291.98184
 291.97452 291.98547 292.0126  292.03088 292.01566 292.00558 292.00757
 292.01373 292.042   292.06146 292.06778 292.1171  292.06863 292.0157
 291.98782 291.994   291.9954  291.97852 291.93353 291.86035 291.80835
 291.79666 291.7943  291.777   291.7612  291.79187 291.84564 291.88785
 291.84534 291.88412 291.85757 291.81372 291.78186 291.74188 291.72934
 291.71744 291.7347  291.76794 291.76416 291.71783 291.65814 291.63898
 291.62604 291.59302 291.5331  291.49637 291.50555 291.50528 291.51285
 291.47717 291.43115 291.41785 291.4563  291.51373 291.58276 291.64777
 291.69806 291.7271  291.75818 291.77322 291.78235 291.76813 291.7475
 291.73996 291.75925 291.7921  291.81244 291.79288 291.77832 291.80136
 291.8415

[293.45883,
 293.0144,
 292.92746,
 292.8175,
 292.77072,
 293.2217,
 292.8295,
 292.8111,
 292.77795,
 292.7507,
 293.1191,
 292.70792,
 292.67822,
 292.64084,
 292.60403,
 292.94064,
 292.41724,
 292.33786,
 292.31158,
 292.29578,
 292.68167,
 292.1804,
 292.1362,
 292.1302,
 292.1468,
 292.58273,
 292.10504,
 292.07504,
 292.07672,
 292.08643,
 292.55182,
 292.14117,
 292.13153,
 292.12308,
 292.12326,
 292.55746,
 292.1386,
 292.12616,
 292.1107,
 292.14777,
 292.5587,
 292.06686,
 292.03748,
 292.03915,
 292.03445,
 292.4889,
 291.9901,
 291.94357,
 291.92303,
 291.92215,
 292.35214,
 291.9231,
 291.9167,
 291.94995,
 291.99985,
 292.45718,
 291.98532,
 292.01828,
 291.97662,
 291.92462,
 292.34677,
 291.87418,
 291.86194,
 291.8444,
 291.85168,
 292.34406,
 291.90182,
 291.87097,
 291.82013,
 291.79956,
 292.2226,
 291.7453,
 291.68057,
 291.63577,
 291.64673,
 292.09244,
 291.6397,
 291.60272,
 291.56516,
 291.56085,
 292.04266,
 291.63303,
 291.69144,
 291.75287,
 291.802,
 292

In [28]:
import time
# Add data test
zarr_noenc = xr.open_zarr('/data/mursst_netcdf/zarr_noenc/5x1000x1000')
zarr_enc = xr.open_zarr('/data/mursst_netcdf/zarr_enc/5x1000x1000')
# lat_slice = slice(10, 50)
# lon_slice = slice(-150, -100)
#zarr_enc

In [29]:
start_time = time.time()
result1 = zarr_noenc.analysed_sst.mean({'lat', 'lon'}).values
print(f"time to analyze zarr with default encoding: {time.time()-start_time}")

start_time = time.time()
result2 = zarr_enc.analysed_sst.mean({'lat', 'lon'}).values
print(f"time to analyze zarr with custom encoding: {time.time()-start_time}")

print(result1)
print(result2)

time to analyze zarr with default encoding: 287.48798418045044
time to analyze zarr with custom encoding: 238.89880871772766
[286.82858 286.8154  286.8097  286.81888 286.84558 286.87357 286.8892
 286.89374 286.89636 286.9005  286.91748 286.93433 286.94803 286.95895
 286.96252 286.95325 286.92905 286.91135 286.89682 286.8866  286.8853
 286.88806 286.88403 286.8753  286.8758  286.88635 286.89218 286.90402
 286.9209  286.9284  286.93304 286.94116 286.9459  286.9502  286.95682
 286.96182 286.96262 286.95557 286.94293 286.92947 286.93164 286.94217
 286.95563 286.96225 286.97913 286.9999  287.0115  287.01614 287.02094
 287.02893 287.03705 287.0415  287.04022 287.03833 287.03632 287.0321
 287.0045  287.01553 287.01904 287.01953 287.02713 287.0286  287.01553
 287.00815 287.003   286.99857 286.99594 286.99353 286.99158 286.9858
 286.97614 286.9658  286.94604 286.93835 286.93692 286.94064 286.94543
 286.9537  286.95422 286.9468  286.93683 286.92477 286.91632 286.90884
 286.90277 286.89865 286.89