# Loop and Append to Zarr Store

Assumes the Zarr store has already been initialized.

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob

In [2]:
# Start a Dask cluster on this machine (default settings) and connect a
# client to it; the dask-backed xarray work in the cells below runs on it.
cluster = LocalCluster()
client = Client(cluster)
# Bare expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:58301  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 17.18 GB


In [3]:
# Tell blosc not to use its own threads: parallelism is already provided by
# the dask workers, so blosc-internal threading would only compete with them.
numcodecs.blosc.use_threads = False

In [4]:
# Invariants shared by every cell below.

# Zarr store that the batches are written / appended to.
store_dir = '/Volumes/Elements/zarr/10x1800x3600-2/'

# Root of the NetCDF inputs; generate_file_list globs
# "{prefix}/{year}/{doy}/*.nc" underneath it.
prefix = '/Volumes/Elements/netcdf'
year = 2002

# Chunk sizes per dimension, used both when opening the NetCDF files and
# when rechunking before the write.
chunks = dict(time=10, lat=1800, lon=3600)

In [5]:
def generate_file_list(start_doy, end_doy):
    """
    Build the list of NetCDF file paths for a range of days of the year.

    For every ``doy`` in ``range(start_doy, end_doy)`` the pattern
    ``{prefix}/{year}/{doy}/*.nc`` is globbed and the first match (in sorted
    order) is kept, so the result holds one path per day.

    Relies on the notebook-level variables ``prefix`` and ``year`` having
    been defined already:

    * ``prefix`` -- root directory of the NetCDF files. It must be a path on
      the local filesystem, because the lookup is done with ``glob``.
    * ``year`` -- the 4 digit year used as the first sub-directory.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year to stop at. This day itself is NOT included.

    Returns
    -------
    list of str
        One file path per day, in day order. Empty when
        ``start_doy >= end_doy``.

    Raises
    ------
    IndexError
        If no ``.nc`` file exists for one of the requested days (for example
        when the range runs past the last day of the year). The message
        names the offending day and the pattern that was searched.
    """
    fileObjs = []

    for doy in range(start_doy, end_doy):
        pattern = f"{prefix}/{year}/{doy}/*.nc"
        # glob returns matches in arbitrary order; sort so that the file
        # picked for a day is deterministic.
        matches = sorted(glob.glob(pattern))
        if not matches:
            # Same exception type as indexing the empty list would raise,
            # but with enough context to see which day is missing.
            raise IndexError(
                f"no .nc file found for day-of-year {doy} (pattern: {pattern})"
            )
        fileObjs.append(matches[0])
    return fileObjs

## Loop and Append

In [13]:
%%time
start_doy = 252
end_doy = start_doy
number_batches_to_append = 2
batch_size = 100
final_end_doy = start_doy + (number_batches_to_append * batch_size)

while start_doy < final_end_doy:
    end_doy = start_doy + batch_size   
    fileObjs = generate_file_list(start_doy, end_doy)
    print(fileObjs[0])
    print(f"start doy: {start_doy}, file: {fileObjs[0].split('/')[-1]}")
    print(f"end doy: {end_doy}, file: {fileObjs[-1].split('/')[-1]}")          
    ds = xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords')
    ds_rechunk = ds.chunk(chunks=chunks)
    if start_doy == 152:
          ds_rechunk.to_zarr(store_dir, consolidated=True, mode='w')
    else:
          ds_rechunk.to_zarr(store_dir, consolidated=True, mode='a', append_dim='time')
    start_doy = end_doy
    print(f"Done with this batch")
    print()


/Volumes/Elements/netcdf/2002/252/20020909090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
start doy: 252, file: 20020909090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 352, file: 20021217090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc




Done with this batch



IndexError: list index out of range

## Handling failures

If the loop above fails partway through, we can rebuild just the batch on which the error occurred as a separate subset store and then copy it into the final data store.

In [None]:
# Re-open only the day range of the failed batch (end_doy is exclusive, so
# the last file printed below belongs to day end_doy - 1).
start_doy, end_doy = 177, 182
fileObjs = generate_file_list(start_doy, end_doy)
print(fileObjs[0])
print(f"start doy: {start_doy}, file: {fileObjs[0].rpartition('/')[2]}")
print(f"end doy: {end_doy}, file: {fileObjs[-1].rpartition('/')[2]}")
ds = xr.open_mfdataset(
    fileObjs,
    combine='by_coords',
    parallel=True,
    chunks=chunks,
)
ds_rechunk = ds.chunk(chunks=chunks)
# Separate store that would receive the rebuilt subset.
subset_source = '/Volumes/Elements/zarr/subset/'

# The write and the copy into the main store are left disabled; uncomment
# the lines below to run them.
# ds_rechunk.to_zarr(subset_source, consolidated=True, mode='w')

# existing_group = zarr.open(store=store_dir)
# subset_group = zarr.open(store=subset_source)
# zarr.copy(subset_group, existing_group, name='mursst', if_exists='replace')

# Now we test

In [None]:
from datetime import datetime

# File list for the comparison below: days 152 up to, but not including, 352.
start_doy, end_doy = 152, 352
fileObjs = generate_file_list(start_doy, end_doy)
print(fileObjs[0])
print(f"start doy: {start_doy}, file: {fileObjs[0].rpartition('/')[2]}")
print(f"end doy: {end_doy}, file: {fileObjs[-1].rpartition('/')[2]}")

In [43]:
# Time window used by both comparison queries below:
# 2002-06-01 00:00 through 2002-07-31 00:00.
time_slice = slice(datetime(2002, 6, 1), datetime(2002, 7, 31))

In [44]:
%%time
ds_netcdf = xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords')
ds_netcdf.analysed_sst.sel(lat=slice(10,50), lon=slice(-150,-100), time=time_slice).mean({'lat','lon'}).values

CPU times: user 10.7 s, sys: 1.04 s, total: 11.8 s
Wall time: 53.6 s


array([293.13544, 293.1559 , 293.20792, 293.23874, 293.21085, 293.21237,
       293.19904, 293.19226, 293.18427, 293.1665 , 293.14133, 293.13684,
       293.22424, 293.3292 , 293.41763, 293.5121 , 293.59473, 293.65005,
       293.73868, 293.7867 , 293.81558, 293.84576, 293.86066, 293.88812,
       293.91266, 293.9739 , 294.02036, 294.09225, 294.12524, 294.2123 ,
       294.25143, 294.27628, 294.31146, 294.31287, 294.32843, 294.36154,
       294.43106, 294.4748 , 294.5472 , 294.63104, 294.6977 , 294.76428,
       294.7722 , 294.75235, 294.7109 , 294.704  , 294.70987, 294.7429 ,
       294.87073, 294.84946, 294.8339 , 294.8852 , 294.9408 , 295.00803,
       295.04187, 295.06644, 295.08246, 295.11935, 295.12418, 295.1287 ],
      dtype=float32)

In [42]:
%%time
existing_ds = xr.open_zarr(store_dir)
existing_ds.analysed_sst.sel(lat=slice(10,50),lon=slice(-150,-100), time=time_slice).mean({'lat','lon'}).values

CPU times: user 6.67 s, sys: 696 ms, total: 7.36 s
Wall time: 59.6 s


array([293.13544, 293.1559 , 293.20792, 293.23874, 293.21085, 293.21237,
       293.19904, 293.19226, 293.18427, 293.1665 , 293.14133, 293.13684,
       293.22424, 293.3292 , 293.41763, 293.5121 , 293.59473, 293.65005,
       293.73868, 293.7867 , 293.81558, 293.84576, 293.86066, 293.88812,
       293.91266, 293.9739 , 294.02036, 294.09225, 294.12524, 294.2123 ,
       294.25143, 294.27628, 294.31146, 294.31287, 294.32843, 294.36154,
       294.43106, 294.4748 , 294.5472 , 294.63104, 294.6977 , 294.76428,
       294.7722 , 294.75235, 294.7109 , 294.704  , 294.70987, 294.7429 ,
       294.87073, 294.84946, 294.8339 , 294.8852 , 294.9408 , 295.00803,
       295.04187, 295.06644, 295.08246, 295.11935, 295.12418, 295.1287 ,
       295.1586 , 295.48337, 295.47583, 295.46277, 295.4418 , 295.36972,
       295.4617 , 295.49762, 295.52625, 295.5996 , 295.69183, 295.8068 ,
       295.93246, 295.9977 , 296.00717, 296.0098 , 296.0006 , 296.03256,
       296.06427, 296.05164, 296.04807, 296.08508, 