# Loop and Append to Zarr Store

Assumes the Zarr store has already been initialized.

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob

In [2]:
# Start a local dask cluster and connect a client to it; the trailing expression
# displays the client summary (scheduler address, dashboard link, workers).
cluster = LocalCluster()
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:65204  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 17.18 GB


In [3]:
# Tell blosc not to use threads since we are using dask to parallelize.
# NOTE(review): this assignment runs in the notebook process; confirm whether the
# dask worker processes need the same setting applied (e.g. via client.run).
numcodecs.blosc.use_threads = False

In [4]:
# Invariants shared by the cells below.
year = 2002  # 4-digit year; used as a path component by generate_file_list
prefix = '/Volumes/Elements/netcdf'  # root directory holding the source NetCDF files
# Chunk sizes per dimension passed to open_mfdataset and Dataset.chunk.
# NOTE(review): the store opened from store_dir reports time chunks of 10, not 100 —
# presumably because each appended batch holds only 10 time steps; confirm the intent.
chunks = {'time': 100, 'lat': 500, 'lon': 1000}
store_dir = '/Volumes/Elements/zarr/100x500x1000-1/'  # Zarr store that is written to and read back

In [5]:
# Open the existing Zarr store. The data variables come back as dask arrays
# (see the repr below); the trailing expression displays the dataset summary.
existing_ds = xr.open_zarr(store_dir)
existing_ds

<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 80)
Coordinates:
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * time              (time) datetime64[ns] 2002-06-01T09:00:00 ... 2002-08-19T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<chunksize=(10, 500, 1000), meta=np.ndarray>
    analysis_error    (time, lat, lon) float32 dask.array<chunksize=(10, 500, 1000), meta=np.ndarray>
    mask              (time, lat, lon) float32 dask.array<chunksize=(10, 500, 1000), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<chunksize=(10, 500, 1000), meta=np.ndarray>
Attributes:
    Conventions:                CF-1.5
    Metadata_Conventions:       Unidata Observation Dataset v1.0
    acknowledgment:             Please acknowledge the use of these data with...
    cdm_data_type:              grid
    comment

In [6]:
def generate_file_list(start_doy, end_doy):
    """
    Build the list of NetCDF file paths for a range of days of the year.

    Relies on the notebook-level variables ``prefix`` and ``year``, which must
    be defined before this function is called. For each day the pattern
    ``{prefix}/{year}/{doy}/*.nc`` is globbed; ``doy`` is interpolated as-is
    (no zero padding). ``prefix`` must be a local directory, because ``glob``
    only searches the local filesystem.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year to stop at; this day itself is NOT included
        (same half-open convention as ``range``).

    Returns
    -------
    list of str
        One file path per day, in day order. Empty if ``start_doy >= end_doy``.

    Raises
    ------
    FileNotFoundError
        If no ``.nc`` file exists for one of the requested days.
    """
    fileObjs = []

    for doy in range(start_doy, end_doy):
        pattern = f"{prefix}/{year}/{doy}/*.nc"
        # glob returns matches in arbitrary order; sort so the file chosen for
        # a day is the same on every run when several files match.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(
                f"No NetCDF file matches {pattern!r} (day of year {doy})"
            )
        fileObjs.append(matches[0])
    return fileObjs

## Loop and Append

In [None]:
%%time
# Write the daily NetCDF files to the Zarr store in fixed-size batches along 'time'.
#
# WARNING: the batch that starts on `store_init_doy` is written with mode='w',
# which creates the store and overwrites one that already exists at store_dir.
# Every other batch is appended along the 'time' dimension.
store_init_doy = 152
start_doy = 152
end_doy = start_doy
number_batches_to_append = 8
batch_size = 10
final_end_doy = start_doy + (number_batches_to_append * batch_size)

while start_doy < final_end_doy:
    # end_doy is exclusive: this batch covers days start_doy .. end_doy - 1.
    end_doy = start_doy + batch_size
    fileObjs = generate_file_list(start_doy, end_doy)
    print(fileObjs[0])
    print(f"start doy: {start_doy}, file: {fileObjs[0].split('/')[-1]}")
    print(f"end doy: {end_doy}, file: {fileObjs[-1].split('/')[-1]}")
    # Open the batch as a context manager so the underlying NetCDF file handles
    # are released once the batch is written (or if the write fails), instead
    # of accumulating across iterations.
    with xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords') as ds:
        # Each source file contributes its own chunks along 'time'; rechunk the
        # combined dataset to the requested layout before writing.
        ds_rechunk = ds.chunk(chunks=chunks)
        if start_doy == store_init_doy:
            ds_rechunk.to_zarr(store_dir, consolidated=True, mode='w')
        else:
            ds_rechunk.to_zarr(store_dir, consolidated=True, mode='a', append_dim='time')
    start_doy = end_doy
    print("Done with this batch")
    print()


# Now we test

In [7]:
# Re-open the same day range (152 up to, but not including, 232) straight from
# the NetCDF files so the Zarr store can be compared against the source data.
start_doy, end_doy = 152, 232
fileObjs = generate_file_list(start_doy, end_doy)
first_path = fileObjs[0]
last_path = fileObjs[-1]
print(first_path)
print(f"start doy: {start_doy}, file: {first_path.split('/')[-1]}")
print(f"end doy: {end_doy}, file: {last_path.split('/')[-1]}")
ds_netcdf = xr.open_mfdataset(
    fileObjs,
    chunks=chunks,
    parallel=True,
    combine='by_coords',
)

/Volumes/Elements/netcdf/2002/152/20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
start doy: 152, file: 20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 232, file: 20020819090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc


In [8]:
%%time
# Per-time-step mean of analysed_sst over the lat 10..50 / lon -150..-140 box,
# computed from the NetCDF files.
sst_box_netcdf = ds_netcdf.analysed_sst.sel(lat=slice(10, 50), lon=slice(-150, -140))
sst_box_netcdf.mean({'lat', 'lon'}).values

CPU times: user 9.4 s, sys: 681 ms, total: 10.1 s
Wall time: 25.1 s


array([292.37662, 292.4691 , 292.5197 , 292.58658, 292.53912, 292.51685,
       292.52118, 292.5701 , 292.6216 , 292.67664, 292.68253, 292.67676,
       292.7034 , 292.79172, 292.84357, 292.91672, 292.99316, 293.0918 ,
       293.18192, 293.1904 , 293.16492, 293.17023, 293.19888, 293.19775,
       293.19473, 293.2463 , 293.29922, 293.33945, 293.3479 , 293.45016,
       293.52686, 293.60767, 293.7037 , 293.693  , 293.68152, 293.67117,
       293.66757, 293.63553, 293.67514, 293.73798, 293.82193, 293.9002 ,
       293.92432, 293.92905, 293.92215, 293.88916, 293.87814, 294.13165,
       294.18274, 294.18027, 294.23386, 294.29837, 294.3725 , 294.48947,
       294.53397, 294.55908, 294.60065, 294.63242, 294.669  , 294.67908,
       294.6661 , 294.96368, 294.96442, 294.9959 , 294.989  , 294.9685 ,
       295.1215 , 295.1334 , 295.15146, 295.23117, 295.28967, 295.3094 ,
       295.40082, 295.52194, 295.6026 , 295.67233, 295.70416, 295.7932 ,
       295.86548, 295.88718], dtype=float32)

In [9]:
%%time

# Same box mean as for the NetCDF files, this time read from the Zarr store;
# the two result arrays are expected to match.
sst_box_zarr = existing_ds.analysed_sst.sel(lat=slice(10, 50), lon=slice(-150, -140))
sst_box_zarr.mean({'lat', 'lon'}).values

CPU times: user 2.5 s, sys: 246 ms, total: 2.75 s
Wall time: 13.1 s


array([292.37662, 292.4691 , 292.5197 , 292.58658, 292.53912, 292.51685,
       292.52118, 292.5701 , 292.6216 , 292.67664, 292.68253, 292.67676,
       292.7034 , 292.79172, 292.84357, 292.91672, 292.99316, 293.0918 ,
       293.18192, 293.1904 , 293.16492, 293.17023, 293.19888, 293.19775,
       293.19473, 293.2463 , 293.29922, 293.33945, 293.3479 , 293.45016,
       293.52686, 293.60767, 293.7037 , 293.693  , 293.68152, 293.67117,
       293.66757, 293.63553, 293.67514, 293.73798, 293.82193, 293.9002 ,
       293.92432, 293.92905, 293.92215, 293.88916, 293.87814, 294.13165,
       294.18274, 294.18027, 294.23386, 294.29837, 294.3725 , 294.48947,
       294.53397, 294.55908, 294.60065, 294.63242, 294.669  , 294.67908,
       294.6661 , 294.96368, 294.96442, 294.9959 , 294.989  , 294.9685 ,
       295.1215 , 295.1334 , 295.15146, 295.23117, 295.28967, 295.3094 ,
       295.40082, 295.52194, 295.6026 , 295.67233, 295.70416, 295.7932 ,
       295.86548, 295.88718], dtype=float32)