In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob

In [2]:
# Start a Dask LocalCluster with default settings and attach a client to it;
# the xarray/zarr work below is scheduled through this client.
cluster = LocalCluster()
client = Client(cluster)
# Bare last expression so the notebook renders the client/cluster summary.
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:59010  Dashboard: http://127.0.0.1:59011/status,Cluster  Workers: 4  Cores: 4  Memory: 17.18 GB


In [3]:
# Turn off Blosc's own multithreading: parallelism in this notebook comes from
# the Dask workers, so the compressor should not spawn threads of its own.
numcodecs.blosc.use_threads = False

In [11]:
# !ls /Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1

In [4]:
# Run configuration (nothing is opened here): input location, day-of-year
# range, chunk sizes, and the output Zarr store path.
year = 2002
# Root of the input archive; generate_file_list globs <prefix>/<year>/<doy>/*.nc.
prefix = '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1'
# Day-of-year range for the initial store. end_doy is EXCLUSIVE because
# generate_file_list iterates range(start_doy, end_doy), i.e. days 152..156.
start_doy = 152
end_doy = 157
# Chunk sizes passed to xr.open_mfdataset and Dataset.chunk below.
chunks = {'lat': 500, 'lon': 500, 'time': 5}
# Output Zarr store; NOTE(review): absolute local path, adjust per machine.
store_dir = '/Volumes/Elements/zarr/test-500x500x5/'

In [6]:
# NOTE(review): this opens the Zarr store at store_dir, but the store is only
# created further down by ds_rechunk.to_zarr(..., mode='w'). On a fresh kernel
# with no existing store this cell presumably fails, and `ds` is reassigned
# later by xr.open_mfdataset. Consider moving this inspection after the write.
ds = xr.open_zarr(store_dir)

In [17]:
# NOTE(review): interactive IPython help lookup left over from development;
# it has no effect on the results and can be dropped from the final notebook.
?ds.sel

In [13]:
def generate_file_list(start_doy, end_doy, data_prefix=None, data_year=None):
    """Return one NetCDF file path per day of year in [start_doy, end_doy).

    For each day the directory ``<prefix>/<year>/<doy>/`` is globbed for
    ``*.nc`` files and the first match (in sorted order) is kept.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year to stop at (exclusive, as with ``range``).
    data_prefix : str, optional
        Root directory of the archive. Defaults to the notebook-level
        ``prefix`` variable.
    data_year : int or str, optional
        Year sub-directory. Defaults to the notebook-level ``year`` variable.

    Returns
    -------
    list of str
        File paths ordered by day of year; empty if ``start_doy >= end_doy``.

    Raises
    ------
    FileNotFoundError
        If a day in the range has no ``*.nc`` file, instead of the opaque
        ``IndexError`` that indexing an empty glob result would produce.
    """
    # Fall back to the notebook globals only when no explicit value is given,
    # so existing two-argument calls behave exactly as before.
    root = prefix if data_prefix is None else data_prefix
    yr = year if data_year is None else data_year

    fileObjs = []
    for doy in range(start_doy, end_doy):
        # NOTE(review): the day directory is used unpadded (e.g. "152");
        # confirm how days below 100 are named in the archive.
        pattern = f"{root}/{yr}/{doy}/*.nc"
        # Sort so the chosen file is deterministic if a day has several matches.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(f"No NetCDF file found matching {pattern}")
        fileObjs.append(matches[0])
    return fileObjs

## Initialize the store

In [14]:
# Collect the daily files for the initial day-of-year range and show them.
fileObjs = generate_file_list(start_doy, end_doy)
print(fileObjs)
# combine='by_coords' orders and concatenates the files from their own
# coordinate values, so concat_dim is not passed: it has no effect with
# by_coords and newer xarray releases reject the combination.
ds = xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords')
# Rechunk the combined dataset so the requested chunk sizes (notably along
# time) apply across the file boundaries rather than per input file.
ds_rechunk = ds.chunk(chunks=chunks)
# Bare last expression so the notebook renders the dataset summary.
ds_rechunk

['/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/152/20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/153/20020602090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/154/20020603090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/155/20020604090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/156/20020605090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc']


<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 5)
Coordinates:
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * time              (time) datetime64[ns] 2002-06-01T09:00:00 ... 2002-06-05T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<chunksize=(5, 500, 500), meta=np.ndarray>
    analysis_error    (time, lat, lon) float32 dask.array<chunksize=(5, 500, 500), meta=np.ndarray>
    mask              (time, lat, lon) float32 dask.array<chunksize=(5, 500, 500), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<chunksize=(5, 500, 500), meta=np.ndarray>
Attributes:
    Conventions:                CF-1.5
    title:                      Daily MUR SST, Final product
    summary:                    A merged, multi-sensor L4 Foundation SST anal...
    references:                 http://podaac.jpl.nasa.gov/Mu

In [15]:
# Blosc compressor (zstd, level 5, automatic shuffle) shared by all variables.
compressor = zarr.Blosc(
    cname='zstd',
    clevel=5,
    shuffle=zarr.Blosc.AUTOSHUFFLE,
)
# Per-variable encoding: every data variable gets its own dict pointing at the
# compressor defined above.
encoding = dict(
    (var_name, {'compressor': compressor})
    for var_name in ds_rechunk.data_vars
)

In [16]:
%%time
ds_rechunk.to_zarr(store_dir, consolidated=True, mode='w', encoding=encoding)

CPU times: user 9min 20s, sys: 29 s, total: 9min 49s
Wall time: 20min 54s


<xarray.backends.zarr.ZarrStore at 0x119971eb8>

## Loop and Append

In [18]:
%%time
start_doy = 157
end_doy = start_doy
number_days_to_append = 6
final_end_doy = start_doy + number_days_to_append
date_batch_size = 5

while start_doy < (final_end_doy - date_batch_size):
    start_doy = end_doy
    end_doy = start_doy + date_batch_size
    print(f"start doy: {start_doy}")
    print(f"end doy: {end_doy}")    
    fileObjs = generate_file_list(start_doy, end_doy)
    print(fileObjs)
    ds = xr.open_mfdataset(fileObjs, chunks=chunks, concat_dim='time', parallel=True, combine='by_coords')
    ds_rechunk = ds.chunk(chunks=chunks)
    ds_rechunk.to_zarr(store_dir, consolidated=True, mode='a', append_dim='time')
    print(f"Done with this batch")
    print()

start doy: 157
end doy: 162
['/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/157/20020606090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/158/20020607090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/159/20020608090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/160/20020609090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/161/20020610090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc']
Done with this batch

start doy: 162
end doy: 167
['/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/162/20020611090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/2002/163/20020612090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-