In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
from dask.distributed import Client, progress, LocalCluster
import zarr
import glob

In [2]:
# Open the local Zarr store to inspect what has been written to it so far.
xr.open_zarr(store='zarr/test')

<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 20)
Coordinates:
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * time              (time) datetime64[ns] 2002-06-01T09:00:00 ... 2002-06-20T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    analysis_error    (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    mask              (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
Attributes:
    Conventions:                CF-1.5
    Metadata_Conventions:       Unidata Observation Dataset v1.0
    acknowledgment:             Please acknowledge the use of these data with...
    cdm_data_type:              grid
    comment

In [3]:
# Start a dask cluster on this machine with default settings and connect a
# client to it.
cluster = LocalCluster()
client = Client(cluster)
# Leave the client as the last expression so the notebook renders its summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:65147  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 17.18 GB


In [4]:
# Tell blosc not to use its own threads: parallelism in this notebook comes
# from dask, so the compressor should not add threading on top of it.
numcodecs.blosc.use_threads = False

In [34]:
# Parameters selecting the first batch of input files. Files are looked up
# under <prefix>/<year>/<day of year>/ by the cell that opens the dataset.
prefix = '/Volumes/Elements/v4.1/'
year = 2002
days_of_year = [152, 153]

In [7]:
# Collect one NetCDF file per requested day of year, then open them as a single
# dask-backed dataset chunked 1000 x 1000 in lat/lon.
fileObjs = []

for doy in days_of_year:
    pattern = f"{prefix}/{year}/{doy}/*.nc"
    # Sort so the chosen file is deterministic if a directory holds several.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail with the offending pattern instead of a bare IndexError.
        raise FileNotFoundError(f"No NetCDF file matches {pattern!r}")
    fileObjs.append(matches[0])

print(fileObjs)
chunks = {'lat': 1000, 'lon': 1000}
# combine='by_coords' orders the files by their coordinate values; concat_dim
# only applies to combine='nested' and is rejected by newer xarray releases
# when given together with by_coords, so it is not passed here.
ds = xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords')
ds

['/Volumes/Elements/v4.1//2002/152/20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/Elements/v4.1//2002/153/20020602090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc']


<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 2)
Coordinates:
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * time              (time) datetime64[ns] 2002-06-01T09:00:00 2002-06-02T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    analysis_error    (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    mask              (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
Attributes:
    Conventions:                CF-1.5
    title:                      Daily MUR SST, Final product
    summary:                    A merged, multi-sensor L4 Foundation SST anal...
    references:                 http://podaac.jpl.nasa.go

In [12]:
# Use the same Blosc compressor (zstd, level 5, automatic shuffle) for every
# data variable in the dataset when it is written to Zarr.
compressor = zarr.Blosc(
    cname='zstd',
    clevel=5,
    shuffle=zarr.Blosc.AUTOSHUFFLE,
)
encoding = {name: dict(compressor=compressor) for name in ds.data_vars}

In [None]:
%%time
ds.to_zarr('zarr/test/', consolidated=True, mode='w', encoding=encoding)

In [8]:
# Select the next batch of days, collect one NetCDF file per day, and open them
# as a single dask-backed dataset chunked 1000 x 1000 in lat/lon.
days_of_year = list(range(154, 156))
fileObjs = []

for doy in days_of_year:
    pattern = f"{prefix}/{year}/{doy}/*.nc"
    # Sort so the chosen file is deterministic if a directory holds several.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail with the offending pattern instead of a bare IndexError.
        raise FileNotFoundError(f"No NetCDF file matches {pattern!r}")
    fileObjs.append(matches[0])

print(fileObjs)
chunks = {'lat': 1000, 'lon': 1000}
# combine='by_coords' orders the files by their coordinate values; concat_dim
# only applies to combine='nested' and is rejected by newer xarray releases
# when given together with by_coords, so it is not passed here.
ds = xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords')
ds

['/Volumes/Elements/v4.1//2002/154/20020603090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/Elements/v4.1//2002/155/20020604090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc']


<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 2)
Coordinates:
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * time              (time) datetime64[ns] 2002-06-03T09:00:00 2002-06-04T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    analysis_error    (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    mask              (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
Attributes:
    Conventions:                CF-1.5
    title:                      Daily MUR SST, Final product
    summary:                    A merged, multi-sensor L4 Foundation SST anal...
    references:                 http://podaac.jpl.nasa.go

In [31]:
# Re-open the local Zarr store to check which time steps it currently holds.
xr.open_zarr(store='zarr/test')

<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 16)
Coordinates:
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * time              (time) datetime64[ns] 2002-06-01T09:00:00 ... 2002-06-16T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    analysis_error    (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    mask              (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
Attributes:
    Conventions:                CF-1.5
    Metadata_Conventions:       Unidata Observation Dataset v1.0
    acknowledgment:             Please acknowledge the use of these data with...
    cdm_data_type:              grid
    comment

In [18]:
%%time
ds.to_zarr('zarr/test/', consolidated=True, mode='a', append_dim='time')

CPU times: user 55.5 s, sys: 3.3 s, total: 58.8 s
Wall time: 2min 50s


<xarray.backends.zarr.ZarrStore at 0x114e3bba0>

In [32]:
# Select the next batch of days, collect one NetCDF file per day, and open them
# as a single dask-backed dataset chunked 1000 x 1000 in lat/lon.
days_of_year = list(range(168, 172))
fileObjs = []

for doy in days_of_year:
    pattern = f"{prefix}/{year}/{doy}/*.nc"
    # Sort so the chosen file is deterministic if a directory holds several.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Fail with the offending pattern instead of a bare IndexError.
        raise FileNotFoundError(f"No NetCDF file matches {pattern!r}")
    fileObjs.append(matches[0])

print(fileObjs)
chunks = {'lat': 1000, 'lon': 1000}
# combine='by_coords' orders the files by their coordinate values; concat_dim
# only applies to combine='nested' and is rejected by newer xarray releases
# when given together with by_coords, so it is not passed here.
ds = xr.open_mfdataset(fileObjs, chunks=chunks, parallel=True, combine='by_coords')
ds

['/Volumes/Elements/v4.1//2002/168/20020617090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/Elements/v4.1//2002/169/20020618090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/Elements/v4.1//2002/170/20020619090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', '/Volumes/Elements/v4.1//2002/171/20020620090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc']


<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 4)
Coordinates:
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * time              (time) datetime64[ns] 2002-06-17T09:00:00 ... 2002-06-20T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    analysis_error    (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    mask              (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<chunksize=(1, 1000, 1000), meta=np.ndarray>
Attributes:
    Conventions:                CF-1.5
    title:                      Daily MUR SST, Final product
    summary:                    A merged, multi-sensor L4 Foundation SST anal...
    references:                 http://podaac.jpl.nas

In [33]:
%%time
ds.to_zarr('zarr/test/', consolidated=True, mode='a', append_dim='time')

CPU times: user 2min 12s, sys: 9.1 s, total: 2min 21s
Wall time: 6min 54s


<xarray.backends.zarr.ZarrStore at 0x1212ecd00>