# Create a synthetic data set, store, load, rechunk, store

In [23]:
import xarray as xr
import numpy as np

---

## create a synthetic dataset

In [36]:
# Grid sizes along x, y and t. Larger alternatives are kept below, disabled.
#nx, ny, nt = 1028, 1028, 1028*4
#nx, ny, nt = 128, 128, 256
nx, ny, nt = 32, 32, 64

# Start from a dataset that only carries the integer index coordinates.
ds = xr.Dataset({}, coords={'x':np.arange(nx),'y':np.arange(ny), 't': np.arange(nt)})

# v(t, x, y) = t * cos(pi/180/100 * x) * cos(pi/180/50 * y), obtained by
# broadcasting the three 1-D coordinates against each other.
ds = ds.assign(v=ds.t*np.cos(np.pi/180./100*ds.x)*np.cos(np.pi/180./50*ds.y))

# One chunk per time step, x and y each split in two.
# Floor division keeps the chunk sizes integers: `nx/2` is a float under
# Python 3 true division, while chunk sizes are documented as ints.
ds = ds.chunk({'t': 1, 'x': nx//2, 'y': ny//2})

#ds = ds.chunk({'x': 10, 'y': 10})

print(ds)

#ds.isel(t=-1).v.plot()

<xarray.Dataset>
Dimensions:  (t: 64, x: 32, y: 32)
Coordinates:
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * t        (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Data variables:
    v        (t, x, y) float64 dask.array<shape=(64, 32, 32), chunksize=(1, 16, 16)>


In [22]:
# Disabled debugging helpers for inspecting the dask graph behind `v`
# (graph rendering via dask.dot, or dumping the raw graph as a dict).
#from dask.dot import dot_graph
#dot_graph(ds['v'].dask)
#dict(ds['v'].data.dask)

---

## store

In [31]:
# Write two zarr stores, overwriting any existing ones (mode='w'):
# the first time step on its own, then the complete dataset.
first_step = ds.isel(t=0)
first_step.to_zarr('data_t0.zarr', mode='w')
ds.to_zarr('data.zarr', mode='w')

<xarray.backends.zarr.ZarrStore at 0x2aaae3f9e860>

In [20]:
# Load the full dataset back from the 'data.zarr' store; from here on `ds`
# refers to the on-disk version rather than the in-memory synthetic one.
ds = xr.open_zarr('data.zarr')
print(ds)

<xarray.Dataset>
Dimensions:  (t: 64, x: 32, y: 32)
Coordinates:
  * t        (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Data variables:
    v        (t, x, y) float64 dask.array<shape=(64, 32, 32), chunksize=(1, 16, 16)>


In [21]:
# Look at the first time slice and at the encodings attached to `v` and `t`
# after the dataset was read back from zarr.
print(ds.isel(t=0))
print(ds['v'].isel(t=0).encoding)
print(ds['t'].isel(t=0).encoding)

# Writing the slice is expected to fail: the variables still carry the
# 'chunks' encoding of the full arrays, while in the slice `t` is a scalar
# (ndim 0). The error is caught and printed so that it stays visible as the
# demonstration without stopping a top-to-bottom run of the notebook.
try:
    ds.isel(t=0).to_zarr('data_t0.zarr', mode='w')
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

<xarray.Dataset>
Dimensions:  (x: 32, y: 32)
Coordinates:
    t        int64 0
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Data variables:
    v        (x, y) float64 dask.array<shape=(32, 32), chunksize=(16, 16)>
{'chunks': (1, 16, 16), 'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0), 'filters': None, '_FillValue': nan, 'dtype': dtype('float64')}
{'chunks': (64,), 'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0), 'filters': None, 'dtype': dtype('int64')}


ValueError: zarr chunks tuple (64,) must have same length as variable.ndim 0

In [22]:
# Workaround: remove the stale 'chunks' entries from the encodings so that
# the time slice can be written.
# pop(..., None) instead of `del` keeps this cell re-runnable: a second
# execution would otherwise raise KeyError because the key is already gone.
ds['v'].encoding.pop('chunks', None)
ds['t'].encoding.pop('chunks', None)

# Same inspection as before; the printed encodings no longer list 'chunks'.
print(ds.isel(t=0))
print(ds['v'].isel(t=0).encoding)
print(ds['t'].isel(t=0).encoding)

# With the chunk encoding gone, storing the slice goes through.
ds.isel(t=0).to_zarr('data_t0.zarr', mode='w')

<xarray.Dataset>
Dimensions:  (x: 32, y: 32)
Coordinates:
    t        int64 0
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Data variables:
    v        (x, y) float64 dask.array<shape=(32, 32), chunksize=(16, 16)>
{'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0), 'filters': None, '_FillValue': nan, 'dtype': dtype('float64')}
{'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0), 'filters': None, 'dtype': dtype('int64')}


<xarray.backends.zarr.ZarrStore at 0x2aaae3f34f98>

In [19]:
# Read the single-time-step store back to check what was written.
# A separate name is used on purpose: rebinding `ds` here would replace the
# full (t, x, y) dataset with a slice that has no `t` dimension, and the
# rechunking cells further down (which chunk along 't') would then break when
# the notebook is run top to bottom.
ds_t0 = xr.open_zarr('data_t0.zarr')
print(ds_t0)
# `t` survives only as a scalar coordinate.
print(ds_t0.t)

<xarray.Dataset>
Dimensions:  (x: 32, y: 32)
Coordinates:
    t        int64 ...
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Data variables:
    v        (x, y) float64 dask.array<shape=(32, 32), chunksize=(16, 16)>
<xarray.DataArray 't' ()>
array(0)
Coordinates:
    t        int64 ...


---

In [37]:
# Display the current `ds` before it is rechunked in the following cell.
ds

<xarray.Dataset>
Dimensions:  (t: 64, x: 32, y: 32)
Coordinates:
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * t        (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Data variables:
    v        (t, x, y) float64 dask.array<shape=(64, 32, 32), chunksize=(1, 16, 16)>

In [38]:
# Rechunk: a single chunk spanning all of t, x and y each split in four.
# Floor division keeps the chunk sizes integers (`nx/4` is a float in
# Python 3, whereas chunk sizes are documented as ints).
ds = ds.chunk({'t': nt, 'x': nx//4, 'y': ny//4})

# Store the rechunked dataset, overwriting any previous store.
ds.to_zarr('data_rechunked.zarr', mode='w')

<xarray.backends.zarr.ZarrStore at 0x2aab53e235c0>

In [39]:
# Reopen the original store so the next cells rechunk a dataset that was
# loaded from zarr (its printed chunksize is (1, 16, 16) again).
ds = xr.open_zarr('data.zarr')
print(ds)

<xarray.Dataset>
Dimensions:  (t: 64, x: 32, y: 32)
Coordinates:
  * t        (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Data variables:
    v        (t, x, y) float64 dask.array<shape=(64, 32, 32), chunksize=(1, 16, 16)>


In [40]:
# Same rechunking as for the in-memory dataset, with integer chunk sizes
# (floor division; `nx/4` would be a float in Python 3).
ds = ds.chunk({'t': nt, 'x': nx//4, 'y': ny//4})

# For a dataset loaded from zarr this write is expected to fail: the zarr
# chunks recorded at load time (1, 16, 16) conflict with the new dask chunks.
# The error is caught and printed so the demonstration does not stop a
# top-to-bottom run. ValueError is included as well in case the installed
# xarray reports this conflict with that type -- NOTE(review): confirm.
try:
    ds.to_zarr('data_rechunked.zarr', mode='w')
except (NotImplementedError, ValueError) as err:
    print(f'{type(err).__name__}: {err}')

NotImplementedError: Specified zarr chunks (1, 16, 16) would overlap multiple dask chunks ((64,), (8, 8, 8, 8), (8, 8, 8, 8)). This is not implemented in xarray yet.  Consider rechunking the data using `chunk()` or specifying different chunks in encoding.

In [41]:
# Working recipe for rechunking a dataset that was loaded from zarr.
ds = xr.open_zarr('data.zarr')

# One chunk along t, x and y each split in four; floor division keeps the
# chunk sizes integers (`nx/4` would be a float in Python 3).
ds = ds.chunk({'t': nt, 'x': nx//4, 'y': ny//4})

# Drop the zarr chunk layout remembered from the source store for `v`, which
# otherwise conflicts with the new dask chunks on write. The equivalent
# removal for `t` is left disabled below.
del ds['v'].encoding['chunks']
#del ds['t'].encoding['chunks']
ds.to_zarr('data_rechunked.zarr', mode='w')

<xarray.backends.zarr.ZarrStore at 0x2aab53ead6d8>