In [None]:
!pip install xarray

In [1]:
from dask.distributed import Client, progress, LocalCluster
from rechunker import rechunk
import s3fs
import xarray as xr
import zarr
import dask.array as dsa
import shutil
from dask.diagnostics import ProgressBar

## Start the Dask Cluster

In [2]:
# Start a local Dask cluster with 20 workers and connect a client to it.
# The bare `client` expression displays the scheduler/dashboard summary.
cluster = LocalCluster(n_workers=20)
client = Client(cluster)
client
# Teardown calls, kept commented out; run them manually when finished:
# client.close()
# cluster.shutdown()

0,1
Client  Scheduler: tcp://127.0.0.1:42319  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 20  Cores: 40  Memory: 130.02 GB


## Load the MUR SST Dataset from AWS PDS S3

In [3]:
# Anonymous S3 access (no credentials) to the MUR SST bucket in us-west-2.
s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='us-west-2'))
s3_store = s3fs.S3Map(root='mur-sst/zarr', s3=s3, check=False)
# Open the store read-only via its consolidated metadata.
# TODO: decide whether mask_and_scale=False is wanted here.
# NOTE(review): mask_and_scale looks like an xarray decoding option rather than a
# zarr.open_consolidated argument -- confirm before adding it to this call.
ds_zarr = zarr.open_consolidated(s3_store, mode='r')

In [None]:
%%time
# Open the same consolidated Zarr store with xarray (ds_zarr is the zarr-level handle);
# the %%time cell magic reports how long the open takes.
ds = xr.open_zarr(s3_store, consolidated=True)

In [None]:
# Display the dataset summary.
ds

In [None]:
# Preview the dataset with the target chunking (6443 along time, 50 x 100 in lat/lon).
# The result is only displayed, not assigned to a name.
ds.chunk({'time':6443, 'lat':50, 'lon':100})

## Toy example

In [None]:
# Read a 100 x 100 x 100 corner of analysed_sst from the remote store.
first_hundred = slice(0, 100)
subset = ds_zarr['analysed_sst'].get_orthogonal_selection(
    (first_hundred, first_hundred, first_hundred)
)

# Write that corner to a local directory store with deliberately tiny (2, 2, 2)
# chunks, giving the rechunking demo below something to do.
baby_zarr_store_dir = 'data/example.zarr'
baby_zarr_store = zarr.DirectoryStore(baby_zarr_store_dir)
root = zarr.group(store=baby_zarr_store, overwrite=True)
root.create_dataset('analysed_sst', chunks=(2, 2, 2), data=subset)

In [None]:
# Grab the toy array from the local group and display its current chunk shape.
source_array = root['analysed_sst']
source_array.chunks

In [None]:
# Plan a rechunk of the toy array to (10, 10, 10) chunks.
target_chunks = (10, 10, 10)
max_mem = '1GB'  # memory limit handed to rechunker
target_store = 'data/rechunked.zarr'
temp_store = 'data/rechunked-tmp.zarr'

# Pass temp_store by keyword: per the rechunker API reference, newer releases
# take target_options as the fifth positional parameter, so a positional
# temporary store would be misread there.
array_plan = rechunk(
    source_array,
    target_chunks,
    max_mem,
    target_store,
    temp_store=temp_store,
)
array_plan

In [None]:
# Kick off the toy rechunk and show a progress display for it.
# NOTE(review): the real example further down runs its plan with array_plan.execute();
# confirm that the installed rechunker version's plan object also provides persist().
future = array_plan.persist()
progress(future)

In [None]:
# Verify the result: open the rechunked store and display its chunk shape.
# Open read-only so that inspecting the output can neither modify the store nor
# silently create an empty one if the rechunk did not produce it.
a = zarr.open(target_store, mode='r')
a.chunks

In [None]:
# Load the rechunked store as a dask array and display it.
dsa.from_zarr(target_store)

## A real example

In [4]:
#!pip install awscli

In [5]:
# Filesystem for the output bucket in us-east-1. No anon flag is passed here,
# unlike the read-only filesystem used for the source data.
# Note that this rebinds the name `s3`.
s3 = s3fs.S3FileSystem(client_kwargs={'region_name': 'us-east-1'})

# Destination store for the rechunked analysed_sst array.
s3_rechunk_store = s3fs.S3Map(
    root='aimee-pangeo-data/mursst/analysed_sst/6443x50x100.zarr',
    s3=s3,
    create=True,
)

# Scratch store for rechunker's intermediate data.
s3_tmp_store = s3fs.S3Map(
    root='aimee-pangeo-data/mursst/tmp.zarr',
    s3=s3,
    create=True,
)

In [8]:
# Earlier per-array chunk layout, kept for reference:
# target_chunks = {
#     'analysed_sst': {'time': 379, 'lat': 439, 'lon': 360},
#     'time': None, # don't rechunk this array
#     'lon': None,
#     'lat': None,
# }
# Target layout: 6443 steps along time per chunk, 50 x 100 tiles in lat/lon.
target_chunks = {'time': 6443, 'lat': 50, 'lon': 100}
max_mem = '2GB'  # memory limit handed to rechunker

# Pass the temporary store by keyword: per the rechunker API reference, newer
# releases take target_options as the fifth positional parameter, so a
# positional temporary store would be misread there.
array_plan = rechunk(
    ds_zarr['analysed_sst'],
    target_chunks,
    max_mem,
    s3_rechunk_store,
    temp_store=s3_tmp_store,
)
array_plan

Unnamed: 0,Array,Chunk
Bytes,8.35 TB,2.00 GB
Shape,"(6443, 17999, 36000)","(6443, 50, 3100)"
Count,4321 Tasks,4320 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 8.35 TB 2.00 GB Shape (6443, 17999, 36000) (6443, 50, 3100) Count 4321 Tasks 4320 Chunks Type int16 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,8.35 TB,2.00 GB
Shape,"(6443, 17999, 36000)","(6443, 50, 3100)"
Count,4321 Tasks,4320 Chunks
Type,int16,numpy.ndarray


In [9]:
# Run the rechunk plan against the S3 stores.
# NOTE(review): dask.diagnostics.ProgressBar is aimed at the local schedulers; with
# the distributed Client started earlier it may print nothing -- confirm, and watch
# the cluster dashboard for progress instead if so.
with ProgressBar():
    array_plan.execute()

KeyboardInterrupt: 