# rewrite data with larger chunks along time

Use [rechunker](https://github.com/pangeo-data/rechunker), see [doc](https://rechunker.readthedocs.io/en/latest/)


In [1]:
import os, sys
from shutil import rmtree
from tqdm import tqdm

import numpy as np
import dask
import xarray as xr
from matplotlib import pyplot as plt
%matplotlib inline

from rechunker import rechunk

import mitequinox.utils as ut
import mitequinox.sigp as sp

In [2]:
from dask_jobqueue import PBSCluster
from dask.distributed import Client

# PBS-backed dask cluster: 2 cores and a single worker process per job,
# 6 h walltime (a lighter alternative is PBSCluster(cores=1));
# print(cluster.job_script()) shows the generated submission script
cluster = PBSCluster(cores=2, processes=1, walltime='06:00:00')

# request 10 PBS jobs
w = cluster.scale(jobs=10)

# client handle used to talk to the cluster scheduler
client = Client(cluster)

In [4]:
# display the cluster widget
cluster

VBox(children=(HTML(value='<h2>PBSCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .d…

---
# Load the standard layout, rechunk and store right away

In [6]:
# variables processed below, over all faces (alternative set: ['SST', 'SSS'])
V = ['SSU', 'SSV']

# subsampling factor handed to sp.transpose_rechunk
subsampling = 2

# rechunking parameters (time, then space), handed to sp.transpose_rechunk
# presets kept from earlier experiments, written as Nt0 (time) and Nc (space):
#Nt0, Nc = 24*20, (288, 96) # file size ~
#Nt0, Nc = 24*160, (72, 48) # file size ~
#Nt0, Nc = 0, (72, 24) # file size ~ XMB files
# NOTE(review): the dev cells further down still read Nt0 and Nc, which are
# only defined if one of the presets above is uncommented
chunks = (0, 72, 48) # file size ~ XMB files

# time line shared by the variables in V
df = ut.load_common_timeline(V)

2011-11-15 00:00:00  to  2012-11-15 00:00:00


In [12]:
# Rechunk every variable in V, all faces in a single call.
# This only works on the subsampled grid (subsample=2); with the full grid
# the faces could not be processed together and were handled one at a time:
#     for face in ds.face:
#         %time transpose_rechunk(ds, chunks, df['iter'], face=face, subsampling=subsampling, overwrite=False, max_mem='25GB')
for v in tqdm(V):
    print(' --- {}'.format(v))
    ds = ut.load_data(v)
    sp.transpose_rechunk(
        ds, chunks, df['iter'],
        subsampling=subsampling, overwrite=False, max_mem='25GB',
    )

  0%|          | 0/2 [00:00<?, ?it/s]

 --- SSU


  (<zarr.core.Array '/SSU' (8784, 13, 2160, 2160) fl ... 83, 12, 0, 0)")
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
 50%|█████     | 1/2 [55:15<55:15, 3315.37s/it]

 rechunking over
 --- SSV


100%|██████████| 2/2 [1:47:23<00:00, 3221.54s/it]

 rechunking over





In [13]:
# shut down the dask cluster now that the rechunking loop has finished
cluster.close()

In [14]:
# inspect the dataset left over from the last loop iteration (v = 'SSV')
ds

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,70.28 kB
Shape,"(8785,)","(8785,)"
Count,2 Tasks,1 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 70.28 kB 70.28 kB Shape (8785,) (8785,) Count 2 Tasks 1 Chunks Type datetime64[ns] numpy.ndarray",8785  1,

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,70.28 kB
Shape,"(8785,)","(8785,)"
Count,2 Tasks,1 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,8 B
Shape,"(8785,)","(1,)"
Count,8786 Tasks,8785 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 70.28 kB 8 B Shape (8785,) (1,) Count 8786 Tasks 8785 Chunks Type int64 numpy.ndarray",8785  1,

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,8 B
Shape,"(8785,)","(1,)"
Count,8786 Tasks,8785 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.53 TB,74.65 MB
Shape,"(8785, 13, 4320, 4320)","(1, 1, 4320, 4320)"
Count,114206 Tasks,114205 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 8.53 TB 74.65 MB Shape (8785, 13, 4320, 4320) (1, 1, 4320, 4320) Count 114206 Tasks 114205 Chunks Type float32 numpy.ndarray",8785  1  4320  4320  13,

Unnamed: 0,Array,Chunk
Bytes,8.53 TB,74.65 MB
Shape,"(8785, 13, 4320, 4320)","(1, 1, 4320, 4320)"
Count,114206 Tasks,114205 Chunks
Type,float32,numpy.ndarray


---

## Development: step-by-step rechunking of a single variable and face

In [7]:
# Prepare a single variable / single face dataset for the rechunking tests below.
# NOTE(review): work_data_dir, load_data and Nt0 are not defined in any visible
# cell (the production loop above calls ut.load_data; Nt0 only appears in
# commented-out lines) - presumably leftovers from an earlier kernel state,
# TODO confirm before a fresh "Run All"

# directory receiving the rechunked archives
out_dir = work_data_dir+'rechunked/'  # /work/ALT/swot/aval/syn/rechunked/
#out_dir = '/work/scratch/pontea/' # 2TB max
# if True, an existing target archive is deleted before rechunking
overwrite = True

# work on the first variable of V only
v = V[0]
print(v)

ds = load_data(v)

# select common time line: times whose iteration number matches the first and
# last entries of df['iter']
t0 = ds['time'].where(ds.iters==df['iter'][0],drop=True).values[0]
t1 = ds['time'].where(ds.iters==df['iter'][-1],drop=True).values[0]
ds = ds.sel(time=slice(t0,t1))

# evaluate the auxiliary variables eagerly
ds['dtime'] = ds['dtime'].compute()
ds['iters'] = ds['iters'].compute()

# keep a single face
face=1
ds = ds.isel(face=face)

# Nt0 == 0 means "use the whole time line" (minus one step)
Nt = len(ds.time)-1 if Nt0 == 0 else Nt0
# -1 is to obtain 8784 which you can divide by 4**2

# necessary ?
# truncate the time axis to the largest multiple of Nt
ds = ds.isel(time=slice(len(ds.time)//Nt *Nt))

SSU


In [15]:
# chunk layout with last argument 0: per the printed output, the whole time
# line (Nt) sits in one chunk and the horizontal chunk sizes are those of Nc
d = get_chunks((Nt,)+Nc, v, 0)

{'time': 8784, 'face': 1, 'i_g': 72, 'j': 24}
Individual chunk size = 60.7 MB


In [16]:
# last argument 1: the printed layout has time chunks 4x shorter and
# horizontal chunks 2x larger than with 0
get_chunks((Nt,)+Nc, v, 1)

{'time': 2196, 'face': 1, 'i_g': 144, 'j': 48}
Individual chunk size = 60.7 MB


{'SSU': {'time': 2196, 'face': 1, 'i_g': 144, 'j': 48},
 'time': None,
 'face': None,
 'i_g': None,
 'j': None,
 'dtime': (-1,),
 'iters': (-1,)}

In [17]:
# last argument 2: the printed layout has time chunks 16x shorter and
# horizontal chunks 4x larger than with 0
get_chunks((Nt,)+Nc, v, 2)

{'time': 549, 'face': 1, 'i_g': 288, 'j': 96}
Individual chunk size = 60.7 MB


{'SSU': {'time': 549, 'face': 1, 'i_g': 288, 'j': 96},
 'time': None,
 'face': None,
 'i_g': None,
 'j': None,
 'dtime': (-1,),
 'iters': (-1,)}

In [19]:
# rechunker parameters
max_mem = '30GB'
# os.path.join avoids the doubled separator ('...//SSU.zarr') that plain string
# concatenation produces when out_dir already ends with '/'
target_store = os.path.join(out_dir, v+'.zarr')
temp_store = os.path.join(out_dir, v+'-tmp.zarr')

# clean archives if necessary
if os.path.isdir(target_store):
    if not overwrite:
        # explicit exception rather than `assert False`: assertions are
        # stripped under `python -O`, which would let the run go on and write
        # into the existing archive
        raise FileExistsError('Archive exists and you do not want to overwrite')
    rmtree(target_store)
# the temporary archive is always rebuilt from scratch
rmtree(temp_store, ignore_errors=True)

# init rechunker: build the rechunking plan (nothing is executed yet)
target_chunks = get_chunks((Nt,)+Nc, v, 1)
r = rechunk(ds, target_chunks, max_mem, target_store, temp_store=temp_store)

# intermediate chunk shapes noted for different max_mem values
### 1 pass:
# 100GB: (1, 648, 4320) # 5MB file size 
# 50GB: (1, 312, 4320) # 5MB file size 
# 20GB: (1, 120, 4320)
# 10GB: (1, 48, 4320)
# 1GB: (1, 24, 1152)

### 2 pass - step 1:
# 40GB: (1, 1008, 4320) # 14MB file size
# 30GB: (1, 768, 4320) # 11MB file size

# summary of source / intermediate / target chunking
#r._intermediate['SSU'].info
print_rechunk(r, v)

{'time': 2196, 'face': 1, 'i_g': 144, 'j': 48}
Individual chunk size = 60.7 MB
Source data size: 		 8784x4320x4320 	 655.7GB
Source chunk size: 		 1x4320x4320 		 74.6MB
Source number of files: 		 8784
Intermediate chunk size: 	 1x768x4320 		 13.3MB
Intermediate number of files: 		 49410
Target chunk size: 		 2196x48x144 		 60.7MB
Target number of files: 		 10800


Full transposition

```
{'time': 8784, 'face': 1, 'i_g': 72, 'j': 24}
Individual chunk size = 15.2 MB
Source data size: 		 8784x4320x4320 	 655.7GB
Source chunk size: 		 1x4320x4320 		 74.6MB
Source number of files: 		 	 8784
Intermediate chunk size: 	 1x192x4320 		 3.3MB
Intermediate number of files: 		 197640
Target chunk size: 		 8784x24x72 		 60.7MB
Target number of files: 		 	 10800
```

Fractional transposition

```
{'time': 2196, 'face': 1, 'i_g': 144, 'j': 48}
Individual chunk size = 15.2 MB
Source data size: 		 8784x4320x4320 	 655.7GB
Source chunk size: 		 1x4320x4320 		 74.6MB
Source number of files: 			 8784
Intermediate chunk size: 	 1x768x4320 		 13.3MB
Intermediate number of files: 		 49410
Target chunk size: 		 2196x48x144 		 60.7MB
Target number of files: 			 10800
```

In [20]:
# execute the rechunking plan r and report the elapsed time
%time result = r.execute()

CPU times: user 2min 40s, sys: 7.77 s, total: 2min 48s
Wall time: 20min 50s


In [45]:
# metadata (shape, chunk shape, compressor, size) of the SSU array in the result
result['SSU'].info

0,1
Name,/SSU
Type,zarr.core.Array
Data type,float32
Shape,"(8784, 4320, 4320)"
Chunk shape,"(2196, 48, 144)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,655722086400 (610.7G)


In [33]:
# show the path of the target archive
target_store

'/work/scratch/pontea//SSU.zarr'

In [32]:
# open the target archive with xarray and display it
ds_out = xr.open_zarr(target_store)
ds_out

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,70.28 kB
Shape,"(8785,)","(8785,)"
Count,2 Tasks,1 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 70.28 kB 70.28 kB Shape (8785,) (8785,) Count 2 Tasks 1 Chunks Type datetime64[ns] numpy.ndarray",8785  1,

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,70.28 kB
Shape,"(8785,)","(8785,)"
Count,2 Tasks,1 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,70.28 kB
Shape,"(8785,)","(8785,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 70.28 kB 70.28 kB Shape (8785,) (8785,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",8785  1,

Unnamed: 0,Array,Chunk
Bytes,70.28 kB,70.28 kB
Shape,"(8785,)","(8785,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,655.80 GB,74.65 MB
Shape,"(8785, 4320, 4320)","(1, 4320, 4320)"
Count,8786 Tasks,8785 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 655.80 GB 74.65 MB Shape (8785, 4320, 4320) (1, 4320, 4320) Count 8786 Tasks 8785 Chunks Type float32 numpy.ndarray",4320  4320  8785,

Unnamed: 0,Array,Chunk
Bytes,655.80 GB,74.65 MB
Shape,"(8785, 4320, 4320)","(1, 4320, 4320)"
Count,8786 Tasks,8785 Chunks
Type,float32,numpy.ndarray


In [35]:
import zarr

In [38]:
# open the temporary (intermediate) archive directly with zarr and print
# its hierarchy
#ds_out = xr.open_zarr(temp_store)
zarr_out = zarr.open(temp_store)
print(zarr_out.tree())

/
 └── SSU (8785, 4320, 4320) float32


In [40]:
# metadata of the SSU array stored in the temporary archive
zarr_out['SSU'].info

0,1
Name,/SSU
Type,zarr.core.Array
Data type,float32
Shape,"(8785, 4320, 4320)"
Chunk shape,"(1, 4320, 4320)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,655796736000 (610.8G)


In [None]:
# loop around variables

for v in V:
    
    out_dir = work_data_dir+'rechunked/'  # /work/ALT/swot/aval/syn/rechunked/
    #out_dir = '/work/scratch/pontea/' # 2TB max
    overwrite = True

    print(' --- '+v)

    ds = load_data(v)

    # select common time line
    t0 = ds['time'].where(ds.iters==df['iter'][0],drop=True).values[0]
    t1 = ds['time'].where(ds.iters==df['iter'][-1],drop=True).values[0]
    ds = ds.sel(time=slice(t0,t1))

    ds['dtime'] = ds['dtime'].compute()
    ds['iters'] = ds['iters'].compute()

    face=1
    ds = ds.isel(face=face)

    Nt = len(ds.time)-1 if Nt0 == 0 else Nt0
    # -1 is to obtain 8784 which you can divide by 4**2

    # necessary ?
    ds = ds.isel(time=slice(len(ds.time)//Nt *Nt))
    
    # rechunker parameters
    max_mem = '30GB'
    target_store = out_dir+v+'_f{:02d}.zarr'.format(face)
    temp_store = out_dir+'tmp.zarr'

    # clean archives if necessary
    if os.path.isdir(target_store):
        if overwrite:
            rmtree(target_store)
        else:
            assert False, 'Archive exists and you do not want to overwrite'
    rmtree(temp_store, ignore_errors=True)

    # init rechunker
    target_chunks = get_chunks((Nt,)+Nc, v, 1)
    r = rechunk(ds, target_chunks, max_mem, target_store, temp_store=temp_store)    
    
    print_rechunk(r, v)
    
    # exec
    %time result = r.execute()
    
    # clean up intermediate file
    rmtree(temp_store, ignore_errors=True)
    

---

In [33]:
# restart the dask workers attached to this client
client.restart()

0,1
Client  Scheduler: tcp://10.135.39.21:38424  Dashboard: /user/pontea/proxy/40997/status,Cluster  Workers: 4  Cores: 8  Memory: 400.00 GB


In [None]:
# shut down the dask cluster
cluster.close()