# rewrite data with larger chunks along time

In [1]:
import os, sys
import numpy as np
import dask
import xarray as xr
from matplotlib import pyplot as plt
%matplotlib inline

from mitequinox.utils import *

In [2]:
# create a dask cluster backed by PBS jobs, with cores=1 for each job
from dask_jobqueue import PBSCluster
cluster = PBSCluster(cores=1)
# uncomment to inspect the job script submitted to the queue
#print(cluster.job_script())
# ask for 30 workers
w = cluster.scale(30)

In [3]:
# get dask handles and check dask server status
from dask.distributed import Client
# attach a client to the cluster started in the previous cell
client = Client(cluster)

In [5]:
# display the client summary (scheduler address, dashboard link, workers, cores, memory)
client

0,1
Client  Scheduler: tcp://10.135.39.91:42604  Dashboard: http://10.135.39.91:8787/status,Cluster  Workers: 30  Cores: 30  Memory: 1.20 TB


---
# try to load standard layout, rechunk and store right away


In [6]:
# same but over all variables and faces
# V: names of the variables processed by the rechunking cells below
#V = ['SSU', 'SSV', 'Eta']
V = ['SST']

# parameters for rechunking: time and space
# Nt0: chunk length along time; the loops below treat Nt0 == 0 as
#      "one single chunk spanning the whole time line"
# Nc:  chunk sizes along the two horizontal dimensions, (i, j) order
#Nt0, Nc = 24*10, (96, 48) # choice 0, leads to files <4MB
Nt0, Nc = 24*20, (288, 96) # choice 1, to be tested
#
# 96 x 45 = 4320
# other choices: 432, 27 (long scheduling), 288, 864

# common timeline table for the variables in V; the cells below read its
# 'iter' column, its index and its 'file_<variable>' columns
df = load_common_timeline(V)

# zarr compression
# http://xarray.pydata.org/en/stable/io.html
# http://zarr.readthedocs.io/en/stable/tutorial.html#compressors
#compressor = None

2011-11-15 00:00:00  to  2012-11-15 00:00:00


### one face at a time, all variables

**netcdf**: Goes through with 20 workers, 20 cores, 800GB, `chunks={'face':1, 'j':480}` and all  files.
Memory saturates at 415GB, i.e. there is probably NO spilling to tmp disk. Wall time= 25min/face

**zarr**: you need to explicitly delete the chunks encoding but otherwise works great

In [8]:
out_dir = work_data_dir+'rechunked/'
overwrite=False

for v in V:

    print('--- Start '+v)
    
    #files = load_iters_date_files(v).file[:500].tolist()
    #ds0 = load_datanc(v, files=files, parallel=True)
    #if v is 'SSV':
    #    ds0 = load_datanc(v,  parallel=True, chunks={'face':1, 'j_g':480})
    #else:
    #    ds0 = load_datanc(v, parallel=True, chunks={'face':1, 'j':480})            
    #print(ds0)
    ds0 = load_data(v) #.isel(time=slice(1000))   
    
    # select common time line
    t0 = ds0['time'].where(ds0.iters==df['iter'][0],drop=True).values[0]
    t1 = ds0['time'].where(ds0.iters==df['iter'][-1],drop=True).values[0]
    ds0 = ds0.sel(time=slice(t0,t1))
  
    Nt = len(ds0.time) if Nt0 == 0 else Nt0
    print(ds0)
    
    #for face in [1]:
    #for face in [1,2,10,11]:
    for face in range(ds0['face'].size):
    
        
        ds = ds0.isel(face=face)
        #
        ds = ds.isel(time=slice(len(ds.time)//Nt *Nt))
        #
        chunks = {'time': Nt, 'i': Nc[0], 'j': Nc[1]}
        if v is 'SSU':
            chunks = {'time': Nt, 'i_g': Nc[0], 'j': Nc[1]}
        elif v is 'SSV':
            chunks = {'time': Nt, 'i': Nc[0], 'j_g': Nc[1]}
        ds = ds.chunk(chunks)
        
        # tmp, xarray zarr backend bug: 
        # https://github.com/pydata/xarray/issues/2278
        del ds['iters'].encoding['chunks']
        del ds[v].encoding['chunks']
        
        file_out = out_dir+'%s_f%02d.zarr'%(v,face)
        #print(ds)
        if not os.path.isdir(file_out) or overwrite:
            %time ds.to_zarr(file_out, mode='w')
            
        dsize = getsize(file_out)
        print(' %s face=%d  data is %.1fGB ' %(v, face, dsize/1e9))


--- Start SSU
<xarray.Dataset>
Dimensions:  (face: 13, i_g: 4320, j: 4320, time: 8785)
Coordinates:
    dtime    (time) datetime64[ns] dask.array<shape=(8785,), chunksize=(8785,)>
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
  * i_g      (i_g) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    iters    (time) int64 dask.array<shape=(8785,), chunksize=(1,)>
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * time     (time) float64 5.702e+06 5.706e+06 5.71e+06 ... 3.732e+07 3.732e+07
Data variables:
    SSU      (time, face, j, i_g) float32 dask.array<shape=(8785, 13, 4320, 4320), chunksize=(1, 1, 4320, 4320)>
CPU times: user 4min 2s, sys: 10.7 s, total: 4min 13s
Wall time: 4min 16s
 SSU face=0  data is 189.2GB 
 SSU face=1  data is 414.0GB 
 SSU face=2  data is 265.8GB 
CPU times: user 3min 58s, sys: 11.2 s, total: 4min 9s
Wall time: 4min 15s
 SSU face=3  data is 128.8GB 
CPU times: user 4min 15s, sys: 12.1 s, total: 4min 27s
Wall t

In [9]:
# close the dask cluster
cluster.close()

### rechunked all faces 

**netcdf**: Goes through with 20 workers, 20 cores, 800GB, `chunks={'face':1, 'j':480}` and **1000 files** (i.e. not the full dataset!).
Memory saturates at 515GB, approx 70% of 800GB (560GB truly), i.e. there is spilling to tmp disk. Wall time= 40min

**zarr**: scheduler takes a while to start file writing with full dataset, I had to stop it

In [None]:
out_dir = work_data_dir+'rechunked/'

for v in V:

    #files = load_iters_date_files(v).file[:1000].tolist()
    #ds0 = load_datanc(v, files=files, parallel=True)
    #ds0 = load_datanc(v, parallel=True)
    #ds0 = load_datanc(v, files=files, parallel=True, 
    #                  chunks={'face':1, 'j':480})
    ds0 = load_data(v)
    
    # select common time line
    ds0 = ds0.sel(time=slice(df.index[0],df.index[-1])    
    
    Nt = len(ds0.time) if Nt0 == 0 else Nt0
    
    #
    ds = ds0.isel(time=slice(len(ds0.time)//Nt *Nt))
    #
    chunks = {'time': Nt, 'i': Nc[0], 'j': Nc[1]}
    if v is 'SSU':
        chunks = {'time': Nt, 'i_g': Nc[0], 'j': Nc[1]}
    elif v is 'SSV':
        chunks = {'time': Nt, 'i': Nc[0], 'j_g': Nc[1]}
    ds = ds.chunk(chunks)
    print(ds)

    # tmp, xarray zarr backend bug: 
    # https://github.com/pydata/xarray/issues/2278
    del ds['iters'].encoding['chunks']
    del ds[v].encoding['chunks']    
    
    file_out = out_dir+'%s_rechunked.zarr'%(v)
    %time ds.to_zarr(file_out, mode='w')

    dsize = getsize(file_out)
    print(' %s  data is %.1fGB ' %(v, dsize/1e9))


### common time line and chunks across variables, standard layout

In [6]:
# variables to convert with the standard layout
#V = ['SSU', 'SSV', 'Eta']
V = ['SSS']
# one time step per chunk; None as horizontal chunk size is passed as is to
# Dataset.chunk in the next cell
Nt0, Nc = 1, (None, None) # leads to files ??

# common timeline table for the variables in V; the next cell reads its
# 'file_<variable>' columns
df = load_common_timeline(V)

2011-11-15 00:00:00  to  2012-11-15 00:00:00


In [7]:
out_dir = work_data_dir+'rechunked/'

for v in V:

    #ds0 = load_datanc(v, files=files, parallel=True)
    #files = df['file_'+v][:500].tolist()
    files = df['file_'+v][:].tolist()
    ds0 = load_data(v, ftype='nc', files=files, parallel=True, chunks={'face':1})
    
    Nt = len(ds0.time) if Nt0 == 0 else Nt0
        
    ds = ds0.isel(time=slice(len(ds0.time)//Nt *Nt))
    #
    chunks = {'time': Nt, 'i': Nc[0], 'j': Nc[1]}
    if v is 'SSU':
        chunks = {'time': Nt, 'i_g': Nc[0], 'j': Nc[1]}
    elif v is 'SSV':
        chunks = {'time': Nt, 'i': Nc[0], 'j_g': Nc[1]}
    ds = ds.chunk(chunks)
    #
    file_out = out_dir+'%s_std.zarr'%(v)
    #print(ds)
    if not os.path.isdir(file_out):
        try:
            %time ds.to_zarr(file_out, mode='w')
            pass
        except:
            print('Failure')
    dsize = getsize(file_out)
    print(' %s  data is %.1fGB ' %(v, dsize/1e9))


CPU times: user 21min 10s, sys: 1min 22s, total: 22min 33s
Wall time: 41min 23s
 SSS  data is 2624.2GB 


In [8]:
# close the dask cluster
cluster.close()

---

# grid under zarr format

In [10]:
# load the grid (presumably from netcdf, going by the loader name) and
# store it as a zarr archive under root_data_dir
grd = load_grdnc()
# should rechunk maybe
print(grd)
# mode='w' replaces an existing grid.zarr
grd.to_zarr(root_data_dir+'grid.zarr', mode='w')

<xarray.Dataset>
Dimensions:  (face: 13, i: 4320, i_g: 4320, j: 4320, j_g: 4320, k: 90, k_l: 90, k_p1: 91, k_u: 90)
Coordinates:
  * k        (k) int64 0 1 2 3 4 5 6 7 8 9 10 ... 80 81 82 83 84 85 86 87 88 89
    Z        (k) float32 ...
  * i_g      (i_g) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
    dyG      (face, j, i_g) float32 dask.array<shape=(13, 4320, 4320), chunksize=(1, 4320, 4320)>
  * i        (i) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    XC       (face, j, i) float32 dask.array<shape=(13, 4320, 4320), chunksize=(1, 4320, 4320)>
  * j_g      (j_g) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    YG       (face, j_g, i_g) float32 dask.array<shape=(13, 4320, 4320), chunksize=(1, 4320, 4320)>
  * k_u      (k_u) int64 0 1 2 3 4 5 6 7 8 9 ... 80 81 82 83 84 85 86 87 88 89
    Zu       (k_u)

<xarray.backends.zarr.ZarrStore at 0x2aec45d87d30>

---

In [17]:
# housekeeping: ask the cluster for 30 workers
# NOTE(review): execution counts show this cell was run out of order
w = cluster.scale_up(30)

In [None]:
# housekeeping: restart the workers attached to the client
client.restart()

In [10]:
# close the dask cluster
cluster.close()