In [1]:
import os, sys
import numpy as np
import dask
import xarray as xr
from matplotlib import pyplot as plt
%matplotlib inline

from mitequinox.utils import *

In [2]:
# Create a dask cluster whose workers run as PBS batch jobs.
from dask_jobqueue import PBSCluster
cluster = PBSCluster(cores=1)
# uncomment to inspect the generated PBS job script:
#print(cluster.job_script())
# ask the cluster to scale to 30 — presumably 30 workers; confirm against the
# installed dask_jobqueue version. The return value `w` is not used afterwards.
w = cluster.scale(30)

In [3]:
# Get a dask handle: connect a distributed Client to the PBS cluster created
# above. The next cell displays it to check the scheduler/worker status.
from dask.distributed import Client
client = Client(cluster)

In [4]:
# display the client summary: scheduler address, dashboard link, and the
# number of workers / cores / memory currently attached
client

0,1
Client  Scheduler: tcp://10.135.39.89:38758  Dashboard: http://10.135.39.89:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


---
# Load the standard layout, rechunk, and store right away


In [5]:
# same but over all variables and faces
# V: list of variable names processed by the rechunking loops below
#V = ['SSU', 'SSV']
#V = ['SSU']
V = ['Eta']

# parameters for rechunking
# Nt0: chunk length along 'time' (240 steps). Nt0 == 0 is a special value:
# the loops below then use the full time length as a single chunk.
Nt0 = 24*10 # time chunks
#Nt = 0
#

# Nc: chunk sizes along the two horizontal dimensions;
# Nc[0] is applied to i (or i_g) and Nc[1] to j (or j_g) in the loops below.
#Nc = 96 # x 45 = 4320
Nc = (96, 48)
# other choices: 432, 27 (long scheduling), 288, 864

# zarr compression (currently unused: the compressor setting is commented out)
# http://xarray.pydata.org/en/stable/io.html
# http://zarr.readthedocs.io/en/stable/tutorial.html#compressors
#compressor = None

### one face at a time, all variables

Runs to completion with 20 workers, 20 cores, 800GB, `chunks={'face':1, 'j':480}` and all files.
Memory saturates at 415GB, i.e. there is probably NO spilling to tmp disk. Wall time: 25 min per face.

In [7]:
out_dir = work_data_dir+'rechunked/'

for v in V:

    #files = load_iters_date_files(v).file[:500].tolist()
    #ds0 = load_datanc(v, files=files, parallel=True)
    if v is 'SSV':
        ds0 = load_datanc(v, parallel=True, chunks={'face':1, 'j_g':480})
    else:
        ds0 = load_datanc(v, parallel=True, chunks={'face':1, 'j':480})            
    print(ds0)
    
    Nt = len(ds0.time) if Nt0 == 0 else Nt0
    
    #for face in [1]:
    for face in range(ds0['face'].size):
        
        ds = ds0.isel(face=face)
        #
        ds = ds.isel(time=slice(len(ds.time)//Nt *Nt))
        #
        chunks = {'time': Nt, 'i': Nc[0], 'j': Nc[1]}
        if v is 'SSU':
            chunks = {'time': Nt, 'i_g': Nc[0], 'j': Nc[1]}
        elif v is 'SSV':
            chunks = {'time': Nt, 'i': Nc[0], 'j_g': Nc[1]}
        ds = ds.chunk(chunks)
        #
        # tmp, xarray zarr backend bug: 
        # https://github.com/pydata/xarray/issues/2278
        #del ds['face'].encoding['chunks']
        #del ds[v].encoding['chunks']
        
        file_out = out_dir+'%s_f%02d.zarr'%(v,face)
        #print(ds)
        if not os.path.isdir(file_out):
            try:
                %time ds.to_zarr(file_out, mode='w')
            except:
                print('Failure')
        dsize = getsize(file_out)
        print(' %s face=%d  data is %.1fGB ' %(v, face, dsize/1e9))


<xarray.Dataset>
Dimensions:  (face: 13, i: 4320, j: 4320, time: 10311)
Coordinates:
  * i        (i) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
  * time     (time) float64 2.592e+05 2.628e+05 ... 3.737e+07 3.738e+07
    iters    (time) int64 10368 10512 10656 10800 ... 1494720 1494864 1495008
    dtime    (time) datetime64[ns] 2011-09-13 ... 2012-11-15T14:00:00
Data variables:
    Eta      (time, face, j, i) float32 dask.array<shape=(10311, 13, 4320, 4320), chunksize=(1, 1, 480, 4320)>
 Eta face=0  data is 192.1GB 
 Eta face=1  data is 410.8GB 
 Eta face=2  data is 263.6GB 
 Eta face=3  data is 130.0GB 
 Eta face=4  data is 461.8GB 
 Eta face=5  data is 88.1GB 
 Eta face=6  data is 386.3GB 
 Eta face=7  data is 395.6GB 




 Eta face=8  data is 494.8GB 




 Eta face=9  data is 250.8GB 
 Eta face=10  data is 259.7GB 




 Eta face=11  data is 408.5GB 




CPU times: user 22min 16s, sys: 1min 21s, total: 23min 38s
Wall time: 29min 18s
 Eta face=12  data is 231.9GB 


### all faces 

Runs to completion with 20 workers, 20 cores, 800GB, `chunks={'face':1, 'j':480}` and 1000 files.
Memory saturates at 515GB, close to 70% of the 800GB available (70% is exactly 560GB), i.e. there is spilling to tmp disk. Wall time: 40 min.

In [9]:
out_dir = work_data_dir+'rechunked/'

for v in V:

    files = load_iters_date_files(v).file[:1000].tolist()
    #ds0 = load_datanc(v, files=files, parallel=True)
    #ds0 = load_datanc(v, parallel=True)
    ds0 = load_datanc(v, files=files, parallel=True, chunks={'face':1, 'j':480})
    
    Nt = len(ds.time) if Nt0 == 0 else Nt0
    
    #
    ds = ds0.isel(time=slice(len(ds0.time)//Nt *Nt))
    #
    chunks = {'time': Nt, 'i': Nc[0], 'j': Nc[1]}
    if v is 'SSU':
        chunks = {'time': Nt, 'i_g': Nc[0], 'j': Nc[1]}
    elif v is 'SSV':
        chunks = {'time': Nt, 'i': Nc[0], 'j_g': Nc[1]}
    ds = ds.chunk(chunks)

    file_out = out_dir+'%s.zarr'%(v)
    print(ds)
    try:
        %time ds.to_zarr(file_out, mode='w')
        # specify compression:
        #%time ds.to_zarr(file_out, mode='w', \
        #                 encoding={key: {'compressor': compressor} for key in ds.variables})
        # without compression: 601G for face 1
    except:
        print('Failure')
    dsize = getsize(file_out)
    print(' %s  data is %.1fGB ' %(v, dsize/1e9))


<xarray.Dataset>
Dimensions:  (face: 13, i_g: 4320, j: 4320, time: 960)
Coordinates:
  * i_g      (i_g) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
  * time     (time) float64 5.702e+06 5.706e+06 5.71e+06 ... 9.151e+06 9.155e+06
    iters    (time) int64 dask.array<shape=(960,), chunksize=(240,)>
    dtime    (time) datetime64[ns] dask.array<shape=(960,), chunksize=(240,)>
Data variables:
    SSU      (time, face, j, i_g) float32 dask.array<shape=(960, 13, 4320, 4320), chunksize=(240, 1, 48, 96)>




CPU times: user 27min 40s, sys: 1min 55s, total: 29min 35s
Wall time: 40min 36s
 SSU  data is 447.3GB 


---

In [17]:
# ask the cluster for 30 again (same target as the initial cluster.scale(30)).
# NOTE(review): how scale_up differs from scale depends on the installed
# dask_jobqueue/distributed version — confirm before relying on it.
w = cluster.scale_up(30)

In [None]:
# utility cell: restart through the client — presumably used to reset the
# workers between runs; run by hand when needed
client.restart()

In [8]:
# utility cell: close the cluster once the rechunking is finished
cluster.close()