# Rewrite data in zarr format with a standard layout

In [1]:
import os, sys
import numpy as np
import dask
import xarray as xr
from matplotlib import pyplot as plt
%matplotlib inline

from mitequinox.utils import *

In [2]:
# create a dask cluster whose workers are submitted as PBS jobs (cores=1)
from dask_jobqueue import PBSCluster
cluster = PBSCluster(cores=1)
# uncomment to print the job script generated by the cluster
#print(cluster.job_script())
# request 30 workers
w = cluster.scale(30)

In [3]:
# connect a dask distributed client to the cluster created above;
# the client is the handle used to check the dask server status
from dask.distributed import Client
client = Client(cluster)

In [8]:
# display the client summary (scheduler address, dashboard link, workers, cores, memory)
client

0,1
Client  Scheduler: tcp://10.135.39.91:38266  Dashboard: http://10.135.39.91:8787/status,Cluster  Workers: 30  Cores: 30  Memory: 1.20 TB


### Common timeline and chunks across variables, standard layout

In [None]:
def load_enatl60_nc(v, suff='_t*', files=None, **kwargs):
    """Open the netCDF files of one variable as a single xarray dataset.

    Parameters
    ----------
    v : str
        Variable name. It is used both as the sub-directory of
        ``root_data_dir + 'netcdf/'`` and as the file name prefix.
    suff : str, optional
        Suffix appended to ``v`` to build the file pattern passed to
        ``xr.open_mfdataset`` (default ``'_t*'``).
    files : optional
        When not None, passed to ``xr.open_mfdataset`` in place of the
        pattern built from ``v`` and ``suff``.
    **kwargs
        Override or extend the default keyword arguments passed to
        ``xr.open_mfdataset`` (``concat_dim``, ``compat``, ``chunks``,
        ``parallel``).

    Returns
    -------
    xarray.Dataset
        The opened dataset with an extra ``dtime`` coordinate along ``time``,
        computed by ``iters_to_date`` from ``ds.iters``.
    """
    # SSU and SSV are chunked along the i_g / j_g dimension names instead of
    # i / j. The lookup compares strings by value: the former `v is 'SSU'`
    # test compared object identity and only worked through string interning.
    horizontal_dims = {'SSU': ('i_g', 'j'), 'SSV': ('i', 'j_g')}
    dim_i, dim_j = horizontal_dims.get(v, ('i', 'j'))
    default_kwargs = {'concat_dim': 'time',
                      'compat': 'equals',
                      'chunks': {'face': 1, dim_i: 480, dim_j: 480},
                      'parallel': True}
    # caller supplied options take precedence over the defaults
    default_kwargs.update(kwargs)
    #
    if files is not None:
        files_in = files
    else:
        files_in = root_data_dir+'netcdf/'+v+'/'+v+suff
    ds = xr.open_mfdataset(files_in, **default_kwargs)
    # attach dates derived from the iteration numbers as a coordinate on time
    dtime = xr.DataArray(iters_to_date(ds.iters.values),
                         coords=[ds.time],
                         dims=['time'])
    return ds.assign_coords(dtime=dtime)

In [9]:
# variables to process; only the last assignment to V is effective
V = ['SSU', 'SSV', 'SST', 'SSS']
#V = ['Eta']
V = ['SSS']
# Nt0: time chunk size used when rewriting (0 means one chunk spanning all time steps)
# Nc: chunk sizes along the two horizontal dimensions
# NOTE(review): None presumably leaves the existing chunking of that
# dimension unchanged -- confirm against the xarray documentation
Nt0, Nc = 1, (None, None) # leads to files ??

# df is indexed below as df['file_'+v] to list the input files of each variable
df = load_common_timeline(V)

2011-11-15 00:00:00  to  2012-11-15 00:00:00


In [10]:
out_dir = root_data_dir+'zarr/'

for v in V:

    #ds0 = load_datanc(v, files=files, parallel=True)
    #files = df['file_'+v][:500].tolist()
    files = df['file_'+v][:].tolist()
    ds0 = load_data(v, ftype='nc', files=files, 
                    parallel=True, chunks={'face':1})
    
    Nt = len(ds0.time) if Nt0 == 0 else Nt0
        
    ds = ds0.isel(time=slice(len(ds0.time)//Nt *Nt))
    #
    chunks = {'time': Nt, 'i': Nc[0], 'j': Nc[1]}
    if v is 'SSU':
        chunks = {'time': Nt, 'i_g': Nc[0], 'j': Nc[1]}
    elif v is 'SSV':
        chunks = {'time': Nt, 'i': Nc[0], 'j_g': Nc[1]}
    ds = ds.chunk(chunks)
    #
    file_out = out_dir+'%s.zarr'%(v)
    #print(ds)
    if not os.path.isdir(file_out):
        try:
            %time ds.to_zarr(file_out, mode='w')
            pass
        except:
            print('Failure')
    dsize = getsize(file_out)
    print(' %s  data is %.1fGB ' %(v, dsize/1e9))


CPU times: user 17min 34s, sys: 1min 12s, total: 18min 47s
Wall time: 34min 59s
 SSS  data is 2624.2GB 


In [11]:
# shut the cluster down
cluster.close()

In [12]:
# re-open the zarr store to check its layout;
# file_out is the path left by the last iteration of the rewrite loop
ds = xr.open_zarr(file_out)
print(ds)

<xarray.Dataset>
Dimensions:  (face: 13, i: 4320, j: 4320, time: 8785)
Coordinates:
    dtime    (time) datetime64[ns] dask.array<shape=(8785,), chunksize=(8785,)>
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
  * i        (i) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    iters    (time) int64 dask.array<shape=(8785,), chunksize=(1,)>
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * time     (time) float64 5.702e+06 5.706e+06 5.71e+06 ... 3.732e+07 3.732e+07
Data variables:
    SSS      (time, face, j, i) float32 dask.array<shape=(8785, 13, 4320, 4320), chunksize=(1, 1, 4320, 4320)>


---

# Grid in zarr format

In [10]:
# load the grid with load_grdnc and write it to <root_data_dir>/grid.zarr
grd = load_grdnc()
# TODO: consider rechunking the grid before writing it
print(grd)
# mode='w' is passed to to_zarr, as for the data stores
grd.to_zarr(root_data_dir+'grid.zarr', mode='w')

<xarray.Dataset>
Dimensions:  (face: 13, i: 4320, i_g: 4320, j: 4320, j_g: 4320, k: 90, k_l: 90, k_p1: 91, k_u: 90)
Coordinates:
  * k        (k) int64 0 1 2 3 4 5 6 7 8 9 10 ... 80 81 82 83 84 85 86 87 88 89
    Z        (k) float32 ...
  * i_g      (i_g) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
    dyG      (face, j, i_g) float32 dask.array<shape=(13, 4320, 4320), chunksize=(1, 4320, 4320)>
  * i        (i) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    XC       (face, j, i) float32 dask.array<shape=(13, 4320, 4320), chunksize=(1, 4320, 4320)>
  * j_g      (j_g) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    YG       (face, j_g, i_g) float32 dask.array<shape=(13, 4320, 4320), chunksize=(1, 4320, 4320)>
  * k_u      (k_u) int64 0 1 2 3 4 5 6 7 8 9 ... 80 81 82 83 84 85 86 87 88 89
    Zu       (k_u)

<xarray.backends.zarr.ZarrStore at 0x2aec45d87d30>

---

In [17]:
# ask for 30 workers again
# NOTE(review): the cluster setup cell uses cluster.scale(30); confirm that
# scale_up is still provided by the installed dask_jobqueue version
w = cluster.scale_up(30)

In [None]:
# restart the dask workers attached to this client
client.restart()

In [10]:
# shut the cluster down at the end of the session
cluster.close()