In [1]:
import os, sys
import numpy as np
import dask
from dask import delayed
import xarray as xr
from matplotlib import pyplot as plt
%matplotlib inline

from mitequinox.utils import *

In [5]:
# Start a dask cluster whose workers are launched as PBS batch jobs.
from dask_jobqueue import PBSCluster
cluster = PBSCluster(cores=4)
# Scale the cluster to 40; the client summary shown further down reports
# 40 workers / 160 cores. `w` is not used by any other visible cell.
w = cluster.scale(40)

In [6]:
# Connect a distributed client to the PBS cluster; later cells submit and
# gather work through `client`.
from dask.distributed import Client
client = Client(cluster)

In [8]:
# Display the client summary (scheduler address, dashboard URL, workers, cores, memory).
client

0,1
Client  Scheduler: tcp://10.135.36.114:45737  Dashboard: http://10.135.36.114:8787/status,Cluster  Workers: 40  Cores: 160  Memory: 1.60 TB


In [13]:
def compress(file, v, delete=False, verbose=False,
             min_size_mb=250, chunksizes=(1, 4320, 4320)):
    """Write a zlib-compressed copy of a netCDF file into a 'compressed' tree.

    The output path is derived from the input path by inserting a
    'compressed' directory above the file's parent directory, i.e.
    ``<root>/<dir>/<name>`` becomes ``<root>/compressed/<dir>/<name>``.

    The copy is written only if the output file does not exist yet or is
    smaller than ``min_size_mb`` MiB (a small file is treated as an
    incomplete earlier write and is overwritten).

    Parameters
    ----------
    file : str
        Path of the source netCDF file, using '/' as separator and containing
        at least one parent directory.
    v : str
        Name of the variable in the dataset to which the compression
        encoding is applied.
    delete : bool, optional
        If True, remove the source file after the compressed copy has been
        written. The source is left untouched when no copy is written.
    verbose : bool, optional
        If True, print the output path before writing.
    min_size_mb : int, optional
        Size threshold in MiB below which an existing output file is
        rewritten. Defaults to 250.
    chunksizes : sequence of int, optional
        netCDF chunk shape used for variable ``v``. Defaults to
        ``(1, 4320, 4320)``.

    Returns
    -------
    str
        Path of the compressed file, whether or not it was (re)written.
    """
    filesp = file.split('/')
    file_out = '/'.join([*filesp[:-2], 'compressed', filesp[-2], filesp[-1]])

    needs_write = (not os.path.isfile(file_out)
                   or (os.path.getsize(file_out) >> 20) < min_size_mb)
    if needs_write:
        if verbose:
            print('Compress ', file_out)
        # Writing the netCDF file does not create missing directories.
        os.makedirs(os.path.dirname(file_out), exist_ok=True)
        # The context manager closes the source file once the copy is written,
        # so long-lived workers do not accumulate open handles and the source
        # is no longer open when it is removed below.
        with xr.open_dataset(file) as ds:
            # http://unidata.github.io/netcdf4-python/#section9
            ds.to_netcdf(file_out,
                         encoding={v: {'zlib': True, 'complevel': 2,
                                       'chunksizes': list(chunksizes)}})

        if delete:
            os.remove(file)

    return file_out


In [14]:
# Variable to convert. Other names that were listed for this cell:
# 'Eta', 'SSU', 'SSV', 'SST'. Edit the assignment below to switch variable.
v = 'SSS'
# Table of input files for that variable; later cells read its 'file' column.
df = load_iters_date_files(v)

### Parallelize the conversion

In [10]:
# Build one lazy compress task per input file; nothing runs until the tasks
# are submitted to the cluster.
I = range(len(df))
# Uncomment to try the pipeline on the first 10 files only.
#I = range(10)
values = [delayed(compress)(df['file'].iloc[i], v, delete=False) for i in I]

In [11]:
# Submit every delayed compress task to the cluster, then block until all
# results (the output file paths) are back; %time measures the gather step.
futures = client.compute(values)
%time results = client.gather(futures)

CPU times: user 12min 23s, sys: 41.5 s, total: 13min 4s
Wall time: 55min


### Tests (not parallelized)

In [4]:
# Smoke test: run compress in the notebook process on the first file only.
file = df['file'].iloc[0]
compress(file, v)

In [15]:
# Serial pass over every file in the notebook process; with verbose=True
# compress prints the output path of each file it (re)writes.
for file in df['file']:
    compress(file,v, verbose=True)

Compress  /work/ALT/swot/swotpub/LLC4320/compressed/SSS/SSS_t0000228816.nc


### Regenerate missing or undersized SSS netCDF files from the standard-layout zarr store

In [41]:
# Rewrite, from the zarr store, every SSS netCDF file that is missing or
# smaller than 250 MiB. Regenerated files go to a tmp/ subdirectory instead of
# overwriting the existing ones.
# NOTE(review): newer xarray releases deprecate Dataset.drop in favour of
# Dataset.drop_vars - switch once the environment's xarray version is confirmed.
ds = xr.open_zarr(work_data_dir+'rechunked/SSS_std.zarr').drop('dtime')

sss_nc_dir = root_data_dir+'netcdf/SSS/'
sss_tmp_dir = sss_nc_dir+'tmp/'

for time, iters in zip(ds['time'].values,ds['iters'].values):
    file = sss_nc_dir+'SSS_t%010d.nc'%iters
    if not os.path.isfile(file) or ((os.path.getsize(file)>>20)<250):
        # Log the iteration number of each file being regenerated.
        print(iters)
        # to_netcdf does not create missing directories.
        os.makedirs(sss_tmp_dir, exist_ok=True)
        file_out = sss_tmp_dir+'SSS_t%010d.nc'%iters
        # Same zlib settings and chunk shape as compress(); the encoding dict
        # is built fresh for each write.
        ds.sel(time=time).to_netcdf(file_out, encoding={'SSS':{'zlib': True,
                                                               'complevel': 2,
                                                               'chunksizes': [1, 4320, 4320]}})

In [40]:
# Compare the structure of a file from the main SSS directory (presumably a
# known-good one, going by the name) with a file regenerated into tmp/.
# NOTE: this rebinds `ds`, which the zarr repair cell also uses.
ds_good = xr.open_dataset(root_data_dir+'netcdf/SSS/SSS_t0001492992.nc')
ds = xr.open_dataset(root_data_dir+'netcdf/SSS/tmp/SSS_t0000232704.nc')
print(ds_good)
print(ds)

<xarray.Dataset>
Dimensions:  (face: 13, i: 4320, j: 4320)
Coordinates:
  * i        (i) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
    time     float64 ...
    iters    int64 ...
Data variables:
    SSS      (face, j, i) float32 ...
<xarray.Dataset>
Dimensions:  (face: 13, i: 4320, j: 4320)
Coordinates:
  * face     (face) int64 0 1 2 3 4 5 6 7 8 9 10 11 12
  * i        (i) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    iters    int64 ...
  * j        (j) int64 0 1 2 3 4 5 6 7 ... 4313 4314 4315 4316 4317 4318 4319
    time     float64 ...
Data variables:
    SSS      (face, j, i) float32 ...


# Log of corrupted SSS files (iteration numbers)

232704
237168
240624
266688
274608
275040
284688
294192
306432
334656
345888
348480
349920
372528
373824
374832
398880
405936
412704
414144
434880
436464
442368
445536
456768
482688
502848
503712
507168
511920
525024
530352
561168
576432
581328
606384
610560
644688
659808
660096
684144
695232
701856
709776
732960
745632
749808
753408
756720
757728
767088
768240
772560
781488
786240
786672
794304
816048
821952
836784
837936
847152
857520
874944
899424
902304
908208
913392
948960
951840
966816
973872
995760
1003392
1031040
1032048
1042272
1044144
1049472
1054080
1062288
1132416
1150128
1154304
1160784
1175184
1180800
1184544
1184688
1191744
1193040
1205856
1214352
1218528
1220400
1228320
1276128
1279008
1287216
1290384
1293120
1294416
1326096
1350576
1351872
1360512
1378080
1418544
1423008
1426464
1435824
1446336
1455552
1475856
1488528
1490832

In [16]:
# Disconnect the client before tearing down the cluster so it is not left
# attached to a scheduler that no longer exists, then shut the cluster down.
client.close()
cluster.close()