In [None]:
import os
import numpy as np
import dask
#from dask_jobqueue import PBSCluster
import xarray as xr
from matplotlib import pyplot as plt
%matplotlib inline

from mitequinox.utils import *

In [None]:
# dmethod selects how the dask client is obtained:
#   1 -> connect to an already running scheduler through its scheduler file
#   2 -> start a PBS cluster from this notebook (the client is created in a later cell)
dmethod = 1

if dmethod == 1:
    from dask.distributed import Client
    scheduler = os.getenv('DATAWORK')+'/dask/scheduler.json'
    client = Client(scheduler_file=scheduler)
elif dmethod == 2:
    from dask_jobqueue import PBSCluster
    # workers spill data to this folder when RAM is filled up
    local_dir = os.getenv('TMPDIR')
    cluster = PBSCluster(
        queue='mpi_1',
        local_directory=local_dir,
        interface='ib0',
        walltime='24:00:00',
        threads=14,
        processes=2,
        memory='50GB',
        resource_spec='select=1:ncpus=28:mem=100g',
        death_timeout=100,
    )
    # request 40 workers; the returned handle is kept in w
    w = cluster.start_workers(40)

In [None]:
# you need to wait for workers to spin up
# An expression nested inside an `if` is not echoed by the notebook (only the
# last top-level expression of a cell is), so print the scheduler explicitly.
if dmethod == 2:
    print(cluster.scheduler)

In [None]:
# dmethod 2 only: create the dask client attached to the PBS cluster started above
# (for dmethod 1 the client was already created from the scheduler file)
if dmethod == 2:
    from dask.distributed import Client
    client = Client(cluster)

In [None]:
# last expression of the cell: the notebook displays the client object
client

---

# automatic extraction of all variables

In [None]:
# root directory of the model output: one sub-directory per variable
data_rdir = '/home/datawork-lops-osi/data/mit4320/'
# grid directory handed to get_compressed_data
grid_dir = data_rdir+'grid/'
# directory where extract() writes its netcdf files
out_dir = '/home1/datawork/aponte/iwsst_mit/'

def extract(v, t, face, i, j, pref=''):
    """Extract a subset of variable `v` and write it to a netcdf file.

    The data is read from `data_rdir + v + '/'` through get_iters_time and
    get_compressed_data (using the notebook-level `client`), rechunked with a
    chunk size of 1 along 'face', subset with `isel` and written to
    `out_dir + pref + v + '.nc'` (an existing file is overwritten).

    Parameters
    ----------
    v : str
        Variable name; also the name of its sub-directory under `data_rdir`.
    t, face, i, j : int or slice
        Positional indexers forwarded to `isel` for the 'time', 'face', 'i'
        and 'j' dimensions respectively.
    pref : str, optional
        Prefix prepended to the output file name.
    """
    data_dir = data_rdir+v+'/'

    iters, time = get_iters_time(v, data_dir, delta_t=25.)

    # Grid point forwarded to get_compressed_data: 'W' for SSU, 'S' for SSV,
    # 'C' otherwise. The lookup relies on string equality; the former
    # `v is 'SSU'` tested object identity, which only works by accident of
    # string interning and raises a SyntaxWarning on recent Python versions.
    point = {'SSU': 'W', 'SSV': 'S'}.get(v, 'C')
    ds = get_compressed_data(v, data_dir, grid_dir, iters=iters, time=time, client=client, point=point)
    ds = ds.chunk({'face': 1})

    file_out = out_dir+pref+v+'.nc'
    # unlimited_dims is given as a list: with the bare string 'time' the file
    # header recorded at the end of this notebook shows spurious unlimited
    # dimensions 't', 'i', 'm' and 'e'.
    ds[v].isel(time=t, face=face, i=i, j=j).to_netcdf(file_out, mode='w', unlimited_dims=['time'])


In [None]:
# variables to extract
V = ['Eta', 'SST', 'SSU', 'SSV']

# index selection of each extracted region; the key is used as output file prefix
ext = {
    'mad': {'time': slice(0,24), 'face': 1, 'i': slice(3000,None), 'j': slice(1000,4000)},
    'nwa': {'time': slice(0,24), 'face': 4, 'i': slice(2000,4000), 'j': slice(1500,3500)},
}

for region, sel in ext.items():
    print('face = %d' %sel['face'])
    for v in V:
        print(v)
        extract(v, sel['time'], sel['face'], sel['i'], sel['j'], pref=region+'_')

---

# manual exploration in order to get index slices

In [None]:
# open the Eta dataset and rechunk it with a chunk size of 1 along 'face'
data_dir = data_rdir+'Eta/'
iters, time = get_iters_time('Eta', data_dir, delta_t=25.)
ds = get_compressed_data(
    'Eta', data_dir, grid_dir, iters=iters, time=time, client=client
).chunk({'face': 1})
print(ds)

# size in GB of the whole Eta array, then of the face=1 selection only
print('\n data size: %.1f GB' %(ds['Eta'].nbytes / 1e9))
print('\n data size (1 face): %.1f GB' %(ds['Eta'].isel(face=1).nbytes / 1e9))

In [None]:
# Eta at the first time index on face 1 (the face used by the 'mad' extraction)
ds['Eta'].isel(time=0, face=1).plot()

In [None]:
# Eta at the first time index on face 4 (the face used by the 'nwa' extraction)
ds['Eta'].isel(time=0, face=4).plot()

## produce a netcdf file

In [None]:
# preview of the selection written to file below: first 24 time indices of face 1
ds.isel(time=slice(0,24), face=1)

In [None]:
file_out = os.getenv('DATAWORK')+'/ssh_extract.nc'
# Writing the whole dataset, i.e.
#   ds.isel(time=slice(0,24), face=1).to_netcdf(file_out, mode='w', unlimited_dims=['time'])
# was not working (weird incompatible chunk error), so only Eta is written.
# unlimited_dims is given as a list: with the bare string 'time' the file header
# recorded at the end of this notebook shows spurious unlimited dimensions
# 't', 'i', 'm' and 'e'.
ds['Eta'].isel(time=slice(0,24), face=1).to_netcdf(file_out, mode='w', unlimited_dims=['time'])

---

# try to transpose data

In [None]:
Nt = 24*10 # time windows to consider
V = ['Eta', 'SST', 'SSS', 'SSU', 'SSV']
V = ['Eta']
out_dir = '/home1/datawork/aponte/mit_T/'

#
it = np.arange(time.size/Nt-1).astype(int)*Nt
assert it[-1]+Nt<time.size

for v in V:
    #
    data_dir = data_rdir+v+'/'
    iters, time = get_iters_time(v, data_dir, delta_t=25.)
    p = 'C'
    if v is 'SSU':
        p = 'W'
    elif v is 'SSV':
        p = 'S'
    ds = get_compressed_data(v, data_dir, grid_dir, iters=iters, time=time, client=client, point=p)
    ds = ds.chunk({'face': 1})
    #
    for face in ds['face']:
        for i, t in enumerate(it):
            dv = ds[v].isel(time=slice(t,t+Nt), face=face)
            dv = dv.drop(['XC','YC','Depth','rA'])
            dv = dv.transpose('i','j','time')
            #
            file_out = out_dir+'/%s_f%02d_t%02d.nc'%(v,face,i)
            %time eta.to_netcdf(file_out, mode='w', unlimited_dims=['time'])            
            #
            print('face=%d / i=%d'%(face,i))

## tests:

In [None]:
# Step-by-step version of the transposition on a single window (first 240 time
# indices of face 1), printing the array after each step.
eta = ds['Eta'].isel(time=slice(0,24*10), face=1)
print('\n data size: %.1f GB' %(eta.nbytes / 1e9))
print(eta)
# drop the listed coordinates before writing
eta = eta.drop(['XC','YC','Depth','rA'])
print(eta)
eta = eta.transpose('i','j','time') # author's note: not lazy (TODO confirm)
print(eta)

In [None]:
# time the write of the transposed test array; unlimited_dims is passed as a list here
file_out = os.getenv('DATAWORK')+'/ssh_T0.nc'
%time eta.to_netcdf(file_out, mode='w', unlimited_dims=['time'])

- [x] 24 time steps, fast

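- [x] 24 time steps, slower when `unlimited_dims` is passed as the string `'time'` (call and resulting file header below): each character is taken as a dimension name, so `t`, `i`, `m` and `e` are declared unlimited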

```
eta.to_netcdf(file_out, mode='w', unlimited_dims='time')

netcdf ssh_T0 {
dimensions:
	t = UNLIMITED ; // (0 currently)
	i = UNLIMITED ; // (4320 currently)
	m = UNLIMITED ; // (0 currently)
	e = UNLIMITED ; // (0 currently)
	j = 4320 ;
	time = UNLIMITED ; // (24 currently)
variables:
	int64 i(i) ;
		i:standard_name = "x_grid_index" ;
		i:axis = "X" ;
		i:long_name = "x-dimension of the t grid" ;
		i:swap_dim = "XC" ;
	int64 j(j) ;
		j:standard_name = "y_grid_index" ;
		j:axis = "Y" ;
		j:long_name = "y-dimension of the t grid" ;
		j:swap_dim = "YC" ;
	int64 face ;
		face:standard_name = "face_index" ;
	double time(time) ;
		time:_FillValue = NaN ;
	float Eta(i, j, time) ;
		Eta:_FillValue = NaNf ;
		Eta:coordinates = "face" ;
```