In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import dask
import pandas as pd
import xarray as xr
import dask.array as da
import numpy as np
from re import split

from skimage.filters import gaussian, threshold_otsu
from skimage import measure
from dask_image.ndfilters import uniform_filter as uf
from dask_image.ndmeasure import variance as varian

from dask_jobqueue import SLURMCluster
from dask.distributed import Client, progress

sys.path.insert(1, f"{os.path.abspath(os.path.join(os.path.abspath(''), '../'))}")
from src.utils import get_pars_from_ini

location = split(', |_|-|!', os.popen('hostname').read())[0].replace("\n", "")
path_data = get_pars_from_ini(campaign='loc')[location]['path_data']
path_proj = get_pars_from_ini(campaign='loc')[location]['path_proj']



In [2]:
def lee_filter_new(img, size, tresh=-150):
    img = da.where(da.logical_or(da.isnan(img), da.equal(img, -np.inf)), tresh, img)
    shape = (size, size, 1)
    img_mean = uf(img, shape)
    img_sqr_mean = uf(da.power(img, 2), shape)
    img_variance = img_sqr_mean - da.power(img_mean, 2)
    overall_variance = varian(img)
    img_weights = img_variance / (img_variance + overall_variance)
    img_output = img_mean + img_weights * (img - img_mean)
    return img_output


def process_new(zhh14, time):
    img = lee_filter_new(zhh14, size=3, tresh=-180)
    original = np.where(img > 0, zhh14, 0)
    blurred = gaussian(original, sigma=0.8, multichannel=True)
    binary = blurred > threshold_otsu(blurred)
    labels = measure.label(binary, connectivity=2)
    props = [measure.regionprops(labels[:, :, i]) for i in range(labels.shape[-1])]
    _props_all = [[[j.area for j in prop], [j.perimeter for j in prop], [j.major_axis_length for j in prop],
                   [j.minor_axis_length for j in prop]] for prop in props]
    df = pd.DataFrame(data=_props_all, columns=['area', 'perimeter', 'axmax', 'axmin'], index=pd.to_datetime(time))
    df = df.explode(['area', 'perimeter', 'axmax', 'axmin'])
    df.to_csv(f'../results/all_{len(time)}.csv')
    df = df.astype(dtype={'area': 'float', 'perimeter': 'float', 'axmax': 'float', 'axmin': 'float'})
    df = df[df.area > 50.0]
    df_new = pd.DataFrame(index=time, data=np.full(len(time), np.nan), columns=['area'])
    df_new = df_new.merge(df, left_index=True, right_index=True, how='left').drop(['area_x'], axis=1)
    idx = df_new.index.duplicated()
    # def_new1 = df_new[~idx]
    xr_prop = xr.Dataset.from_dataframe(df_new[~idx]).rename_dims({'index': 'time'}).rename({'index': 'time'})
    return xr_prop.area_y, xr_prop.perimeter, xr_prop.axmax, xr_prop.axmin


In [3]:
def ufunc_wrapper(data):
    _data = [data, data.time]
    icd = [list(i.dims) for i in _data]
    dfk = {'allow_rechunk': True, 'output_sizes': {}}
    a, p, mx, mn = xr.apply_ufunc(process_new,
                                  *_data,
                                  input_core_dims=icd,
                                  output_core_dims=[["time"], ["time"], ["time"], ["time"]],
                                  dask_gufunc_kwargs=dfk,
                                  dask='parallelized',
                                  vectorize=True,
                                  output_dtypes=[(float), (float), (float), (float)]
                                  )
    ds_out = a.to_dataset(name='area')
    ds_out['perimeter'] = p
    ds_out['ax_max'] = mx
    ds_out['ax_min'] = mn
    return ds_out

In [4]:
client.close()
cluster.close()

NameError: name 'client' is not defined

In [5]:
cluster = SLURMCluster(queue="seseml",
                       memory='200GB',
                       cores=40,
                       processes=1,
                       walltime='01:40:00',
                       scheduler_options={'host': '172.22.179.3:7222', 'dashboard_address': ':7778'})

In [6]:
cluster.scale(1)
cluster

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [7]:
%%bash
squeue -u alfonso8

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            498581    seseml dask-wor alfonso8  R       0:01      1 keeling-j02


In [8]:
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.22.179.3:7778/status,

0,1
Dashboard: http://172.22.179.3:7778/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.22.179.3:7222,Workers: 0
Dashboard: http://172.22.179.3:7778/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [9]:
%%time
ds_xr = xr.open_zarr(f'{path_data}/zarr_rckd/KUsKAs_Wn/lores.zarr')
# ds_xr = xr.open_zarr(f'{path_data}/zarr/KUsKAs_Wn/lores.zarr')
ds_xr = ds_xr.sel(time=~ds_xr.get_index("time").duplicated())



CPU times: user 719 ms, sys: 70.3 ms, total: 790 ms
Wall time: 812 ms


In [10]:
%%time
# ds_xr = ds_xr.sel(time=~ds_xr.get_index("time").duplicated())

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 8.58 µs


In [11]:
%%time
ds_zhh = ds_xr.zhh14.isel(time=slice(2000, 6000))
ds_zhh = ds_zhh.sel(time=~ds_zhh.get_index("time").duplicated())

CPU times: user 14.9 ms, sys: 1.94 ms, total: 16.9 ms
Wall time: 14.8 ms


In [12]:
%%time
ds_zhh = ds_zhh.where(ds_xr.alt3d > 500)

CPU times: user 5.61 s, sys: 7.27 s, total: 12.9 s
Wall time: 19.3 s


In [13]:
a = ufunc_wrapper(ds_zhh)

In [14]:
%%time
w = dask.compute(a)

CPU times: user 535 ms, sys: 759 ms, total: 1.29 s
Wall time: 10.5 s


In [15]:
df = w[0].to_dataframe()

In [16]:
df

Unnamed: 0_level_0,area,perimeter,ax_max,ax_min
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-08-25 01:22:40.299800,,,,
2019-08-25 01:22:42.149800,,,,
2019-08-25 01:22:43.999800,,,,
2019-08-25 01:22:45.849800,,,,
2019-08-25 01:22:47.699800,,,,
...,...,...,...,...
2019-08-27 01:30:47.699800,,,,
2019-08-27 01:30:49.549800,,,,
2019-08-27 01:30:51.399800,,,,
2019-08-27 01:30:53.249800,,,,


In [17]:
df.dropna(how='any', inplace=True)

In [18]:
df

Unnamed: 0_level_0,area,perimeter,ax_max,ax_min
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-08-25 01:24:57.199800,57.0,28.863961,12.470805,6.060136
2019-08-25 01:24:59.049800,58.0,27.071068,10.369310,7.517993
2019-08-25 01:25:00.899800,66.0,33.071068,13.443727,7.062680
2019-08-25 01:25:02.749800,55.0,28.692388,11.431118,6.758988
2019-08-25 01:25:04.599800,55.0,28.485281,10.779396,7.295170
...,...,...,...,...
2019-08-27 01:02:37.199800,744.0,221.604076,77.266763,16.943994
2019-08-27 01:02:39.049800,136.0,95.142136,62.399940,10.994261
2019-08-27 01:02:55.699800,60.0,29.485281,10.319557,8.727483
2019-08-27 01:02:57.549800,70.0,36.142136,12.814503,8.264847


In [72]:
df.describe()

Unnamed: 0,area,perimeter,ax_max,ax_min
count,1638.0,1638.0,1638.0,1638.0
mean,1134.952381,192.886899,77.02012,19.022926
std,1087.790381,136.472684,53.918743,9.172841
min,51.0,25.828427,8.521565,2.678521
25%,67.0,34.374369,13.666223,8.758874
50%,785.5,195.358387,79.008364,19.949478
75%,2055.0,295.900018,116.767993,28.456116
max,3867.0,622.320851,276.315475,36.881002


boundary box, max ref, mean (ku, ka), max ref (surf), 