In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import dask
import pandas as pd
import xarray as xr
import dask.array as da
import numpy as np
from re import split
from scipy.interpolate import griddata
from scipy.spatial import cKDTree as KDTree

from skimage.filters import gaussian, threshold_otsu
from skimage import measure
from dask_image.ndfilters import uniform_filter as uf
from dask_image.ndmeasure import variance as varian

from dask_jobqueue import SLURMCluster
from dask.distributed import Client, progress

sys.path.insert(1, f"{os.path.abspath(os.path.join(os.path.abspath(''), '../'))}")
from src.utils import get_pars_from_ini

# Resolve the short host name by running `hostname`; the pipe is opened in a
# `with` block so it is closed (and the child process reaped) once read.
with os.popen('hostname') as _hostname_pipe:
    _hostname_raw = _hostname_pipe.read()
# Keep only the part of the host name before the first ', ', '_', '-' or '!'.
location = split(', |_|-|!', _hostname_raw)[0].replace("\n", "")
# Read the per-host settings once and take both paths from the same mapping.
_location_pars = get_pars_from_ini(campaign='loc')[location]
path_data = _location_pars['path_data']
path_proj = _location_pars['path_proj']


In [2]:
def get_col_row(x, size=30):
    """
    Number of whole cells of width ``size`` that fit in the extent of ``x``.

    x = array-like of coordinates
    size = cell width, in the same units as x
    returns int((max(x) - min(x)) / size), i.e. truncated toward zero
    """
    # np.ptp() accepts any array-like; the ndarray.ptp() method used before
    # was removed in NumPy 2.0.
    ncols = np.ptp(x) / size
    return int(ncols)


def excluding_mesh(x, y, nx=30, ny=30):
    """
    Construct a grid of points that are some distance away from the points (x, y).

    x, y = 1-D coordinates of the existing points
    nx, ny = number of divisions used to derive the spacing dx = ptp(x) / nx and
             dy = ptp(y) / ny; the candidate grid itself has (nx + 2) x (ny + 2)
             nodes and extends two spacings beyond the data extent on each side
    returns (xp, yp): 1-D arrays with the candidate nodes whose nearest point in
             (x, y) is farther than hypot(dx, dy)
    """
    x = np.asanyarray(x)
    y = np.asanyarray(y)

    # np.ptp() replaces the ndarray.ptp() method, which was removed in NumPy 2.0.
    dx = np.ptp(x) / nx
    dy = np.ptp(y) / ny

    # A complex "step" makes mgrid treat it as a point count (inclusive ends).
    xp, yp = np.mgrid[x.min() - 2 * dx:x.max() + 2 * dx:(nx + 2) * 1j,
                      y.min() - 2 * dy:y.max() + 2 * dy:(ny + 2) * 1j]
    xp = xp.ravel()
    yp = yp.ravel()

    # Use KDTree to answer the question: "which point of set (x,y) is the
    # nearest neighbors of those in (xp, yp)"
    tree = KDTree(np.c_[x, y])
    dist, _ = tree.query(np.c_[xp, yp], k=1)

    # Select points sufficiently far away
    m = (dist > np.hypot(dx, dy))
    return xp[m], yp[m]


def regridd(data, x, y, size=30):
    """
    Interpolate scattered samples onto a regular grid with linear ``griddata``.

    data = lazy array exposing ``.compute()``; the computed result must support
           ``.reshape``/``.flatten`` (presumably a dask array -- TODO confirm)
    x, y = coordinate arrays matching ``data``
    size = desired pixel size in meters
    returns: the interpolated grid; cells outside the interpolation hull are
             filled with -9999. For inputs with more than two dimensions one
             grid is produced per index of the last axis and the grids are
             stacked along a new last axis (dask array).
    """
    if data.ndim > 2:
        # Collapse every axis but the last, then bring the last axis to the
        # front: one row of samples per index of the original last axis.
        x_n = np.rollaxis(x.reshape(-1, x.shape[-1]), 1)
        y_n = np.rollaxis(y.reshape(-1, y.shape[-1]), 1)
        z_s = data.compute()
        z_n = np.rollaxis(z_s.reshape(-1, z_s.shape[-1]), 1)
        # Sort each row by its x coordinate, keeping y and z aligned.
        idx_n = x_n.argsort(axis=-1)
        x_n = np.take_along_axis(x_n, idx_n, axis=-1)
        y_n = np.take_along_axis(y_n, idx_n, axis=-1)
        z_n = np.take_along_axis(z_n, idx_n, axis=-1)
        # Use the largest column/row count over all rows so every output grid
        # has the same shape; `size` is forwarded so the caller's pixel size
        # is honoured here exactly as in the 2-D branch.
        ncols_n = max(np.apply_along_axis(get_col_row, arr=x_n, axis=1, size=size))
        nrows_n = max(np.apply_along_axis(get_col_row, arr=y_n, axis=1, size=size))
        # `dask.delayed` is referenced through the imported `dask` module; the
        # bare name `delayed` is not imported anywhere in this file.
        # NOTE(review): stacking below assumes excluding_mesh returns the same
        # number of points for every row -- TODO confirm.
        vp_n = [dask.delayed(excluding_mesh)(x_n[i], y_n[i]) for i in range(x_n.shape[0])]
        vp_n = da.rollaxis(da.dstack(dask.compute(*vp_n)), -1)
        xp_n, yp_n = vp_n[:, 0], vp_n[:, 1]
        # NOTE(review): padding values are zeros here but NaN in the 2-D
        # branch -- confirm which is intended.
        zp_n = [dask.delayed(da.zeros_like)(xp_n[i]) for i in range(xp_n.shape[0])]
        zp_n = da.rollaxis(da.dstack(dask.compute(*zp_n))[0], -1)
        # Target axes: x ascending, y descending.
        x_new_n = da.from_array(np.rollaxis(np.linspace(np.amin(x_n, -1), np.amax(x_n, -1), ncols_n), 1))
        y_new_n = da.from_array(np.rollaxis(np.linspace(np.amax(y_n, -1), np.amin(y_n, -1), nrows_n), 1))
        mesh = [dask.delayed(da.meshgrid)(x_new_n[i], y_new_n[i]) for i in range(x_new_n.shape[0])]
        mesh = dask.compute(*mesh)
        xi = da.asarray(mesh)[:, 0]
        yi = da.asarray(mesh)[:, 1]
        # One linear interpolation per row, with the padding points appended.
        z0 = [dask.delayed(griddata)((np.r_[x_n[i, :], xp_n[i]], np.r_[y_n[i, :], yp_n[i]]), np.r_[z_n[i, :], zp_n[i]],
                                     (xi[i], yi[i]), method='linear', fill_value=-9999)
              for i in range(xi.shape[0])]
        z0 = da.dstack(dask.compute(*z0))
        return z0
    else:
        x_s = x.flatten()
        y_s = y.flatten()
        z_s = data.compute().flatten()
        # Sort all samples by x, keeping y and z aligned.
        idx = x_s.argsort()
        x_s, y_s = np.take_along_axis(x_s, idx, axis=0), np.take_along_axis(y_s, idx, axis=0)
        z_s = np.take_along_axis(z_s, idx, axis=0)
        ncols = get_col_row(x=x_s, size=size)
        nrows = get_col_row(x=y_s, size=size)
        # Target axes: x ascending, y descending.
        x_new = np.linspace(x_s.min(), x_s.max(), int(ncols))
        y_new = np.linspace(y_s.max(), y_s.min(), int(nrows))
        xi, yi = np.meshgrid(x_new, y_new)
        # Pad with NaN-valued points located away from the data.
        xp, yp = excluding_mesh(x_s, y_s, nx=35, ny=35)
        zp = np.nan + np.zeros_like(xp)
        z0 = griddata((np.r_[x_s, xp], np.r_[y_s, yp]), np.r_[z_s, zp], (xi, yi), method='linear', fill_value=-9999)
        return z0


def lee_filter_new(img, size, tresh=-150):
    """
    Lee-style adaptive smoothing followed by clipping of non-positive values.

    img = 2-D image, or an array with a third axis that is not smoothed across
    size = side length of the square moving window
    tresh = value substituted for NaN and for the -9999 fill value before filtering
    returns: filtered array in which every value that is not > 0 is set to 0
    """
    # The window has length 1 along the third axis so slices are not mixed.
    window = (size, size) if img.ndim == 2 else (size, size, 1)

    # Replace missing samples (NaN or -9999) by the threshold value.
    missing = da.logical_or(da.isnan(img), da.equal(img, -9999))
    filled = da.where(missing, tresh, img)

    # Local statistics from moving averages: var = E[x^2] - E[x]^2.
    local_mean = uf(filled, window)
    local_sq_mean = uf(da.power(filled, 2), window)
    local_var = local_sq_mean - da.power(local_mean, 2)

    # Variance over the whole array.
    global_var = varian(filled)

    # Weight of the original sample relative to the local mean.
    gain = local_var / (local_var + global_var)
    smoothed = local_mean + gain * (filled - local_mean)
    return da.where(smoothed > 0, smoothed, 0)


def process_new(zhh14, x, y, time):
    """
    Segment the filtered ``zhh14`` field and return shape properties of the detected regions.

    Steps, in order: filter with ``lee_filter_new`` (size=3, tresh=-200), clamp to >= 0,
    Gaussian blur (sigma=0.8), Otsu threshold, label the binary mask and measure the
    labelled regions with ``skimage.measure.regionprops``.

    zhh14 = 2-D or 3-D array; for 3-D input the regions are measured per slice of the last axis
    x, y = coordinate arrays; ``x`` is sliced but neither is otherwise used in this function
    time = time stamps; index of the per-slice DataFrame (3-D case) and of the output

    Side effect: writes the region properties (before the area filter) to
    ``../results/all_{len(time)}.csv``.

    returns: four DataArrays on a ``time`` dimension -- area, perimeter, major-axis length and
             minor-axis length -- restricted to regions with area > 50; time stamps without a
             matching region are NaN.
    """
    # NOTE(review): requires x to be 4-D; the sliced result is never used below.
    x = x[:, 0, :, :]
    img_filtered = lee_filter_new(zhh14, size=3, tresh=-200)
    img = np.where(img_filtered > 0., img_filtered, 0.)
    blurred = gaussian(img, sigma=0.8)
    binary = blurred > threshold_otsu(blurred)
    # NOTE(review): labelling is done on the whole array before it is sliced, so for 3-D input
    # a label may span several slices of the last axis -- confirm this is intended.
    labels = measure.label(binary)
    if labels.ndim > 2:
        # One regionprops list per slice of the last axis; each DataFrame row then holds, per
        # column, the list of values for all regions found in that slice.
        props = [measure.regionprops(labels[:, :, i]) for i in range(labels.shape[-1])]
        _props_all = [[[j.area for j in prop], [j.perimeter for j in prop], [j.major_axis_length for j in prop],
                       [j.minor_axis_length for j in prop], [j.bbox for j in prop]] for prop in props]
        df = pd.DataFrame(data=_props_all, columns=['area', 'perimeter', 'axmax', 'axmin', 'bbox'],
                          index=pd.to_datetime(time))
    else:
        # 2-D input: one row per region, each value wrapped in a one-element list.
        # NOTE(review): this frame keeps the default integer index, while the merge below
        # joins on the ``time`` index -- verify the 2-D path produces matches.
        props = measure.regionprops(labels)
        _props_all = [[[prop.area], [prop.perimeter], [prop.major_axis_length], [prop.minor_axis_length],
                       [prop.bbox]] for prop in props]
        df = pd.DataFrame(data=_props_all, columns=['area', 'perimeter', 'axmax', 'axmin', 'bbox'])

    # Expand the list-valued columns to one row per region ('bbox' is not exploded).
    df = df.explode(['area', 'perimeter', 'axmax', 'axmin'])
    # Dump every region before any filtering; the file name depends only on len(time).
    df.to_csv(f'../results/all_{len(time)}.csv')
    df = df.astype(dtype={'area': 'float', 'perimeter': 'float', 'axmax': 'float', 'axmin': 'float'})
    # Discard small regions.
    df = df[df.area > 50.0]
    # Left-join onto a frame indexed by every time stamp so missing times become NaN. The
    # placeholder 'area' column collides with df's 'area', hence the '_x'/'_y' suffixes.
    df_new = pd.DataFrame(index=time, data=np.full(len(time), np.nan), columns=['area'])
    df_new = df_new.merge(df, left_index=True, right_index=True, how='left').drop(['area_x'], axis=1)
    # Keep only the first row for each time stamp.
    idx = df_new.index.duplicated()
    xr_prop = xr.Dataset.from_dataframe(df_new[~idx]).rename_dims({'index': 'time'}).rename({'index': 'time'})
    return xr_prop.area_y, xr_prop.perimeter, xr_prop.axmax, xr_prop.axmin


def ufunc_wrapper(data):
    """
    Run ``process_new`` over ``data`` through ``xr.apply_ufunc`` and collect the results.

    data = dataset providing ``range``, ``DR``, ``azimuth``, ``alt3d``, ``zhh14`` and ``time``
    returns: Dataset with the variables 'area', 'perimeter', 'ax_max' and 'ax_min',
             each on the ``time`` dimension
    """
    azimuth_rad = np.deg2rad(data.azimuth)
    # TODO (original note): add roll
    x = data.range * data.DR * np.sin(azimuth_rad)
    y = data.alt3d * np.cos(azimuth_rad)
    # Mask reflectivity wherever alt3d is not above 500.
    zhh = data.zhh14.where(data.alt3d > 500)

    inputs = (zhh, x, y, data.time)
    # Every dimension of every input is declared a core dimension, so
    # process_new receives the complete arrays.
    core_dims_in = [list(arr.dims) for arr in inputs]
    gufunc_kwargs = {'allow_rechunk': True, 'output_sizes': {}}

    area, perimeter, ax_major, ax_minor = xr.apply_ufunc(
        process_new,
        *inputs,
        input_core_dims=core_dims_in,
        output_core_dims=[["time"] for _ in range(4)],
        dask_gufunc_kwargs=gufunc_kwargs,
        dask='parallelized',
        vectorize=True,
        output_dtypes=[float] * 4,
    )

    ds_out = area.to_dataset(name='area')
    for var_name, values in (('perimeter', perimeter), ('ax_max', ax_major), ('ax_min', ax_minor)):
        ds_out[var_name] = values
    return ds_out


In [3]:
# client.close()
# cluster.close()

In [4]:
# Settings for the SLURM-backed Dask cluster: each job requests 40 cores and
# 200GB on the "seseml" queue and runs a single worker process for 01:40:00.
# The scheduler listens on a fixed address and serves its dashboard on :7778.
_slurm_settings = dict(
    queue="seseml",
    memory='200GB',
    cores=40,
    processes=1,
    walltime='01:40:00',
    scheduler_options={'host': '172.22.179.3:7222', 'dashboard_address': ':7778'},
)
cluster = SLURMCluster(**_slurm_settings)

In [5]:
# Scale the cluster to one worker; the bare `cluster` expression renders its widget.
cluster.scale(1)
cluster

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [7]:
%%bash
squeue -u alfonso8

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            501700    seseml dask-wor alfonso8  R       0:10      1 keeling-j01


In [8]:
# Connect a distributed client to the SLURM cluster; the bare `client`
# expression renders its summary (scheduler address, dashboard, workers).
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.22.179.3:7778/status,

0,1
Dashboard: http://172.22.179.3:7778/status,Workers: 1
Total threads: 40,Total memory: 186.26 GiB

0,1
Comm: tcp://172.22.179.3:7222,Workers: 1
Dashboard: http://172.22.179.3:7778/status,Total threads: 40
Started: Just now,Total memory: 186.26 GiB

0,1
Comm: tcp://172.22.179.101:34252,Total threads: 40
Dashboard: http://172.22.179.101:33449/status,Memory: 186.26 GiB
Nanny: tcp://172.22.179.101:43187,
Local directory: /data/keeling/a/alfonso8/temp/dask-worker-space/worker-9grjqyes,Local directory: /data/keeling/a/alfonso8/temp/dask-worker-space/worker-9grjqyes


In [9]:
%%time
# Open the Zarr store under path_data (the commented line is an alternative store location).
ds_xr = xr.open_zarr(f'{path_data}/zarr_rckd/KUsKAs_Wn/lores.zarr')
# ds_xr = xr.open_zarr(f'{path_data}/zarr/KUsKAs_Wn/lores.zarr')
# Drop repeated time stamps, keeping the first occurrence of each.
ds_xr = ds_xr.sel(time=~ds_xr.get_index("time").duplicated())



CPU times: user 832 ms, sys: 102 ms, total: 934 ms
Wall time: 2.76 s


In [10]:
%%time
# Alternative selections restricted to a time window (kept for reference):
# ds_data = ds_xr[['zhh14', 'azimuth', 'DR']].sel(time=slice('2019-09-16 03:12:50', '2019-09-16 04:14:05'))
# ds_data = ds_xr[['zhh14', 'azimuth', 'DR']].sel(time=slice('2019-09-16 03:12:50', '2019-09-16 05:13:05'))
# Full time range.
# NOTE(review): ufunc_wrapper also reads `range` and `alt3d`; presumably these are
# coordinates that survive this variable selection -- confirm.
ds_data = ds_xr[['zhh14', 'azimuth', 'DR']]

# Number of time steps selected.
len(ds_data.time)

CPU times: user 0 ns, sys: 355 µs, total: 355 µs
Wall time: 361 µs


78800

In [11]:
# Build the Dataset of region properties (area, perimeter, ax_max, ax_min).
# Presumably still lazy at this point; it is evaluated by the dask.compute call that follows.
a = ufunc_wrapper(ds_data)

In [12]:
%%time
# Evaluate the result; dask.compute returns a tuple, so the Dataset is w[0].
w = dask.compute(a)

CPU times: user 19.1 s, sys: 17.1 s, total: 36.2 s
Wall time: 2min 57s


In [13]:
# Convert the computed Dataset to a DataFrame indexed by time.
df = w[0].to_dataframe()

In [14]:
df

Unnamed: 0_level_0,area,perimeter,ax_max,ax_min
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-08-24 22:59:26.499800,,,,
2019-08-24 22:59:28.349800,,,,
2019-08-24 22:59:30.199800,,,,
2019-08-24 22:59:32.049800,,,,
2019-08-24 22:59:33.899800,,,,
...,...,...,...,...
2019-10-05 06:53:45.099800,,,,
2019-10-05 06:53:46.949800,,,,
2019-10-05 06:53:48.799800,,,,
2019-10-05 06:53:50.649800,,,,


In [15]:
# Keep only time stamps for which every property is available. Rebinding the
# name (instead of inplace=True) avoids mutating the frame displayed earlier.
df = df.dropna(how='any')

In [16]:
df

Unnamed: 0_level_0,area,perimeter,ax_max,ax_min
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-08-24 23:05:44.899800,374.0,109.905592,49.788748,11.469844
2019-08-24 23:05:46.749800,778.0,157.941125,70.974171,15.102534
2019-08-24 23:05:48.599800,1042.0,203.734019,96.953685,14.638842
2019-08-24 23:05:50.449800,1022.0,209.012193,95.645586,14.033519
2019-08-24 23:05:52.299800,1136.0,198.355339,91.270273,16.491947
...,...,...,...,...
2019-10-05 06:47:07.349800,623.0,131.734019,57.604125,14.402662
2019-10-05 06:47:09.199800,455.0,127.698485,53.579750,13.356701
2019-10-05 06:47:11.049800,282.0,118.213203,49.839266,9.312619
2019-10-05 06:47:12.899800,209.0,104.556349,43.583627,7.936367


In [17]:
df.describe()

Unnamed: 0,area,perimeter,ax_max,ax_min
count,11123.0,11123.0,11123.0,11123.0
mean,1263.591927,216.356385,91.48644,19.004221
std,1171.913127,125.586256,51.899467,8.979559
min,51.0,25.071068,9.04745,1.404458
25%,286.0,110.701533,48.997061,10.511701
50%,840.0,201.012193,84.396309,20.532468
75%,2026.0,312.065494,127.095823,27.781003
max,5083.0,759.918831,278.431998,45.735278


Bounding box, max ref, mean (Ku, Ka), max ref (surf)