# Benchmarking the cluster performance of VoCC against DBSCAN on synthetic data
We used synthetic data that consits of vortices with noise along the movement vectors and background noise to assess the clustering performance of VoCC.
We compared how well vortices are detected by VoCC in comparison to DBSCAN by computing the precision and recall of clustered vs noise datapoints.

In [2]:
# IMPORT
%run ../scripts/experiment_utils.py

from matplotlib.cm import get_cmap
from matplotlib.patches import Circle
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KDTree
from VoCC import VortexCorrelationClustering

In [3]:
# Both benchmark functions for the generated data.
def benchmark_dbscan(coordinates):
    dbscan = DBSCAN(eps = 12, min_samples=12).fit(coordinates[:,:2])
    predicted_labels = dbscan.labels_
    L = coordinates[:, 4]

    return predicted_labels, calc_rates(predicted_labels, L)


def benchmark(coordinates, dimension_x, dimension_y, possible_radii):
    L = coordinates[:, 4]
    vocc = VortexCorrelationClustering(radii = possible_radii, resolution = 1, sectors = 15, circle_coverage_rate = .8, qth_threshold = .99, min_points = .01)
    predicted_labels = vocc.fit(coordinates[:, :4]).labels_

    return predicted_labels, calc_rates(predicted_labels, L)

In [11]:
# We used a set of fixed seeds to generate the synthethic data and cluster them with VoCC and DBSCAN.
# The resulting labels are checked against the ground truth to quantify the clustering performance. 
from tqdm import tqdm
precisions_vorcc, recalls_vorcc, f1scores_vorcc = [], [], []
precisions_db, recalls_db, f1scores_db = [], [], []
seeds = [2919, 7166, 3756, 5454, 3283, 4691, 6828, 7350, 1156, 3061, 7157,
       5919, 5893, 4445, 6436, 1292, 9752, 9175, 7902, 2067, 1979, 1736,
       1918,  700, 4046, 2073, 4661,  394, 3048, 3452,  522, 9754, 8741,
       1468, 6230, 8116, 9024, 6811, 1421, 7987,  285, 2745, 7108, 4464,
       1445, 5837, 1466, 9968, 7167, 9106, 8479, 4426, 5932, 9996, 1178,
       2551, 4370, 9780, 6322, 9967, 4862, 9179, 5855, 1203, 7349, 6386,
       1119, 8388, 3975, 8717, 9602, 1292, 9386, 3655, 4786, 2217, 6590,
       1017, 5176, 3590, 8239, 9019, 6176, 6647, 4722, 6015, 5122, 7683,
       2089, 2056, 9010, 3680, 4885, 3114, 7383, 1629, 1543, 7512, 5769,
       1904]

dim_x, dim_y, radii = 500, 300, np.arange(15,55, 5)
snr, n_vortices = .75, 17
for seed in tqdm(seeds):
    np.random.seed(seed)
    coordinates = create_benchmarking_dataset(dim_x, dim_y, n_vortices, 100, snr, radii, 0)
    predicted_labels, results = benchmark(coordinates, 500, 300, radii)
    precisions_vorcc.append(results['TP'] / (results['TP'] + results['FP']))
    recalls_vorcc.append(results['TP'] / (results['TP'] + results['FN']))
    f1scores_vorcc.append(2*results['TP'] / (results['TP'] + results['FN'] +  results['FP']))
    predicted_labels, results = benchmark_dbscan(coordinates)
    precisions_db.append(results['TP'] / (results['TP'] + results['FP']))
    recalls_db.append(results['TP'] / (results['TP'] + results['FN']))
    f1scores_db.append(2*results['TP'] / (results['TP'] + results['FN'] +  results['FP']))


print(f'VorCC precision {np.mean(precisions_vorcc) * 100:.2f}; recall {np.mean(recalls_vorcc) * 100:.2f}; f1score {np.mean(f1scores_vorcc)}')
print(f'DBSCAN precision {np.mean(precisions_db) * 100:.2f}; recall {np.mean(recalls_db) * 100:.2f}; f1score {np.mean(f1scores_db)}')

100%|██████████| 100/100 [44:11<00:00, 26.51s/it]

VorCC precision 98.05; recall 81.07; f1score 1.5953981853586796
DBSCAN precision 91.16; recall 81.88; f1score 1.515910306942827



