# Set up code


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets as dsets

from dataeval.detectors.linters import Clusterer as fc
from dataeval.utils._clusterer import sorted_union_find

In [None]:
# Load the test point set from a local .npy file; the path is relative to the
# notebook's working directory. Later cells plot small.T[0] against small.T[1],
# so the array is expected to have at least two columns.
small = np.load('./data/clusterable_data.npy')

In [None]:
# Quick visual check of the raw points: rows 0 and 1 of the transposed array
# become the x and y coordinates.
plot_kwds = dict(alpha=0.5, s=50, linewidths=0)

plt.figure(figsize=(20, 20))
plt.scatter(*small.T[:2], **plot_kwds)
plt.show()

In [7]:
# Rebind `small` to a synthetic dataset from scikit-learn's make_circles
# (a larger circle containing a smaller one); `small_label` receives the
# labels returned alongside the points.
# random_state is fixed so repeated runs generate the same points.
small, small_label = dsets.make_circles(
    n_samples=200000, factor=0.5, noise=0.05, random_state=30
)

In [None]:
# Plot every point currently held in `small` on a large canvas.
plot_kwds = dict(alpha=0.5, s=50, linewidths=0)

plt.figure(figsize=(20, 20))
plt.scatter(*small.T[:2], **plot_kwds)

# Optional debugging aid, left disabled: label selected points with their row
# index. It needs an `edge_points` collection of index arrays, which none of
# the cells visible here define.
# that = np.concatenate(edge_points)
# for i, (x, y) in enumerate(small[that,:2]):
#     plt.annotate(str(that[i]), (x, y), textcoords="offset points", xytext=(0, 1), ha="center")

plt.show()

In [None]:
# Build a synthetic 2-feature dataset from three independent make_blobs draws
# and stack them into `small` / `small_label`:
#   * `blob`  - 48000 points spread over four centres (labels 0-3),
#   * `blob2` - 1500 points around one centre drawn from [300, 350] per axis,
#   * `blob3` - 500 points around one centre drawn from [-400, -350] per axis.
# Each call uses its own fixed random_state so the data is reproducible.
blob, blob_label = dsets.make_blobs(  # type: ignore
    n_samples=48000,
    n_features=2,
    centers=4,
    center_box=(-250, 200),
    cluster_std=35,
    random_state=31,
)
blob2, blob_label2 = dsets.make_blobs(  # type: ignore
    n_samples=1500,
    n_features=2,
    centers=1,
    center_box=(300, 350),
    cluster_std=50,
    random_state=35,
)
blob3, blob_label3 = dsets.make_blobs(  # type: ignore
    n_samples=500,
    n_features=2,
    centers=1,
    # center_box is (min, max). It was previously given as (-350, -400), which
    # hands the underlying uniform draw low > high - behaviour NumPy documents
    # as undefined - so the bounds are now in ascending order.
    center_box=(-400, -350),
    cluster_std=25,
    random_state=33,
)
small = np.concatenate([blob, blob2, blob3])
# Each single-centre call labels its points 0, so shift them past the four
# labels (0-3) used by `blob`: blob2 -> 4, blob3 -> 5.
small_label = np.concatenate([blob_label, blob_label2 + 4, blob_label3 + 5])

In [None]:
# Colour every point by its generated label.
# One single-letter matplotlib colour per label value (six entries).
label_to_color = np.array(list("brgymc"))

# Shared marker styling for the scatter call.
plot_kwds = dict(alpha=0.5, s=50, linewidths=0)

# Vectorised lookup: index the palette with the integer label array.
color_array = label_to_color[small_label]

plt.figure(figsize=(20, 20))
plt.scatter(*small.T[:2], c=color_array, **plot_kwds)
plt.show()

# Test code below


In [8]:
# Build the clusterer on the current point set; `fc` is dataeval's `Clusterer`
# as aliased in the import cell.
clustering = fc(small)

In [9]:
# Run the clusterer's evaluation and keep the returned object; later cells
# read its `clusters` attribute.
result = clustering.evaluate()

In [None]:
# Count the points whose cluster id is -1.
# NOTE(review): -1 presumably marks points not assigned to any cluster
# (outliers/noise) - confirm against the dataeval documentation.
np.count_nonzero(result.clusters == -1)

In [None]:
# Time constructing the clusterer: 3 repeats (-r 3), one execution per repeat (-n 1).
%timeit -r 3 -n 1 fc(small)

In [None]:
# Time evaluate() on the already-built clusterer: 3 repeats (-r 3), one execution per repeat (-n 1).
%timeit -r 3 -n 1 clustering.evaluate()

In [None]:
# Colour every point by the cluster id found by the clusterer.
# Eight-entry palette; cluster ids beyond it wrap around through the modulo.
label_to_color = np.array(["b", "r", "g", "y", "m", "c", "tab:orange", "tab:purple"])

# Shared marker styling for the scatter call.
plot_kwds = dict(alpha=0.5, s=50, linewidths=0)

# Look up a palette colour per point. The indexing yields a fresh array, so
# the entries whose cluster id is -1 can be overwritten in place with black.
color_array = label_to_color[result.clusters % len(label_to_color)]
color_array[result.clusters == -1] = "k"

plt.figure(figsize=(20, 20))
plt.scatter(*small.T[:2], c=color_array, **plot_kwds)
plt.show()