# New Clustering Code 5/10/24


The code here tests the integrated clusterer against the original clusterer.


In [3]:
from importlib import reload
from typing import Any, Dict, Optional

import numpy as np
import sklearn.datasets as dsets
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist, squareform

import daml._internal.metrics.clustering as cl

In [4]:
def get_condensed_distance_array(data):
    """Return the condensed pairwise Euclidean distances between the rows of ``data``."""
    condensed = pdist(data, metric="euclidean")
    return condensed


def get_square_distance_matrix(condensed_distance_array):
    """Expand a condensed distance vector into its square, symmetric matrix form."""
    square = squareform(condensed_distance_array)
    return square


def get_linkage_arr(condensed_distance_array):
    """Build the single-linkage hierarchy for a condensed distance vector."""
    link_arr = linkage(condensed_distance_array, method="single")
    return link_arr


def extend_linkage(link_arr):
    """
    Append a column holding the id that each linkage row's new cluster receives.

    Parameters
    ----------
    link_arr
        linkage matrix with one row per merge

    Returns
    -------
    arr
        float copy of the linkage matrix with one extra trailing column,
        shape (link_arr.shape[0], link_arr.shape[1] + 1)
    """
    n_merges, _ = link_arr.shape
    # Row i produces cluster id (n_merges + 1) + i, i.e. ids continue after the samples.
    new_ids = np.arange(n_merges + 1, 2 * n_merges + 1, dtype=np.float64).reshape(-1, 1)
    return np.hstack((link_arr.astype(np.float64), new_ids))


def get_extended_linkage(condensed_distance_array):
    """Compute the linkage matrix for the distances and append the new-id column."""
    return extend_linkage(get_linkage_arr(condensed_distance_array))


def fill_missing_cluster_level(left_id, right_id, level, clusters):
    """
    Carry a cluster's last known state forward into levels where it is missing.

    For each of ``left_id`` and ``right_id`` that is set, the cluster's entry at
    its own level is copied into every level from that level up to ``level - 1``
    that does not already contain the cluster. Copied entries are marked as not
    merged and not outside either standard-deviation band.

    Parameters
    ----------
    left_id, right_id
        ``(level_index, cluster_number)`` pair for each side of the merge, or a
        falsy value (e.g. ``None``) when that side is not an existing cluster
    level
        level at which the current merge is being recorded
    clusters
        ``{level: {cluster_number: info_dict}}`` mapping; modified in place

    Returns
    -------
    clusters
        the same mapping that was passed in
    """
    for cluster_ref in (left_id, right_id):
        if not cluster_ref:
            continue

        source_level, cluster_number = cluster_ref
        if level == source_level + 1:
            # The cluster was last updated on the level just below; nothing to fill.
            continue

        source = clusters[source_level][cluster_number]
        for level_id in range(level - 1, source_level - 1, -1):
            if cluster_number in clusters[level_id]:
                continue
            # "samples" and "sample_dist" are shared with the source entry, not copied.
            clusters[level_id][cluster_number] = {
                "cluster_merged": False,
                "count": source["count"],
                "avg_dist": source["avg_dist"],
                "dist_std": source["dist_std"],
                "samples": source["samples"],
                "sample_dist": source["sample_dist"],
                "outside_1-std": False,
                "outside_2-std": False,
            }
    return clusters

In [5]:
class Clusterer:
    """
    Original (reference) clusterer used to flag outliers and duplicates.

    On construction the pairwise distances and the extended linkage matrix of
    the dataset are computed. ``run`` then walks the linkage matrix to build
    per-level cluster information, decides which cluster merges are acceptable,
    and reports outliers, potential outliers, duplicates and near duplicates.

    Parameters
    ----------
    dataset
        samples to cluster, one sample per row
    min_num_samples_per_cluster
        minimum cluster size; when falsy a value derived from the dataset size
        is used instead (5% of the samples, clamped to the range [2, 100])
    """

    def __init__(self, dataset: np.ndarray, min_num_samples_per_cluster: Optional[int] = None):
        self.min_cluster_size: Optional[int] = min_num_samples_per_cluster
        self._on_init(dataset)

    def _on_init(self, x):
        """(Re)compute every dataset-derived attribute for the samples ``x``."""
        self._data: np.ndarray = x
        self.num_samples = len(x)
        self.darr: np.ndarray = get_condensed_distance_array(x)
        self.sqdmat: np.ndarray = get_square_distance_matrix(self.darr)
        self.larr: np.ndarray = get_extended_linkage(self.darr)
        # Linkage rows whose new cluster holds exactly 2 samples each start a new cluster.
        self.max_clusters: int = np.count_nonzero(self.larr[:, 3] == 2)
        self.last_merge_level: int = 1
        self.min_num_samples_per_cluster: int = (
            int(min(100, max(2, self.num_samples * 0.05))) if not self.min_cluster_size else self.min_cluster_size
        )

    @property
    def data(self):
        """Dataset currently held by the clusterer."""
        return self._data

    @data.setter
    def data(self, x: np.ndarray):
        # Assigning new data recomputes distances, linkage and derived sizes.
        self._on_init(x)

    def create_clusters(self) -> Dict[int, Any]:
        """Generates clusters based on linkage matrix

        Also updates ``self.last_merge_level`` whenever two existing clusters merge.

        Returns
        -------
        Dict[int, Any]
            Cluster information as ``{level: {cluster_number: info_dict}}``
        """
        cluster_num = 0
        cluster_tracking = 0
        clusters = {}  # Dictionary to store clusters
        tracking = {}  # Dictionary to associate new cluster ids with actual clusters

        # Walking through the linkage array to generate clusters
        for arr_i in self.larr:
            level = 0

            left_count = 0
            right_count = 0
            merged = False

            arr_0 = int(arr_i[0])  # Grabbing the left id
            arr_1 = int(arr_i[1])  # Grabbing the right id
            dist = arr_i[2]  # Getting the distance between the left and right ids

            new_sample = []
            # NOTE(review): float16 limits the precision/range of stored distances - confirm intended
            sample_dist = np.array([dist], dtype=np.float16)

            # Linkage matrix first column id
            left_id = tracking.get(arr_0)  # Determining if the id is already associated with a cluster
            if left_id is None:
                new_sample.append(arr_0)
            else:
                left_cluster = left_id[1]
                left_level = left_id[0] + 1
                left_count = clusters[left_id[0]][left_cluster]["count"]
                left_sample = clusters[left_id[0]][left_cluster]["samples"]
                sample_dist = np.concatenate([clusters[left_id[0]][left_cluster]["sample_dist"], sample_dist])

            # Linkage matrix second column id
            right_id = tracking.get(arr_1)  # Determining if the id is already associated with a cluster
            if right_id is None:
                new_sample.append(arr_1)
            else:
                right_cluster = right_id[1]
                right_level = right_id[0] + 1
                right_count = clusters[right_id[0]][right_cluster]["count"]
                right_sample = clusters[right_id[0]][right_cluster]["samples"]
                sample_dist = np.concatenate([clusters[right_id[0]][right_cluster]["sample_dist"], sample_dist])

            # Aggregate samples, determine cluster number, and get the level
            if left_id and right_id:
                # Two existing clusters merge: the larger one's samples come first,
                # the lower number is kept and the higher number is recorded as merged.
                if left_count > right_count:
                    samples = np.concatenate([left_sample, right_sample])
                else:
                    samples = np.concatenate([right_sample, left_sample])
                cluster_num = min([left_cluster, right_cluster])
                merged = max([left_cluster, right_cluster])
                level = max([left_level, right_level])
            elif left_id:
                samples = np.concatenate([left_sample, new_sample])
                cluster_num = left_cluster
                level = left_level
            elif right_id:
                samples = np.concatenate([right_sample, new_sample])
                cluster_num = right_cluster
                level = right_level
            else:
                # Two raw samples start a brand new cluster at level 0
                samples = np.array(new_sample, dtype=np.int32)
                cluster_num = cluster_tracking

            dist_avg = np.mean(sample_dist)
            dist_std = np.std(sample_dist) if sample_dist.shape[0] > 1 else 1e-5

            out1 = dist_avg + dist_std
            out2 = out1 + dist_std

            # Record the cluster, creating the level on first use
            clusters.setdefault(level, {})[cluster_num] = {
                "cluster_merged": merged,
                "count": samples.shape[0],
                "avg_dist": dist_avg,
                "dist_std": dist_std,
                "samples": samples,
                "sample_dist": sample_dist,
                "outside_1-std": dist > out1,
                "outside_2-std": dist > out2,
            }

            tracking[int(arr_i[-1])] = (level, cluster_num)  # Associates the new linkage id with the correct cluster

            if not left_id and not right_id:
                # Making sure that new clusters get unique numbers
                cluster_tracking += 1
            else:
                # Fill missing cluster levels for continuity.
                # Ensures all levels have consistent information across cluster changes.
                clusters = fill_missing_cluster_level(left_id, right_id, level, clusters)

            # Only tracking the levels in which clusters merge for the cluster distance matrix
            if merged:
                self.last_merge_level = max(self.last_merge_level, level + 1)

        return clusters

    def get_cluster_distances(self, clusters):
        """
        Build the per-level cluster distance matrix.

        Parameters
        ----------
        clusters
            Cluster information as returned by ``create_clusters``

        Returns
        -------
        cluster_matrix
            float32 array of shape (last_merge_level, max_clusters, max_clusters).
            The diagonal holds each cluster's "avg_dist", off-diagonal entries hold
            the minimum sample-to-sample distance between two clusters, and -1 marks
            entries that were never filled.
        """
        # this is the cluster distance matrix
        cluster_matrix = np.full((self.last_merge_level, self.max_clusters, self.max_clusters), -1.0, dtype=np.float32)

        for level, cluster_set in clusters.items():
            if level >= self.last_merge_level:
                continue
            cluster_ids = sorted(cluster_set.keys())
            for i, cluster_id in enumerate(cluster_ids):
                cluster_matrix[level, cluster_id, cluster_id] = cluster_set[cluster_id]["avg_dist"]
                sample_a = cluster_set[cluster_id]["samples"]
                # Only the upper triangle is computed; the result is mirrored.
                for compare_id in cluster_ids[i + 1 :]:
                    sample_b = cluster_set[compare_id]["samples"]
                    min_mat = self.sqdmat[np.ix_(sample_a, sample_b)].min()
                    cluster_matrix[level, cluster_id, compare_id] = min_mat
                    cluster_matrix[level, compare_id, cluster_id] = min_mat

        return cluster_matrix

    def get_merge_levels(self, clusters):
        """
        Runs through the clusters dictionary determining when clusters merge,
        and how close are those clusters when they merge.

        Parameters
        ----------
        clusters:
            A dictionary containing the original clusters information.

        Returns
        -------
        merge_clusters:
            A dictionary with each clusters merge history, keyed by category
            ("merge", "likely_merge", "no_merge"), then cluster number, then level.
            Each record is ``[merged_cluster]`` or ``[merged_cluster, "low"]``.
        """

        merge_clusters = {"merge": {}, "likely_merge": {}, "no_merge": {}}

        for level, cluster_set in clusters.items():
            for cluster_id in sorted(cluster_set.keys()):
                # Extract necessary information
                info = cluster_set[cluster_id]
                merged = info["cluster_merged"]
                if not merged:
                    continue

                big_enough = len(info["samples"]) >= self.min_num_samples_per_cluster

                if info["outside_2-std"]:
                    if big_enough:
                        category, record = "no_merge", [merged]
                    else:
                        # Far apart, but the cluster is too small to stand on its own
                        category, record = "likely_merge", [merged, "low"]
                elif info["outside_1-std"] and big_enough:
                    category, record = "likely_merge", [merged]
                else:
                    category, record = "merge", [merged]

                # Keep the first record seen for a given (cluster, level) pair
                merge_clusters[category].setdefault(cluster_id, {}).setdefault(level, record)

        return merge_clusters

    def generate_merge_list(self, cluster_merges, cluster_matrix):
        """
        Label every recorded merge as "merge" or "no-merge".

        Parameters
        ----------
        cluster_merges
            Merge history as returned by ``get_merge_levels``
        cluster_matrix
            Cluster distance matrix as returned by ``get_cluster_distances``

        Returns
        -------
        merge_list
            ``[level, outer_cluster, inner_cluster, status]`` entries sorted in
            descending order
        """
        merge_list, merge_mean, intra_max = self.cluster_merging(cluster_merges, cluster_matrix)
        desired_merge, merge = self.get_desired_merge(merge_mean, intra_max)

        # ``merge`` only has entries for the merges that were not directly desired,
        # so it is consumed with its own running index.
        j = 0
        for i, select in enumerate(desired_merge):
            if select:
                merge_list[i].append("merge")
            else:
                if merge[j]:
                    merge_list[i].append("merge")
                else:
                    merge_list[i].append("no-merge")
                j += 1

        merge_list = sorted(merge_list, reverse=True)
        return merge_list

    def cluster_merging(self, cluster_merges, cluster_matrix):
        """
        Collect, for every recorded merge, the distances used to judge it.

        Parameters
        ----------
        cluster_merges
            Merge history as returned by ``get_merge_levels``
        cluster_matrix
            Cluster distance matrix as returned by ``get_cluster_distances``

        Returns
        -------
        merge_list
            ``[level, outer_cluster, inner_cluster]`` for each merge
        merge_mean
            per merge, the max (category "merge") or mean (other categories) of the
            distances between the two clusters on the levels before the merge
        intra_max
            per merge, the largest non-negative diagonal value of the outer cluster
        """
        intra_max = []
        merge_mean = []
        merge_list = []
        # Process each merge type
        for merge_type, merge_clusters in cluster_merges.items():
            for outer_cluster, inner_clusters in merge_clusters.items():
                for level, cluster_list in inner_clusters.items():
                    inner_cluster = cluster_list[0]

                    # Get the slice of the distance matrix up to the level before merging
                    distances = cluster_matrix[:level, outer_cluster, inner_cluster]
                    intra_distance = cluster_matrix[:, outer_cluster, outer_cluster]
                    # Negative entries are unfilled slots of the distance matrix
                    mask = intra_distance >= 0
                    intra_filtered = intra_distance[mask]
                    intra_max.append(np.max(intra_filtered))

                    # Grabbing the corresponding desired values
                    if merge_type == "merge":
                        merge_mean.append(np.max(distances))
                    else:
                        merge_mean.append(np.mean(distances))

                    merge_list.append([level, outer_cluster, inner_cluster])

        return merge_list, merge_mean, intra_max

    def get_desired_merge(self, merge_mean, intra_max):
        """
        Decide which merges are acceptable by comparing log distances.

        Parameters
        ----------
        merge_mean
            per-merge distance between the two merging clusters
        intra_max
            per-merge maximum intra-cluster distance

        Returns
        -------
        desired_merge
            boolean array, True where the log merge distance is below the mean plus
            two standard deviations of the unique log intra-cluster maxima
        merge
            boolean array with one entry per non-desired merge, True where its
            relative deviation from the threshold is small enough to merge anyway
        """
        intra_max = np.unique(intra_max)
        intra_value = np.log(intra_max)
        intra_value = intra_value.mean() + 2 * intra_value.std()
        merge_value = np.log(merge_mean)
        desired_merge = merge_value < intra_value

        check = merge_value[~desired_merge]
        check = np.abs((check - intra_value) / intra_value)
        close = check[check < 1]
        if close.size == 0:
            # No candidate lies within the threshold's own magnitude: averaging an
            # empty selection would give NaN (and a RuntimeWarning), so nothing
            # extra is merged.
            return desired_merge, np.zeros(check.shape, dtype=bool)

        good = close.mean() + close.std()
        merge = check < good
        return desired_merge, merge

    def get_last_merge_levels(self, merge_list):
        """
        Determine the last good merge level for the clusters involved in a "no-merge".

        Parameters
        ----------
        merge_list
            ``[level, outer_cluster, inner_cluster, status]`` entries as returned
            by ``generate_merge_list``

        Returns
        -------
        last_good_merge_levels
            ``{cluster_number: level}``; only clusters that took part in at least
            one "no-merge" entry are present
        """
        last_good_merge_levels = {}
        for entry in merge_list:
            level, outer_cluster, inner_cluster, status = entry
            if status == "no-merge":
                if outer_cluster not in last_good_merge_levels:
                    last_good_merge_levels[outer_cluster] = 1
                if inner_cluster not in last_good_merge_levels:
                    last_good_merge_levels[inner_cluster] = 1
                if last_good_merge_levels[outer_cluster] > level:
                    last_good_merge_levels[outer_cluster] = level - 1
            else:
                if outer_cluster in last_good_merge_levels:
                    last_good_merge_levels[outer_cluster] = max(last_good_merge_levels[outer_cluster], level)
        return last_good_merge_levels

    def find_duplicates(self, dedup_std_list):
        """
        Find pairs of samples that are (nearly) identical.

        Parameters
        ----------
        dedup_std_list
            distance standard deviations of the retained clusters; their mean is
            the near-duplicate threshold and one hundredth of it the exact threshold

        Returns
        -------
        exact_dedup
            ``(i, j)`` index pairs with ``i < j`` closer than the exact threshold
        possible_dedup
            ``(i, j)`` index pairs with ``i < j`` closer than the near-duplicate
            threshold but not closer than the exact threshold
        """
        if len(dedup_std_list) == 0:
            # Without any reference deviation there is no threshold; the mean of an
            # empty list would be NaN (with a RuntimeWarning) and match nothing.
            return [], []

        # Strict upper triangle so every pair is reported once and never (i, i)
        diag_mask = np.ones(self.sqdmat.shape, dtype=bool)
        np.fill_diagonal(diag_mask, 0)
        diag_mask = np.triu(diag_mask)

        threshold = np.mean(dedup_std_list)

        exact_mask = self.sqdmat < (threshold / 100)
        exact_indices = np.nonzero(exact_mask & diag_mask)
        exact_dedup = list(zip(exact_indices[0], exact_indices[1]))

        possible_mask = self.sqdmat < threshold
        possible_indices = np.nonzero(possible_mask & diag_mask & ~exact_mask)
        possible_dedup = list(zip(possible_indices[0], possible_indices[1]))

        return exact_dedup, possible_dedup

    def find_outliers(self, clusters, last_merge_levels):
        """
        The clusters dictionary contains whether the added sample/cluster
        was outside 1 standard deviation or outside 2 standard deviations.
        last_merge_levels contains the last good merge for each cluster we care about
        Using this information to determine when the sample was added to the cluster
        and how far it was from the cluster when it was added

        Parameters
        ----------
        clusters
            Cluster information as returned by ``create_clusters``
        last_merge_levels
            ``{cluster_number: level}`` of the last good merge per cluster

        Returns
        -------
        outliers
            last sample of every non-merge entry above the cluster's last good merge
            level that was outside 2 standard deviations
        possible_outliers
            same, for entries outside 1 standard deviation whose cluster has at
            least ``min_num_samples_per_cluster`` samples
        """
        outliers = []
        possible_outliers = []

        for level, cluster_set in clusters.items():
            for cluster_id in sorted(cluster_set.keys()):
                # Extract necessary information
                info = cluster_set[cluster_id]
                samples = info["samples"]

                if cluster_id not in last_merge_levels or info["cluster_merged"]:
                    continue
                if level <= last_merge_levels[cluster_id]:
                    continue

                if info["outside_2-std"]:
                    outliers.append(samples[-1])
                elif info["outside_1-std"] and len(samples) >= self.min_num_samples_per_cluster:
                    possible_outliers.append(samples[-1])
        return outliers, possible_outliers

    def run(self):
        """
        Execute the full workflow.

        Returns
        -------
        dict
            Keys "outliers", "potential_outliers", "duplicates" and "near_duplicates"
        """
        sample_info = self.create_clusters()

        if self.max_clusters > 1:
            cluster_matrix = self.get_cluster_distances(sample_info)
            merge_levels = self.get_merge_levels(sample_info)
            merge_list = self.generate_merge_list(merge_levels, cluster_matrix)
            last_merge_levels = self.get_last_merge_levels(merge_list)
        else:
            # NOTE(review): assumes this level exists in sample_info for cluster 0 - confirm
            last_merge_levels = {0: int(max(self.num_samples * 0.1, self.min_num_samples_per_cluster))}

        outliers, potential_outliers = self.find_outliers(sample_info, last_merge_levels)

        dedup_std = []
        for cluster, level in last_merge_levels.items():
            level_cluster = sample_info[level][cluster]
            samples = level_cluster["samples"]
            if samples.shape[0] < self.min_num_samples_per_cluster:
                # Clusters that are still too small at their last good level are all outliers
                outliers.extend(samples.tolist())
            else:
                dedup_std.append(level_cluster["dist_std"])

        duplicates, near_duplicates = self.find_duplicates(dedup_std)

        ret = {
            "outliers": outliers,
            "potential_outliers": potential_outliers,
            "duplicates": duplicates,
            "near_duplicates": near_duplicates,
        }

        return ret

# Testing Data


In [6]:
# Plot styling options; not referenced by any other cell shown in this notebook.
plot_kwds = {"alpha": 0.5, "s": 50, "linewidths": 0}

# moons, _ = dsets.make_moons(n_samples=50, noise=0.1)
# Blob dataset around five 2-D centers, generated with a fixed random_state.
blobs, _ = dsets.make_blobs(  # type: ignore
    n_samples=100,
    centers=[(-1.5, 1.8), (-1, 3), (0.8, 2.1), (2.8, 1.5), (2.5, 3.5)],  # type: ignore
    cluster_std=0.3,
    random_state=33,
)
# test_data = np.vstack([moons, blobs])
# test_data is the same array object as blobs, so the edits below change both.
test_data = blobs
# Plant an exact duplicate (sample 79 copies sample 24) ...
test_data[79] = test_data[24]
# ... and a second pair that differs only by 1e-5 per coordinate (63 vs 58).
test_data[63] = test_data[58] + 1e-5

# Test Functions


In [None]:
# Shaun's code for testing the clusterer
def test_outliers(x):
    assert len(x) == 6
    for val in x:
        assert val in [21, 6, 4, 71, 38, 11]
    print("Passed")


def test_potential_outliers(x):
    assert len(x) == 5
    for val in x:
        assert val in [42, 48, 9, 1, 43]
    print("Passed")


def test_duplicates(x):
    assert x == [(24, 79), (58, 63)]
    print("Passed")


def test_duplicates_new(x):
    """The new clusterer uses List of lists instead of List of sets"""
    assert x == [[24, 79], [58, 63]]
    print("Passed")


def test_near_duplicates(x):
    assert x == [
        (8, 27),
        (10, 65),
        (16, 99),
        (19, 64),
        (22, 87),
        (27, 29),
        (33, 76),
        (39, 55),
        (40, 72),
        (41, 62),
        (80, 81),
        (80, 93),
        (81, 93),
        (87, 95),
    ]
    print("Passed")


def test_near_duplicates_new(x):
    """The new clusterer groups overlapping indices"""
    assert x == [
        [8, 27, 29],
        [10, 65],
        [16, 99],
        [19, 64],
        [22, 87, 95],
        [33, 76],
        [39, 55],
        [40, 72],
        [41, 62],
        [80, 81, 93],
    ]
    print("Passed")


def run_tests_original(x):
    """Run every check against the result dictionary of the original clusterer."""
    checks = (
        (test_outliers, "outliers"),
        (test_potential_outliers, "potential_outliers"),
        (test_duplicates, "duplicates"),
        (test_near_duplicates, "near_duplicates"),
    )
    for check, key in checks:
        check(x[key])


def run_tests_new(x):
    """Run every check against the result dictionary of the integrated clusterer."""
    checks = (
        (test_outliers, "outliers"),
        (test_potential_outliers, "potential_outliers"),
        (test_duplicates_new, "duplicates"),
        (test_near_duplicates_new, "near_duplicates"),
    )
    for check, key in checks:
        check(x[key])

# Run Tests


In [None]:
# Re-import the integrated clustering module so edits to it are picked up.
reload(cl)

# clusterer is the original implementation defined in this notebook,
# clusterer2 the integrated one from the daml package.
clusterer = Clusterer(test_data)
clusterer2 = cl.Clusterer(test_data)

x1 = clusterer.run()
x2 = clusterer2.evaluate()

# Test they give the same outcomes
# (duplicates and near duplicates are checked in each clusterer's own output format)
run_tests_original(x1)
run_tests_new(x2)

Passed
Passed
Passed
Passed
Passed
Passed
Passed
Passed


### Timing Similar Processes

A process is defined by the inputs and outputs of a function, or group of functions, that both Clusterer classes share.


Since timeit does not return outputs, some outputs must be pre-calculated.


In [None]:
# Fresh instances so the timings below do not reuse state from the test run.
clusterer = Clusterer(test_data)
clusterer2 = cl.Clusterer(test_data)

# Clusterer pre-computed outputs
# Each step's result is kept so the next step can be timed in isolation.
info = clusterer.create_clusters()
cluster_matrix = clusterer.get_cluster_distances(info)
merge_levels = clusterer.get_merge_levels(info)
merge_list = clusterer.generate_merge_list(merge_levels, cluster_matrix)
last_merge_levels = clusterer.get_last_merge_levels(merge_list)
# Same selection as in Clusterer.run: keep the dist_std of clusters that are large enough.
dedup_std = []
for cluster, level in last_merge_levels.items():
    level_cluster = info[level][cluster]
    samples = level_cluster["samples"]
    if samples.shape[0] >= clusterer.min_num_samples_per_cluster:
        dedup_std.append(level_cluster["dist_std"])

# Integrated Clusterer pre-computed outputs
last_merge_levels2 = clusterer2.last_good_merge_levels

### Individual Timings


In [None]:
# Time each comparable step of the original (clusterer) and integrated (clusterer2) classes.
print(">>>>>\tCreate Clusters\t<<<<<")
%timeit clusterer.create_clusters()
%timeit clusterer2.clusters
# The original clusterer needs four calls to arrive at the last merge levels.
print(">>>>>\tOriginal Clusterer Last Merge Levels\t<<<<<")
%timeit clusterer.get_cluster_distances(info)
%timeit clusterer.get_merge_levels(info)
%timeit clusterer.generate_merge_list(merge_levels, cluster_matrix)
%timeit clusterer.get_last_merge_levels(merge_list)
print(">>>>>\tIntegrated Clusterer Last Merge Levels\t<<<<<")
%timeit clusterer2._get_last_merge_levels()
print(">>>>>\tOutliers\t<<<<<")
%timeit clusterer.find_outliers(info, last_merge_levels)
%timeit clusterer2.find_outliers(last_merge_levels2)
print(">>>>>\tDuplicates\t<<<<<")
%timeit clusterer.find_duplicates(dedup_std)
# NOTE(review): this passes the original clusterer's last_merge_levels, while the
# outlier timing above passes last_merge_levels2 - confirm which one is intended.
%timeit clusterer2.find_duplicates(last_merge_levels)

>>>>>	Create Clusters	<<<<<
7 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
864 ns ± 25.4 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
>>>>>	Original Clusterer Last Merge Levels	<<<<<
31.2 ms ± 2.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
162 µs ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
919 µs ± 21.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
13.2 µs ± 180 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
>>>>>	Integrated Clusterer Last Merge Levels	<<<<<
30.3 ms ± 888 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
>>>>>	Outliers	<<<<<
132 µs ± 2.48 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
57.9 µs ± 1.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
>>>>>	Duplicates	<<<<<
156 µs ± 2.58 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
233 µs ± 7.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Timing Clusterer Init


In [10]:
# Test the __init__ times
# First line: original clusterer from this notebook; second line: integrated clusterer.
%timeit Clusterer(test_data)
%timeit cl.Clusterer(test_data)

123 µs ± 2.25 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
122 µs ± 3.36 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### Timing Full Clusterer Workflow


In [11]:
# End-to-end timing: construction plus the full run/evaluate call for each clusterer.
print("Original clusterer workflow")
%timeit Clusterer(test_data).run()
print("Integrated clusterer workflow")
%timeit cl.Clusterer(test_data).evaluate()

Original clusterer workflow
21.2 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Integrated clusterer workflow
20.3 ms ± 750 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
