In [None]:
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist, squareform

In [None]:
# MNIST Data
import hashlib
import os
import typing
from urllib.error import HTTPError, URLError
from urllib.request import urlretrieve


def download_mnist() -> str:
    """Fetch (or reuse a cached copy of) the MNIST ``.npz`` archive.

    The download logic originates from keras/datasets:
    https://github.com/keras-team/keras/blob/v2.15.0/keras/datasets/mnist.py#L25-L86

    Returns:
        The local path returned by ``_get_file`` for ``mnist.npz``.
    """
    archive_name = "mnist.npz"
    base_url = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
    expected_sha256 = "731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1"

    return _get_file(
        archive_name,
        origin=base_url + archive_name,
        file_hash=expected_sha256,
    )


def _get_file(
    fname: str,
    origin: str,
    file_hash: typing.Optional[str] = None,
):
    """Return the local path of ``fname``, downloading it from ``origin`` if needed.

    Files are kept in ``~/.keras/datasets``; when ``~/.keras`` is not writable
    the cache falls back to ``/tmp/.keras/datasets``.

    Parameters:
    - fname: file name (or path-like) of the cached file inside the cache folder.
    - origin: URL the file is fetched from.
    - file_hash: optional sha256 hex digest used to validate the file.

    Returns:
    - The path of the cached file.

    Raises:
    - Exception: when the URL cannot be fetched (wraps HTTPError / URLError).
    - ValueError: when a freshly downloaded file does not match ``file_hash``.
    """
    keras_home = os.path.expanduser(os.path.join(os.path.expanduser("~"), ".keras"))
    if not os.access(keras_home, os.W_OK):
        keras_home = os.path.join("/tmp", ".keras")
    dataset_dir = os.path.join(keras_home, "datasets")
    os.makedirs(dataset_dir, exist_ok=True)

    if isinstance(fname, os.PathLike):
        fname = os.fspath(fname)
    fpath = os.path.join(dataset_dir, fname)

    # A cached copy is reused unless a hash was supplied and it does not match.
    cache_is_usable = os.path.exists(fpath) and (
        file_hash is None or _validate_file(fpath, file_hash)
    )
    if cache_is_usable:
        return fpath

    error_msg = "URL fetch failure on {}: {} -- {}"

    def fetch():
        # Translate urllib errors into a generic Exception carrying the URL.
        try:
            urlretrieve(origin, fpath)
        except HTTPError as e:
            raise Exception(error_msg.format(origin, e.code, e.msg)) from e
        except URLError as e:
            raise Exception(error_msg.format(origin, e.errno, e.reason)) from e

    try:
        fetch()
    except (Exception, KeyboardInterrupt):
        # Never leave a partially written file behind.
        if os.path.exists(fpath):
            os.remove(fpath)
        raise

    corrupted = (
        os.path.exists(fpath)
        and file_hash is not None
        and not _validate_file(fpath, file_hash)
    )
    if corrupted:
        raise ValueError(
            "Incomplete or corrupted file detected. "
            f"The sha256 file hash does not match the provided value "
            f"of {file_hash}.",
        )
    return fpath


def _validate_file(fpath, file_hash, chunk_size=65535):
    hasher = hashlib.sha256()
    with open(fpath, "rb") as fpath_file:
        for chunk in iter(lambda: fpath_file.read(chunk_size), b""):
            hasher.update(chunk)

    return str(hasher.hexdigest()) == str(file_hash)


# Download (or reuse the cached) MNIST archive as soon as this cell runs and keep its path.
mnist_path = download_mnist()

In [None]:
# Create
rng = np.random.default_rng(33)
size = 12
# How many of the leading images get a shifted near-duplicate, and by how many
# pixels the duplicate is shifted along both image axes.
n_duplicates = 8
shift = 3

path = download_mnist()
# The MNIST arrays are plain numeric data, so pickled objects are not needed;
# refusing them avoids executing code from a tampered archive.
with np.load(path, allow_pickle=False) as fp:
    images, labels = fp["x_train"][:size], fp["y_train"][:size]

# astype() already returns a fresh float copy, so no extra deepcopy is required.
dup_images = images[:n_duplicates].astype("float64")
# Shift the content towards the top-left corner by `shift` pixels ...
dup_images[:, :-shift, :-shift] = images[:n_duplicates, shift:, shift:]
# ... and wrap the original top-left corner into the bottom-right corner.
dup_images[:, -shift:, -shift:] = images[:n_duplicates, :shift, :shift]

# Originals followed by their shifted copies, scaled to [0, 1].
test_imgs = np.concatenate([images, dup_images])
test_imgs /= 255

# Shuffle along the first axis so duplicates are not adjacent to their source.
rng.shuffle(test_imgs)

# One flattened row per image.
data = test_imgs.reshape((test_imgs.shape[0], -1))
print(data.shape)

In [None]:
# Pick the preview index from the seeded generator; the legacy global
# np.random state is never seeded here, so it gave a different image on every run.
r = rng.integers(0, len(images))
plt.imshow(images[r])

### Functions


In [None]:
def create_clusters(arr):
    """Summarise every merge of an augmented linkage matrix.

    Each row of ``arr`` is read as: columns 0 and 1 are the ids of the two
    merged items, column 2 is the merge distance and the last column is the
    id given to the result of the merge.

    Returns:
        A dict keyed by the last-column value of each row. Every entry holds
        ``cluster_num``, ``level``, ``total_dist``, ``count``, ``avg_dist``,
        ``samples_added`` (ids that were not yet known as clusters) and
        ``sample_dist`` (the distance of this merge).
    """
    clusters = {}
    next_cluster = 1
    final_row = len(arr) - 1

    for row_idx in range(len(arr)):
        merge_dist = arr[row_idx, 2]
        cluster_num = next_cluster
        total_dist = 0
        merge_count = 0
        fresh_samples = []
        # (cluster_num, level one above the child) for every child that is
        # already a known cluster, left child first.
        inherited = []

        for child_id in (arr[row_idx, 0], arr[row_idx, 1]):
            child = clusters.get(child_id)
            if child is None:
                fresh_samples.append(int(child_id))
                continue
            cluster_num = min(cluster_num, child["cluster_num"])
            total_dist += child["total_dist"]
            merge_count += child["count"]
            inherited.append((child["cluster_num"], max(1, child["level"] + 1)))

        if len(inherited) == 1:
            level = inherited[0][1]
        else:
            # With two known children the level follows the child whose
            # cluster number was kept; with none it stays at 1.
            level = next(
                (lvl for num, lvl in inherited if num == cluster_num),
                1,
            )

        total_dist += merge_dist
        merge_count += 1

        clusters[arr[row_idx, -1]] = {
            "cluster_num": cluster_num,
            "level": level,
            "total_dist": total_dist,
            "count": merge_count,
            "avg_dist": total_dist / merge_count,
            "samples_added": fresh_samples,
            "sample_dist": merge_dist,
        }

        # A brand-new cluster consumed the current number; hand out the next
        # one, except after the very last row.
        if cluster_num == next_cluster and row_idx < final_row:
            next_cluster += 1

    return clusters

In [None]:
def create_clusters_opt(arr):
    """Summarise every merge of an augmented linkage matrix.

    Each row of ``arr`` is read as: columns 0 and 1 are the ids of the two
    merged items, column 2 is the merge distance and the last column is the
    id given to the result of the merge.

    Returns:
        A tuple ``(clusters, max_levels, max_clusters)`` where ``clusters`` is
        a dict keyed by the last-column value of each row (fields:
        ``cluster_num``, ``level``, ``total_dist``, ``count``, ``avg_dist``,
        ``samples_added``, ``sample_dist``), ``max_levels`` is the highest
        level assigned and ``max_clusters`` is the final value of the running
        cluster counter.
    """
    clusters = {}
    max_levels = 1
    max_clusters = 1
    last_index = len(arr) - 1

    def level_above(child):
        # A merge sits one level above the cluster it extends.
        return max(1, child["level"] + 1)

    for index, row in enumerate(arr):
        left_id, right_id = int(row[0]), int(row[1])
        left, right = clusters.get(left_id), clusters.get(right_id)
        merge_dist = row[2]

        known = [child for child in (left, right) if child is not None]
        # Ids that are not clusters yet are raw samples entering at this merge.
        new_sample = [
            item_id
            for item_id, child in ((left_id, left), (right_id, right))
            if child is None
        ]

        cluster_num = min([max_clusters] + [child["cluster_num"] for child in known])
        distance = sum(child["total_dist"] for child in known) + merge_dist
        count = sum(child["count"] for child in known) + 1

        if left is not None and right is not None:
            # Two clusters merge: follow the one whose number survives.
            if cluster_num == left["cluster_num"]:
                level = level_above(left)
            elif cluster_num == right["cluster_num"]:
                level = level_above(right)
            else:
                level = 1
        elif known:
            level = level_above(known[0])
        else:
            level = 1

        clusters[row[-1]] = {
            "cluster_num": cluster_num,
            "level": level,
            "total_dist": distance,
            "count": count,
            "avg_dist": distance / count,
            "samples_added": new_sample,
            "sample_dist": merge_dist,
        }

        # A brand-new cluster consumed the current number; hand out the next
        # one, except after the very last row.
        if cluster_num == max_clusters and index < last_index:
            max_clusters += 1

        if level > max_levels:
            max_levels = level

    return clusters, max_levels, max_clusters

In [None]:
def reorganize_clusters(clusters):
    """
    Regroup the flat merge dictionary by cluster number and then by level.

    Entries are visited in the dictionary's iteration order. The first entry
    seen for a (cluster_num, level) pair defines that level; later entries for
    the same pair are ignored. A level starts from a copy of the samples of
    the level directly below it and is extended with the entry's own
    ``samples_added``.

    Parameters:
    - clusters: A dictionary whose values carry ``cluster_num``, ``level`` and
                (optionally) ``samples_added``.

    Returns:
    - new_structure: ``{cluster_num: {level: {"samples": [...]}}}``.

    Raises:
    - KeyError: when a level above 1 is reached before the level directly
                below it exists for that cluster.
    """
    nested = {}
    for info in clusters.values():
        cluster_num = info["cluster_num"]
        level = info["level"]
        added = info.get("samples_added", [])

        levels = nested.setdefault(cluster_num, {})
        if level in levels:
            continue

        if level == 1:
            levels[level] = {"samples": []}
        elif (level - 1) in levels:
            # Carry over everything collected one level below.
            levels[level] = {"samples": deepcopy(levels[level - 1]["samples"])}

        levels[level]["samples"].extend(added)

    return nested

### Code


In [None]:
def sort_linkage(Z):
    """
    Append a column with the id of the cluster created by each merge.

    Despite its name, this function does not reorder the rows: the result
    keeps the row order of ``Z``. The extra column holds consecutive ids
    starting at ``Z.shape[0] + 1``.

    Parameters:
    - Z: linkage matrix (2-D array)

    Returns:
    - arr: float array with the columns of ``Z`` followed by the id column
    """
    n_merges, n_cols = Z.shape

    arr = np.zeros((n_merges, n_cols + 1))
    arr[:, :n_cols] = Z
    arr[:, n_cols] = np.arange(n_merges, 2 * n_merges) + 1

    return arr


# Condensed pairwise Euclidean distances between the flattened images.
distance_matrix = pdist(data, metric="euclidean")
# Single-linkage hierarchical clustering on those distances.
Z = linkage(distance_matrix, method="single")

# Append the new-cluster id column to the linkage matrix.
# NOTE(review): the sorting inside sort_linkage is commented out, so the row
# order is exactly the one produced by linkage().
m_linkage = sort_linkage(Z)

In [None]:
# Per-merge summary plus the highest level and the final cluster counter.
sample_info, max_levels, max_clusters = create_clusters_opt(m_linkage)
# Display both counters as the cell output.
max_levels, max_clusters

In [None]:
"""
The last value in each row is the id of the cluster created by that merge; later rows refer to that id in columns 0 and 1.
For example,
the linkage function outputs 4 columns:
1. sample/cluster id
2. sample/cluster id
3. distance between the two items being merged
4. total number of samples in the new cluster (it does not give the new cluster id, so I wrote a function to add it as a fifth column).

Now with my function here is an example assuming 100 samples:
row 0 - 23, 45, 1.78, 2, 100
row 5 - 36, 100, 1.95, 3, 105
"""
# Display the linkage matrix including the appended cluster-id column.
m_linkage

In [None]:
# Regroup the per-merge summary by cluster number and level.
clusters = reorganize_clusters(sample_info)
clusters  # clusters : levels : samples

In [None]:
def get_duplicate(link_arr, distance):
    """Label a merge distance relative to the spread of all merge distances.

    Parameters:
    - link_arr: array of distances; only its standard deviation is used.
    - distance: the distance to classify.

    Returns:
    - "exact duplicate" when ``distance`` is at most a thousandth of the
      standard deviation, "near duplicate" when it is at most one standard
      deviation, otherwise an empty string.
    """
    spread = link_arr.std()
    # Thresholds are checked from strictest to loosest.
    thresholds = (
        (spread / 1e3, "exact duplicate"),
        (spread, "near duplicate"),
    )
    for limit, label in thresholds:
        if distance <= limit:
            return label
    return ""


def get_outlier(level, distance, dist_arr):
    """Classify a distance against the distances recorded for one level.

    Only the leading values of ``dist_arr[level]`` up to (not including) the
    first ``np.inf`` are considered; ``np.inf`` marks a slot that was never
    filled. A row without any ``np.inf`` is used in full.

    Parameters:
    - level: row index into ``dist_arr``.
    - distance: the distance to classify.
    - dist_arr: 2-D array with one row of distances per level.

    Returns:
    - "outlier" when ``distance`` is more than two standard deviations away
      from the mean of the recorded distances,
    - "potential outlier" when that is not the case but ``level`` lies in the
      top third of the levels,
    - an empty string otherwise.
    """
    row = dist_arr[level]

    # Find the end of the filled prefix. Defaulting to the full length keeps
    # the last value when the row holds no np.inf at all (it used to be
    # dropped) and keeps an empty row from failing.
    stop = len(row)
    for idx, value in enumerate(row):
        if value == np.inf:
            stop = idx
            break
    recorded = row[:stop]

    # Without recorded distances there is nothing to compare against; skip the
    # deviation test instead of taking the mean of an empty slice (NaN).
    if len(recorded) > 0:
        dist_mean, dist_std = recorded.mean(), recorded.std()
        if abs(dist_mean - distance) > dist_std * 2:
            return "outlier"

    if level >= dist_arr.shape[0] * 2 / 3:
        return "potential outlier"
    return ""

In [None]:
def get_distance(
    cluster: int,
    level: int,
    sample: int,
    distance_array: np.ndarray,
    distance_matrix: np.ndarray,
    clusters: dict,
) -> np.ndarray:
    """Update the per-cluster minimum distances seen from one sample.

    For every cluster other than ``cluster`` that has an entry at ``level``,
    the smallest distance between ``sample`` and that cluster's samples is
    folded into ``distance_array``.

    Parameters:
    - cluster: number of the cluster the sample belongs to (skipped).
    - level: level key looked up inside each cluster.
    - sample: index of the sample in the distance matrix.
    - distance_array: 1-D array with one slot per cluster; the slot of
      cluster number ``k`` is index ``k - 1``. Modified in place.
    - distance_matrix: condensed pairwise distance matrix.
    - clusters: ``{cluster_num: {level: {"samples": [...]}}}``.

    Returns:
    - ``distance_array`` (the same object that was passed in).
    """
    # Convert the condensed distance matrix to a square form
    square_distance_matrix = squareform(distance_matrix)
    print("Distance array:", distance_array)

    for cluster_id, levels in clusters.items():
        # Only compare samples on the same level, but different clusters
        if cluster_id == cluster or level not in levels:
            continue

        new_samples = levels[level]["samples"]
        if not new_samples:
            continue

        # Cluster numbers start at 1 while array slots start at 0; the calling
        # code writes its own cluster's distance at `cluster_num - 1`, so the
        # same mapping has to be used here.
        slot = cluster_id - 1
        nearest = square_distance_matrix[sample, new_samples].min()
        distance_array[slot] = min(distance_array[slot], nearest)

    return distance_array

In [None]:
# Per-sample bookkeeping: the cluster number the sample belongs to at each
# level, plus its duplicate and outlier labels.
sample_tracking = {
    i: {
        "cluster": np.zeros(max_levels),
        "duplicate": "",
        "outlier": "",
    }
    for i in range(len(m_linkage) + 1)
}

# One row per level and one column per cluster; np.inf marks an unset slot.
cluster_matrix = np.full((max_levels, max_clusters), np.inf)

# For each sample, compare distance for each sample to sample
# Ryan's code (with slight mods)
print(f"Max levels: {max_levels}")
print(f"Max clusters: {max_clusters}")
# Merging the samples together by moving them up levels and clusters
for sample_id, info in sample_info.items():
    # Only computing for added samples
    if not info["samples_added"]:
        continue

    # The current sample info (origin info)
    added_samples = info["samples_added"]
    current_cluster = info["cluster_num"]
    current_level = info["level"]
    current_dist = info["sample_dist"]
    is_duplicate = get_duplicate(
        m_linkage[:, 2],
        current_dist,
    )

    print("CURRENT DIST", current_dist)
    print(f"Merging samples {added_samples} to sample {sample_id}")
    # Positions to "move" sample into (levels and clusters are numbered from 1)
    merge_level = current_level - 1
    merge_cluster = current_cluster - 1

    # Check each added sample, update info
    for new_sample_id in added_samples:
        # Get new samples info
        new_sample_info = sample_tracking[new_sample_id]
        print(f"Level: {merge_level} | Cluster: {merge_cluster}")

        # As it moves up levels, track which cluster it was a part of at each level
        # Set the new sample's cluster at current level to origin sample's cluster
        new_sample_info["cluster"][merge_level] = current_cluster

        # Record the duplicate label; it used to be computed and then dropped,
        # which left every "duplicate" entry empty.
        new_sample_info["duplicate"] = is_duplicate

        # Set distance for new level and cluster to previous distance
        # (to prevent distance compare becoming 0)
        cluster_matrix[merge_level, merge_cluster] = current_dist

        # At this level, get the distances from all samples
        updated_distance_at_level = get_distance(
            cluster=current_cluster,
            level=current_level,
            sample=new_sample_id,
            distance_array=cluster_matrix[merge_level],
            distance_matrix=distance_matrix,
            clusters=clusters,
        )
        cluster_matrix[merge_level] = updated_distance_at_level
        outlier_status = get_outlier(
            merge_level,
            info["sample_dist"],
            cluster_matrix,
        )
        print(f"Outlier: {outlier_status}")
        new_sample_info["outlier"] = outlier_status
    print("\n>>>>>   <<<<<\n")

print(cluster_matrix)

In [None]:
"""
Refactored to use 1 cluster matrix, and a clusters per level dictionary
"""
# Start again from an empty (all np.inf) level x cluster matrix.
cluster_matrix = np.full((max_levels, max_clusters), np.inf)
# Square form of the condensed distances so pairs can be looked up as [i, j].
square_distance_matrix = squareform(distance_matrix)

print(cluster_matrix.shape)
print(square_distance_matrix.shape)

# Invert the nesting: level -> list of sample lists, one list per cluster that
# has an entry at that level. The position in the list follows the iteration
# order of `clusters`, not the cluster number.
level_clusters = {}
for cluster, level_info in clusters.items():
    for lvl, samples_dict in level_info.items():
        if lvl not in level_clusters:
            level_clusters[lvl] = []
        level_clusters[lvl].append(samples_dict["samples"])
"""
key: level
value: list of lists
    -> list of clusters where clusters are lists of samples in that cluster
ex.
level 1:
              cluster_0               cluster_1
    [[sample1, sample2, sample3], [sample4, sample5]]
"""


def get_sample_avg_dist(sample_id, sample_info):
    """
    Look up the average distance of the merge that introduced a sample.

    The entries of ``sample_info`` are scanned in order; the first one whose
    ``samples_added`` contains ``sample_id`` provides the ``avg_dist``.

    Raises:
    - IndexError: when no entry lists ``sample_id``.
    """
    not_found = object()
    avg_dist = next(
        (
            info["avg_dist"]
            for info in sample_info.values()
            if sample_id in info["samples_added"]
        ),
        not_found,
    )
    if avg_dist is not_found:
        raise IndexError("Sample not found in list")
    return avg_dist


# Get distance from each cluster to all others
# Take minimum distance of samples in a cluster
for level, clusters_of_samples in level_clusters.items():
    # Levels are numbered from 1; matrix rows are numbered from 0.
    level = level - 1
    print(f"\t\tLevel {level}")
    # A level with a single cluster has no other cluster to compare against,
    # so its first column is seeded with the average distance of the merge that
    # introduced the cluster's first sample.
    # NOTE(review): clusters_of_samples[0][0] fails if that sample list is empty.
    if len(clusters_of_samples) <= 1:
        cluster_matrix[level, 0] = get_sample_avg_dist(
            clusters_of_samples[0][0], sample_info
        )
    # Visit every unordered pair of clusters on this level exactly once.
    for cid1, samples1 in enumerate(clusters_of_samples):
        print("\tMain cluster:", cid1)
        for cid2, samples2 in enumerate(clusters_of_samples[cid1 + 1 :]):
            # Convert the offset inside the slice back to a position in the full list.
            cid2 = cid1 + cid2 + 1
            print("Second cluster:", cid2)
            print(samples1, samples2)

            for sample1 in samples1:
                for sample2 in samples2:
                    # A sample is at distance 0 from itself; ignore that pair.
                    if sample1 == sample2:
                        continue
                    v = square_distance_matrix[sample1, sample2]
                    print(sample1, sample2, "=>", v)
                    # The pair distance counts for both clusters: keep the
                    # smallest value seen so far in each cluster's column.
                    cluster_matrix[level, cid1] = min(cluster_matrix[level, cid1], v)
                    cluster_matrix[level, cid2] = min(cluster_matrix[level, cid2], v)

# Per-sample bookkeeping: the cluster number the sample belongs to at each
# level, plus its duplicate and outlier labels.
sample_tracking = {
    i: {
        "cluster": np.zeros(max_levels),
        "duplicate": "",
        "outlier": "",
    }
    for i in range(len(m_linkage) + 1)
}

for cluster_id, info in sample_info.items():
    # Only merges that introduce raw samples carry per-sample information.
    added_samples = info.get("samples_added")
    if not added_samples:
        continue

    current_cluster = info["cluster_num"]
    current_level = info["level"]
    current_dist = info["sample_dist"]
    is_duplicate = get_duplicate(
        m_linkage[:, 2],
        current_dist,
    )

    # Levels and clusters are numbered from 1; array positions start at 0.
    merge_level = current_level - 1
    merge_cluster = current_cluster - 1

    # The outlier label depends only on the level and the merge distance, so
    # it is the same for every sample added by this merge.
    outlier_status = get_outlier(
        merge_level,
        current_dist,
        cluster_matrix,
    )

    for sample_id in added_samples:
        sample_info_to_update = sample_tracking[sample_id]
        sample_info_to_update["cluster"][merge_level] = current_cluster
        # Record the duplicate label; it used to be computed and then dropped,
        # which left every "duplicate" entry empty.
        sample_info_to_update["duplicate"] = is_duplicate
        sample_info_to_update["outlier"] = outlier_status

In [None]:
# Display the level x cluster distance matrix (np.inf marks unset slots).
cluster_matrix

In [None]:
# Display the level -> list-of-sample-lists mapping.
level_clusters

In [None]:
# Display the cluster -> level -> samples mapping.
clusters

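Open questions:

- Where does a sample enter a cluster? The level at which it joins is what marks it as a potential outlier.
- If a sample never enters a cluster: how far is it from its nearest neighbor (NN), compared to how far clusters tend to be from other clusters?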


In [None]:
"""WIP"""


class Cluster:
    def __init__(
        self,
        cluster_id,
        cluster_num,
        level,
        total_dist,
        count,
        avg_dist,
        samples_added,
        sample_dist,
    ):
        self.cluster_id = cluster_id
        self.cluster_num = cluster_num
        self.level = level
        self.total_dist = total_dist
        self.count = count
        self.avg_dist = avg_dist
        self.samples_added = samples_added
        self.sample_dist = sample_dist