In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import math

# Location of the input CSV, resolved relative to the current working directory.
path_data = os.path.join(os.curdir, "customers.csv")

import builtins


def print(*args, **kwargs):
    """Notebook-wide ``print`` that joins its arguments with no separator by default.

    Shadows the built-in ``print`` for the rest of the notebook and forwards
    every argument to ``builtins.print``.

    Args:
        *args: objects to print.
        **kwargs: keyword arguments accepted by ``builtins.print``
            (``sep``, ``end``, ``file``, ``flush``). When ``sep`` is not
            given it defaults to ``""`` instead of a single space.

    Returns:
        Whatever ``builtins.print`` returns (``None``).
    """
    # Only supply the default: an explicit ``sep=...`` from the caller must win.
    kwargs.setdefault("sep", "")
    return builtins.print(*args, **kwargs)


# Notebook-wide matplotlib defaults: figure size and font family.
plt.rcParams.update(
    {
        "figure.figsize": (6, 4),
        "font.family": "STIXGeneral",
    }
)

### Lab Task

In [None]:
"""
Download your own CSV dataset from the internet, e.g. heatmap. Perform hierarchical clustering of your dataset and showcase the plots.
"""

# Importing the dataset
# Read the CSV located at `path_data` into the DataFrame `df`.
df = pd.read_csv(path_data)
# Last expression of the cell: preview the first rows of the frame.
df.head()

In [None]:
# Encode the "Gender" column with scikit-learn's LabelEncoder and write the
# result back into the same column of `df` (this overwrites the original values).
# NOTE(review): presumably the column holds string categories that become
# integer codes - confirm against the CSV.
le = preprocessing.LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])
# Preview the frame after encoding.
df.head()

In [None]:
# Draw every cell of `df` as a colour-coded heatmap on figure number 1.
plt.figure(num=1, figsize=(10, 6))
heatmap_axes = plt.gca()
heatmap_axes.set_title("Heatmap of the dataset")
sns.heatmap(data=df, ax=heatmap_axes)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
plt.title("Dendrograms")
# Build the Ward linkage over all columns of `df`, then draw its merge tree.
ward_linkage = sch.linkage(df, method="ward")
dend = sch.dendrogram(ward_linkage)
# Show the figure here, like the other plotting cells do, so that it does not
# stay the current figure and receive the scatter plot drawn later on.
plt.show()

### Agglomerative Clustering

In [None]:
def compute_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Coordinates are paired with ``zip``, so any extra trailing coordinates of
    the longer point are ignored.
    """
    squared_total = 0
    for coord_a, coord_b in zip(point1, point2):
        squared_total += (coord_a - coord_b) ** 2
    return math.sqrt(squared_total)


def min_link(cluster1, cluster2):
    """Single linkage: the smallest point-to-point distance between two clusters."""
    return min(
        compute_distance(member_a, member_b)
        for member_a in cluster1
        for member_b in cluster2
    )


def max_link(cluster1, cluster2):
    """Complete linkage: the largest point-to-point distance between two clusters."""
    return max(
        compute_distance(member_a, member_b)
        for member_a in cluster1
        for member_b in cluster2
    )


def avg_link(cluster1, cluster2):
    """Average linkage: the mean of all point-to-point distances between two clusters."""
    running_total = 0
    pair_count = 0
    for member_a in cluster1:
        for member_b in cluster2:
            running_total += compute_distance(member_a, member_b)
            pair_count += 1
    return running_total / pair_count


def select_distance_metric(metric):
    """Map a numeric code to a linkage function.

    0 -> ``min_link``, 1 -> ``max_link``, 2 -> ``avg_link``; any other value
    falls back to ``avg_link``.
    """
    linkage_by_code = {0: min_link, 1: max_link, 2: avg_link}
    try:
        return linkage_by_code[metric]
    except KeyError:
        return avg_link


class HierarchicalClustering:
    """Bottom-up (agglomerative) clustering by repeatedly merging the closest pair.

    Every data point starts in its own cluster. ``execute_clustering`` then
    merges the two clusters with the smallest linkage distance until only
    ``num_clusters`` clusters remain.

    Attributes:
        dataset: the sequence of data points given to the constructor.
        num_data_points: ``len(dataset)``.
        num_clusters: number of clusters at which merging stops.
        distance_metric: linkage function ``f(cluster_a, cluster_b) -> distance``
            obtained from ``select_distance_metric``.
        clusters: dict mapping cluster id -> list of member data points
            (the points themselves, not their indices in ``dataset``).
        cluster_id_counter: id that the next merged cluster will receive.
    """

    def __init__(self, dataset, num_clusters, metric):
        """Create one singleton cluster per data point.

        Args:
            dataset: sequence of data points.
            num_clusters: number of clusters to stop at.
            metric: code passed to ``select_distance_metric`` to choose the
                linkage function.
        """
        self.dataset = dataset
        self.num_data_points = len(dataset)
        self.num_clusters = num_clusters
        self.distance_metric = select_distance_metric(metric)
        self.clusters = self.initialize_clusters()
        # Initial ids are 0..len(dataset)-1, so merged clusters start after them.
        self.cluster_id_counter = len(self.dataset)

    def combine_clusters(self, cluster_i_id, cluster_j_id):
        """Return a new cluster dict in which clusters ``i`` and ``j`` are merged.

        The merged cluster gets a fresh id (``cluster_id_counter``, which is
        then incremented) and is placed first; all other clusters follow in
        their existing order. ``self.clusters`` itself is not modified.
        """
        combined_clusters = {
            self.cluster_id_counter: self.clusters[cluster_i_id]
            + self.clusters[cluster_j_id]
        }
        self.cluster_id_counter += 1

        for cluster_id, points in self.clusters.items():
            if cluster_id not in (cluster_i_id, cluster_j_id):
                combined_clusters[cluster_id] = points
        return combined_clusters

    def display_clusters(self):
        """Return a text listing of every cluster id followed by its points."""
        lines = []
        for cluster_id, points in self.clusters.items():
            lines.append(f"Cluster: {cluster_id}\n")
            lines.extend(f"    {point}\n" for point in points)
        return "".join(lines)

    def initialize_clusters(self):
        """Return ``{index: [point]}`` with one singleton cluster per data point."""
        return {
            data_index: [data_point]
            for data_index, data_point in enumerate(self.dataset)
        }

    def identify_nearest_clusters(self):
        """Return the ids ``(i, j)`` of the two clusters with the smallest distance.

        Ties keep the first pair encountered. Returns ``None`` when no pair
        has a distance below infinity, e.g. when fewer than two clusters exist.
        """
        min_distance = math.inf
        nearest_clusters = None

        cluster_ids = list(self.clusters)

        # Visit each unordered pair exactly once.
        for position, cluster_i in enumerate(cluster_ids[:-1]):
            for cluster_j in cluster_ids[position + 1:]:
                distance = self.distance_metric(
                    self.clusters[cluster_i], self.clusters[cluster_j]
                )
                if distance < min_distance:
                    min_distance, nearest_clusters = distance, (cluster_i, cluster_j)
        return nearest_clusters

    def execute_clustering(self):
        """Merge the nearest clusters until ``num_clusters`` remain.

        Raises:
            ValueError: if the target has not been reached but no pair of
                clusters can be merged (for example ``num_clusters < 1``).
        """
        while len(self.clusters) > self.num_clusters:
            nearest_clusters = self.identify_nearest_clusters()
            if nearest_clusters is None:
                # Previously this fell through to ``*None`` and raised an
                # opaque TypeError; report the actual problem instead.
                raise ValueError(
                    f"cannot reduce {len(self.clusters)} cluster(s) to "
                    f"{self.num_clusters}: no mergeable pair of clusters found"
                )
            self.clusters = self.combine_clusters(*nearest_clusters)


N = len(df)
# Cluster on columns 3 and 4 only; the axis labels below treat them as annual
# income and spending score (assumes that CSV column order - confirm).
X = df.iloc[:, [3, 4]].values
# Stop at 5 clusters; metric code 2 selects average linkage.
hc = HierarchicalClustering(X, 5, 2)
hc.execute_clustering()

y_hc = hc.clusters

# fmt: off
# Visualising the clusters
colors = ["red", "blue", "green", "purple", "orange"]
for i, cluster_id in enumerate(hc.clusters.keys()):
    # hc.clusters maps id -> list of data points (rows of X), not row indices,
    # so plot the stored coordinates directly instead of using them to index X.
    cluster_points = np.array(hc.clusters[cluster_id])
    plt.scatter(cluster_points[:, 0],
                cluster_points[:, 1],
                c=colors[i % len(colors)])
plt.title("Clusters of Customers (Hierarchical Clustering Model)")
plt.xlabel("Annual Income")
plt.ylabel("Spending Score")
plt.show()
# fmt: on