In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the USArrests dataset, relative to this notebook.
usa_arrests_csv = "../data/USArrests.csv"

# Let pandas open (and close) the file itself: read_csv accepts a path and
# decodes it as UTF-8 by default, whereas a bare open(..., "r") uses the
# platform's locale encoding and adds nothing here.
# NOTE(review): the CSV's unnamed first column is loaded as "Unnamed: 0"
# (visible in the head() output); it is not selected for clustering.
usa_arrests_df = pd.read_csv(usa_arrests_csv)

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
usa_arrests_df.head()

Unnamed: 0,State,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


In [4]:
# Keep every row, but only the four measurement columns used as clustering
# features (the state name and the leftover index column are dropped).
X = usa_arrests_df.loc[:, ['Murder', 'Assault', 'UrbanPop', 'Rape']]

In [5]:
# Scale the data so that each feature is on a comparable scale before
# Euclidean distances are computed on it.
# NOTE(review): hierarchical_clustering indexes its input with lists of row
# numbers, so X_scaled is presumably a NumPy array here (scikit-learn's
# default output) — confirm if set_output is configured elsewhere.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
def euclidean_distance(p, q):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    p, q : array-like
        Coordinates of the two points. They must have the same shape, or be
        broadcastable against each other.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((q - p) ** 2))`` computed in floating point.
    """
    # Convert to float arrays first: plain lists/tuples do not support
    # ``q - p``, and unsigned-integer arrays would silently wrap around on
    # the subtraction and on the squaring.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    return np.sqrt(np.sum((q - p) ** 2))

In [17]:
def min_similarity(cluster1, cluster2):
    """Return the smallest pairwise Euclidean distance between two clusters.

    Every point of ``cluster1`` is compared with every point of ``cluster2``
    using ``euclidean_distance``. Despite the name, the result is a distance:
    smaller means the clusters are closer. ``np.inf`` is returned when there
    is no pair to compare.
    """
    closest = np.inf
    pair_distances = (
        euclidean_distance(a, b) for a in cluster1 for b in cluster2
    )
    for d in pair_distances:
        # min() only switches to ``d`` when ``d < closest`` (strict), so the
        # first smallest value encountered is the one that is kept.
        closest = min(closest, d)
    return closest

In [18]:
def max_similarity(cluster1, cluster2):
    """Return the largest pairwise Euclidean distance between two clusters.

    Every point of ``cluster1`` is compared with every point of ``cluster2``
    using ``euclidean_distance``. Despite the name, the result is a distance.
    ``0`` is returned when no pair has a distance greater than zero, which
    includes the case where either cluster is empty.
    """
    farthest = 0
    for a in cluster1:
        row_distances = (euclidean_distance(a, b) for b in cluster2)
        for d in row_distances:
            # max() only switches to ``d`` when ``d > farthest`` (strict).
            farthest = max(farthest, d)
    return farthest

In [19]:
def average_similarity(cluster1, cluster2):
    """Return the mean pairwise Euclidean distance between two clusters.

    The distance from every point of ``cluster1`` to every point of
    ``cluster2`` is computed with ``euclidean_distance`` and the values are
    averaged. Despite the name, the result is a distance.

    Raises ``ZeroDivisionError`` when there is no pair to average, i.e. when
    either cluster is empty.
    """
    pair_distances = [
        euclidean_distance(a, b) for a in cluster1 for b in cluster2
    ]
    # Accumulate left to right, starting from the integer 0.
    running_sum = 0
    for d in pair_distances:
        running_sum += d
    return running_sum / len(pair_distances)

In [25]:
def hierarchical_clustering(data, similarity_measure):
    """Agglomerative (bottom-up) hierarchical clustering, one merge per step.

    Every row of ``data`` starts in its own cluster. At each step the two
    clusters for which ``similarity_measure`` returns the smallest value are
    merged, until a single cluster remains.

    Parameters
    ----------
    data : array-like
        One observation per row. It is converted with ``np.asarray`` so that
        DataFrames and nested lists work as well as NumPy arrays.
    similarity_measure : callable
        Called as ``similarity_measure(points_a, points_b)`` with the rows of
        two clusters; must return a number. Smaller values are merged first,
        so it is treated as a distance.

    Yields
    ------
    list of list of int
        The clusters after each merge, as lists of row indices into ``data``.
        Surviving clusters keep their relative order and the merged cluster is
        appended last. Each yielded value is an independent copy, so callers
        may modify it freely. Nothing is yielded for fewer than two rows.

    Raises
    ------
    ValueError
        If no pair of clusters yields a value below infinity (for example
        because the data contains NaN), so no merge can be chosen.
    """
    # List-of-indices selection below needs ndarray semantics; a DataFrame or
    # a nested list would otherwise fail on ``data[[...]]``.
    data = np.asarray(data)
    clusters = [[i] for i in range(len(data))]  # Each data point is initially a cluster

    while len(clusters) > 1:
        min_distance = np.inf
        min_indices = None

        # Exhaustive search over all unordered cluster pairs; the strict "<"
        # keeps the first pair found when several pairs tie.
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                distance = similarity_measure(data[clusters[i]], data[clusters[j]])
                if distance < min_distance:
                    min_distance = distance
                    min_indices = (i, j)

        if min_indices is None:
            # NaN never compares below infinity, so this is reached when the
            # measure returns NaN/inf for every pair.
            raise ValueError(
                "similarity_measure returned no finite value for any pair of "
                "clusters; check the data for NaN or infinite entries"
            )

        i, j = min_indices
        merged = clusters[i] + clusters[j]  # Merge clusters
        clusters = [cluster for idx, cluster in enumerate(clusters) if idx not in (i, j)]
        clusters.append(merged)

        # Hand out a copy: the caller must not be able to alter the state the
        # next iteration depends on.
        yield [list(cluster) for cluster in clusters]

In [26]:
# Perform hierarchical clustering with Min similarity measure (clusters are
# compared by their smallest pairwise distance). list() runs the generator to
# completion and keeps the cluster assignment produced after every merge.
min_clusters = list(hierarchical_clustering(X_scaled, min_similarity))

In [27]:
# Perform hierarchical clustering with Max similarity measure (clusters are
# compared by their largest pairwise distance). list() runs the generator to
# completion and keeps the cluster assignment produced after every merge.
max_clusters = list(hierarchical_clustering(X_scaled, max_similarity))

In [28]:
# Perform hierarchical clustering with Average similarity measure (clusters
# are compared by their mean pairwise distance). list() runs the generator to
# completion and keeps the cluster assignment produced after every merge.
average_clusters = list(hierarchical_clustering(X_scaled, average_similarity))

In [29]:
# Report how many clusters remain after each merge; iterations are shown
# 1-based, and every merge reduces the count by one.
print("Min similarity measure:")
for i, clusters in enumerate(min_clusters):
    print(f"Iteration {i + 1}: Number of clusters = {len(clusters)}")

Min similarity measure:
Iteration 1: Number of clusters = 49
Iteration 2: Number of clusters = 48
Iteration 3: Number of clusters = 47
Iteration 4: Number of clusters = 46
Iteration 5: Number of clusters = 45
Iteration 6: Number of clusters = 44
Iteration 7: Number of clusters = 43
Iteration 8: Number of clusters = 42
Iteration 9: Number of clusters = 41
Iteration 10: Number of clusters = 40
Iteration 11: Number of clusters = 39
Iteration 12: Number of clusters = 38
Iteration 13: Number of clusters = 37
Iteration 14: Number of clusters = 36
Iteration 15: Number of clusters = 35
Iteration 16: Number of clusters = 34
Iteration 17: Number of clusters = 33
Iteration 18: Number of clusters = 32
Iteration 19: Number of clusters = 31
Iteration 20: Number of clusters = 30
Iteration 21: Number of clusters = 29
Iteration 22: Number of clusters = 28
Iteration 23: Number of clusters = 27
Iteration 24: Number of clusters = 26
Iteration 25: Number of clusters = 25
Iteration 26: Number of clusters = 

In [30]:
# Report how many clusters remain after each merge; iterations are shown
# 1-based, and every merge reduces the count by one.
print("\nMax similarity measure:")
for i, clusters in enumerate(max_clusters):
    print(f"Iteration {i + 1}: Number of clusters = {len(clusters)}")


Max similarity measure:
Iteration 1: Number of clusters = 49
Iteration 2: Number of clusters = 48
Iteration 3: Number of clusters = 47
Iteration 4: Number of clusters = 46
Iteration 5: Number of clusters = 45
Iteration 6: Number of clusters = 44
Iteration 7: Number of clusters = 43
Iteration 8: Number of clusters = 42
Iteration 9: Number of clusters = 41
Iteration 10: Number of clusters = 40
Iteration 11: Number of clusters = 39
Iteration 12: Number of clusters = 38
Iteration 13: Number of clusters = 37
Iteration 14: Number of clusters = 36
Iteration 15: Number of clusters = 35
Iteration 16: Number of clusters = 34
Iteration 17: Number of clusters = 33
Iteration 18: Number of clusters = 32
Iteration 19: Number of clusters = 31
Iteration 20: Number of clusters = 30
Iteration 21: Number of clusters = 29
Iteration 22: Number of clusters = 28
Iteration 23: Number of clusters = 27
Iteration 24: Number of clusters = 26
Iteration 25: Number of clusters = 25
Iteration 26: Number of clusters =

In [31]:
# Report how many clusters remain after each merge; iterations are shown
# 1-based, and every merge reduces the count by one.
print("\nAverage similarity measure:")
for i, clusters in enumerate(average_clusters):
    print(f"Iteration {i + 1}: Number of clusters = {len(clusters)}")


Average similarity measure:
Iteration 1: Number of clusters = 49
Iteration 2: Number of clusters = 48
Iteration 3: Number of clusters = 47
Iteration 4: Number of clusters = 46
Iteration 5: Number of clusters = 45
Iteration 6: Number of clusters = 44
Iteration 7: Number of clusters = 43
Iteration 8: Number of clusters = 42
Iteration 9: Number of clusters = 41
Iteration 10: Number of clusters = 40
Iteration 11: Number of clusters = 39
Iteration 12: Number of clusters = 38
Iteration 13: Number of clusters = 37
Iteration 14: Number of clusters = 36
Iteration 15: Number of clusters = 35
Iteration 16: Number of clusters = 34
Iteration 17: Number of clusters = 33
Iteration 18: Number of clusters = 32
Iteration 19: Number of clusters = 31
Iteration 20: Number of clusters = 30
Iteration 21: Number of clusters = 29
Iteration 22: Number of clusters = 28
Iteration 23: Number of clusters = 27
Iteration 24: Number of clusters = 26
Iteration 25: Number of clusters = 25
Iteration 26: Number of cluste