# Agglomerative Clustering

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Make Google Drive visible to this Colab runtime so the dataset stored
# under "My Drive" can be read with ordinary file paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## The Data

In [None]:
import pandas as pd

# CSV with the clustering dataset, read from the mounted Drive folder.
file_path = "/content/drive/My Drive/datasets/hclusters.csv"
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
0,0,0.239362,1.0,0.617571,0.456522,0.53615,0.238095,0.0,0.0,0.0,1.0
1,1,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,0.0,0.0,1.0
2,2,0.239362,1.0,0.645995,0.565217,0.51687,0.178571,0.0,0.0,0.0,1.0
3,3,0.18617,1.0,0.609819,0.565217,0.516019,0.238095,0.0,0.0,0.0,1.0
4,4,0.212766,1.0,0.604651,0.51087,0.520556,0.14881,0.0,0.0,0.0,1.0


In [None]:
# The loaded frame carries two leftover index columns ("Unnamed: 0.1" and
# "Unnamed: 0"). They are row counters, not features, so all of them are
# removed before clustering; dropping only one would leave a row counter
# among the clustered columns.
# Building the list from the current columns (instead of a hard-coded name
# with inplace=True) keeps the cell idempotent: re-running it is a no-op
# rather than a KeyError.
unnamed_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
df = df.drop(columns=unnamed_cols)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
0,0.239362,1.0,0.617571,0.456522,0.53615,0.238095,0.0,0.0,0.0,1.0
1,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,0.0,0.0,1.0
2,0.239362,1.0,0.645995,0.565217,0.51687,0.178571,0.0,0.0,0.0,1.0
3,0.18617,1.0,0.609819,0.565217,0.516019,0.238095,0.0,0.0,0.0,1.0
4,0.212766,1.0,0.604651,0.51087,0.520556,0.14881,0.0,0.0,0.0,1.0


## Using Scikit-Learn

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

**metric : str or callable, default=”euclidean”**
Metric used to compute the linkage. Can be “euclidean”, “l1”, “l2”, “manhattan”, “cosine”, or “precomputed”. If linkage is “ward”, only “euclidean” is accepted. If “precomputed”, a distance matrix is needed as input for the fit method. If connectivity is None, linkage is “single”, and metric is not “precomputed”, any valid pairwise distance metric can be assigned.


**linkage : {‘ward’, ‘complete’, ‘average’, ‘single’}, default=’ward’**

Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observations. The algorithm will merge the pairs of clusters that minimize this criterion.

‘ward’ minimizes the variance of the clusters being merged.

‘average’ uses the average of the distances of each observation of the two sets.

‘complete’ or ‘maximum’ linkage uses the maximum distances between all observations of the two sets.

‘single’ uses the minimum of the distances between all observations of the two sets.

In [None]:
!pip install --upgrade scikit-learn



In [None]:
from itertools import product

from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import numpy as np

# Grid search over (n_clusters, metric, linkage) for agglomerative clustering,
# keeping the combination with the highest silhouette score.

# Standardise every column of `df` before computing distances.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

s = []  # silhouette score of every successfully fitted candidate, in evaluation order
linkage_list = ["average", "single"]
metric_list = ["euclidean", "manhattan", "cosine"]
n_clusters_range = np.arange(2, 20)

best_score = -1
best_n = None
best_linkage = None
best_metric = None

# product() visits candidates in the same order as three nested loops
# (n_clusters outermost, linkage innermost), so ties resolve as before.
for n_clusters, metric, linkage in product(n_clusters_range, metric_list, linkage_list):
    try:
        # Both "average" and "single" linkage accept any of the listed
        # metrics, so the metric is always forwarded. Previously it was
        # dropped for "single", which refitted the same Euclidean model once
        # per metric and never evaluated manhattan/cosine single linkage.
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage, metric=metric)
        labels = model.fit_predict(df_scaled)

        # The silhouette is undefined for a single cluster.
        if len(set(labels)) > 1:
            # Scored with silhouette_score's default metric on the same
            # scaled data for every candidate, independent of the metric
            # used to build the clusters.
            silhouette = silhouette_score(df_scaled, labels)
            s.append(silhouette)

            if silhouette > best_score:
                best_score = silhouette
                best_n = n_clusters
                best_linkage = linkage
                best_metric = metric

    except Exception as e:
        # Best-effort search: report the failing combination and move on.
        print(f"Skipping n_clusters={n_clusters}, linkage={linkage}, metric={metric} due to error: {e}")

# Print the best model
print("Best Agglomerative Clustering Model:")
print(f"- Number of clusters: {best_n}")
print(f"- Linkage: {best_linkage}")
print(f"- Metric: {best_metric}")
print(f"- Best Silhouette Score: {best_score:.4f}")


Best Agglomerative Clustering Model:
- Number of clusters: 4
- Linkage: average
- Metric: euclidean
- Best Silhouette Score: 0.4417


In [None]:
# Two-cluster model using average linkage over Euclidean distances.
# NOTE(review): the grid-search output above reports 4 clusters as best;
# confirm that n_clusters=2 is intentional here.
model = AgglomerativeClustering(
    linkage="average",
    metric="euclidean",
    n_clusters=2,
)

In [None]:
# Fit the model and keep the per-row cluster assignment.
# NOTE(review): this fits on `df`, whereas the grid search used the
# standardised `df_scaled` — confirm which input is intended.
cluster_labels = model.fit(df).labels_

In [None]:
# Show how many rows fall into each cluster instead of dumping the whole
# label array, which is long and hard to read.
pd.Series(cluster_labels, name="cluster").value_counts().sort_index()

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,

In [None]:
# Silhouette score of the average-linkage labels on `df`
# ("euclidean" is silhouette_score's default metric, spelled out here).
silhouette_score(X=df, labels=cluster_labels, metric="euclidean")

0.47745879403803576

In [None]:
# Same two-cluster, Euclidean setup, but with "complete" linkage.
model = AgglomerativeClustering(
    linkage="complete",
    metric="euclidean",
    n_clusters=2,
)
cluster_labels = model.fit(df).labels_

In [None]:
# Silhouette score of the complete-linkage labels on `df`
# ("euclidean" is silhouette_score's default metric, spelled out here).
silhouette_score(X=df, labels=cluster_labels, metric="euclidean")

0.47745879403803576