# Clustering

## Generate data

In [None]:
import matplotlib.pyplot as plt

from sklearn.cluster import kmeans_plusplus
from sklearn.datasets import make_blobs

# Synthetic dataset for the first part of the notebook
n_samples = 4000
n_cluster = 4

# random_state is fixed so that every run of the notebook draws the same
# points; y receives the labels returned by make_blobs alongside X
X, y = make_blobs(
    n_samples=n_samples,
    centers=n_cluster,
    cluster_std=0.60,
    random_state=42,
)

In [None]:
# First two columns of X as coordinates, each point colored by its label in y
plt.scatter(x=X[:, 0], y=X[:, 1], c=y, s=25, marker="o", edgecolor="k")

In a real clustering problem we don't have the labels, so all we can actually plot is something like this:

In [None]:
# Same coordinates as before, but without passing the labels as colors
plt.scatter(x=X[:, 0], y=X[:, 1], marker="o")

In reality the situation can be much more complex as we will see later.

## k-means with 2 Clusters

In [None]:
from sklearn.cluster import KMeans

# Fit with KMeans with 2 clusters


In [None]:
# Plot the data points color-labeled by the cluster they belong

# Plot the centers


## Evaluation Metric

### Extrinsic Measures

These measures require ground truth labels, which may not be available in practice

#### Rand Index

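The Rand index measures the similarity between two clusterings by considering all pairs of samples and counting the pairs that are treated consistently (placed in the same cluster in both clusterings, or in different clusters in both). It ranges from 0 to 1, whereas the Adjusted Rand Index (ARI) is corrected for chance and can be negative (scikit-learn documents its range as -0.5 to 1).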



In [None]:
from sklearn.metrics import rand_score

# Apply rand_score to your results


#### Mutual Information

Mutual Information between two clusterings is a measure of the similarity between two labelings of the same data. Here it is used to compare the ground-truth labels with the labels predicted by the model.

In [None]:
from sklearn.metrics import mutual_info_score

# Apply mutual information to your results


### Intrinsic Measures

These measures do not require ground truth labels (applicable to all unsupervised learning results)

#### Silhouette Score aka Silhouette Coefficient
The Silhouette score (aka Silhouette Coefficient) is an evaluation metric that ranges from -1 to 1. A score near 1 is the best value: it means the data point is very compact within the cluster to which it belongs and far away from the other clusters. A score near -1 is the worst value: it suggests the data point has been assigned to the wrong cluster. A score near 0 signifies overlapping clusters.

In [None]:
from sklearn.metrics import silhouette_score

# Apply silhouette_score to your results


#### Davies-Bouldin Index
Davies-Bouldin Index score is defined as the average similarity measure of each cluster with its most similar cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. Thus, clusters that are farther apart and less dispersed will result in a better score. The minimum score is 0, with lower values indicating better clustering.

In [None]:
from sklearn.metrics import davies_bouldin_score

# Apply davies_bouldin_score to your results


### Elbow Method

You can easily run K-Means for a range of cluster counts using a for loop, collecting the distortion of each run into a list.

You can collect the distortions using the `inertia_` attribute. `inertia_` is the sum of squared distances of samples to their closest cluster center, weighted by the sample weights if provided.

Then plot the distortions against the number of clusters.

In [None]:
# Create an elbow plot with the instructions above. What is the optimal k?



### Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# Create a silhouette score plot with the instructions above. What is the optimal k?


## k-means with 4 Clusters

In [None]:
# Repeat the fit with k = 4


In [None]:
# Plot the colored clusters and their centers


### Metric Comparison

In [None]:
# Compare the four metrics


## More Complex situation

In [None]:
import matplotlib.pyplot as plt

from sklearn.cluster import kmeans_plusplus
from sklearn.datasets import make_circles

# Second dataset, replacing the X and y defined earlier in the notebook
n_samples = 4000
n_cluster = 4  # NOTE: not passed to the make_circles call below

# Make circles without noise (no noise argument is passed); random_state is
# fixed so that every run draws the same points
X, y = make_circles(n_samples=n_samples, random_state=42)

# Color each point by the label returned in y
plt.scatter(x=X[:, 0], y=X[:, 1], c=y, s=25, marker="o", edgecolor="k")

## k-means with 2 Clusters

In [None]:
# Fit kmeans with k = 2


In [None]:
# Plot the colored clusters and centers


## Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering

# Fit with spectral clustering


In [None]:
# Plot the clusters


## Hierarchical Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Fit with Agglomerative Clustering


In [None]:
# Plot the clusters


### Plot Dendrogram

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_iris


def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram described by a fitted hierarchical clustering model.

    A linkage matrix is assembled from the model and handed to
    ``scipy.cluster.hierarchy.dendrogram``. Each row of that matrix holds the
    two merged node ids, the merge distance and the number of original
    samples contained in the newly formed node.

    Parameters
    ----------
    model
        Object exposing ``children_`` (one row of two node ids per merge),
        ``distances_`` (one distance per merge) and ``labels_`` (one entry
        per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram``.

    Returns
    -------
    None
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Node ids below n_leaves are original samples (size 1); an id of
    # n_leaves + k refers to the node created by merge k, whose size has
    # already been stored by the time a later merge references it.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in pair
        )

    columns = [merges, model.distances_, subtree_sizes]
    linkage_matrix = np.column_stack(columns).astype(float)

    # Hand the assembled matrix to scipy for the actual drawing
    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Fit with AgglomerativeClustering. Setting distance_threshold=0 ensures we compute the full tree.


In [None]:
# Plot the dendrogram
