In [None]:
import matplotlib.pyplot as plt
import numpy as np

## Example: obviously clustered 2D data

* https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html

In [None]:
# Read the comma-separated points; columns 0 and 1 are used as x and y below.
data1 = np.genfromtxt("data1.csv", delimiter=",")

# Draw on an explicit Axes rather than through pyplot's implicit current axes.
fig, ax = plt.subplots()
ax.scatter(data1[:, 0], data1[:, 1])
ax.set_title("Clustered data")
plt.show()

### KMeans

* https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
* NB: the algorithm uses random sampling, so the result can be different each time it is run

In [None]:
from sklearn.cluster import KMeans

# Fit a 3-cluster k-means model; fit() returns the estimator itself, so
# construction and fitting can be chained into one expression.
kmeans = KMeans(n_clusters=3).fit(data1)

# One integer cluster label per row of data1.
clusters = kmeans.predict(data1)
print(clusters)

In [None]:
# Scatter of the first two columns, coloured by each point's cluster label.
fig, ax = plt.subplots()
ax.scatter(data1[:, 0], data1[:, 1], c=clusters)
ax.set_title("Clusters")
plt.show()

In [None]:
# Coordinates of the fitted cluster centres, taken from the k-means model.
centers = kmeans.cluster_centers_

fig, ax = plt.subplots()
ax.scatter(data1[:, 0], data1[:, 1], c=clusters)
# Overlay the centres as large, semi-transparent black markers.
ax.scatter(centers[:, 0], centers[:, 1], c="black", s=200, alpha=0.5)
ax.set_title("Extract centres")
plt.show()

In [None]:
fig, axs = plt.subplots(2, 2, layout="constrained")

# One panel per cluster count: the four axes get k = 2, 3, 4 and 5 in turn.
for num_clusters, ax in enumerate(axs.flat, start=2):
    kmeans = KMeans(n_clusters=num_clusters).fit(data1)
    clusters = kmeans.predict(data1)
    ax.scatter(data1[:, 0], data1[:, 1], c=clusters)
    ax.set_title(f"k={num_clusters} clusters")
fig.suptitle("Differing # clusters")
plt.show()

## Non-linear boundaries

In [None]:
# Second data set, read the same way: comma-separated, columns 0 and 1 plotted.
data2 = np.genfromtxt("data2.csv", delimiter=",")

fig, ax = plt.subplots()
ax.scatter(data2[:, 0], data2[:, 1])
plt.show()

In [None]:
fig, axs = plt.subplots(2, 2, layout="constrained")

# k-means on data2 for k = 2, 3, 4, 5, one panel each.
for num_clusters, ax in enumerate(axs.flat, start=2):
    kmeans = KMeans(n_clusters=num_clusters).fit(data2)
    clusters = kmeans.predict(data2)
    ax.scatter(data2[:, 0], data2[:, 1], c=clusters)
    ax.set_title(f"k={num_clusters} clusters")
plt.show()

## e.g.: Spectral Clustering
* https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html

In [None]:
from sklearn.cluster import SpectralClustering

# fit_predict() fits the model and returns one cluster label per row of data2.
model = SpectralClustering(n_clusters=3)
clusters = model.fit_predict(data2)

fig, ax = plt.subplots()
ax.scatter(data2[:, 0], data2[:, 1], c=clusters)
plt.show()

In [None]:
fig, axs = plt.subplots(2, 2, layout="constrained")

# Spectral clustering on data2 for k = 2, 3, 4, 5, one panel each.
for num_clusters, ax in enumerate(axs.flat, start=2):
    model = SpectralClustering(n_clusters=num_clusters)
    clusters = model.fit_predict(data2)
    ax.scatter(data2[:, 0], data2[:, 1], c=clusters)
    ax.set_title(f"k={num_clusters} clusters")
plt.show()

### Overlapping clusters: obviously less certain

In [None]:
# Stack both point sets row-wise, then keep each distinct row once.
# np.unique(..., axis=0) also returns the rows in sorted order.
data3 = np.unique(np.concatenate([data1, data2], axis=0), axis=0)

fig, ax = plt.subplots()
ax.scatter(data3[:, 0], data3[:, 1])
plt.show()

In [None]:
assign_labels = ["kmeans", "discretize", "cluster_qr"]

# Grid layout: one row per label-assignment strategy, one column per k (3, 4).
fig, axss = plt.subplots(3, 2, layout="constrained")

for strategy, axs in zip(assign_labels, axss):
    for num_clusters, ax in enumerate(axs, start=3):
        model = SpectralClustering(n_clusters=num_clusters, assign_labels=strategy)
        clusters = model.fit_predict(data3)
        ax.scatter(data3[:, 0], data3[:, 1], c=clusters)
        ax.set_title(f"k={num_clusters} clusters / {strategy}")
plt.show()

--------------------

## "Canonical example": Fisher's Iris data
* https://en.wikipedia.org/wiki/Iris_flower_data_set
* https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
* Widely used/available example

In [None]:
from sklearn import datasets

iris = datasets.load_iris()

# Class names, feature (column) names and the data matrix shape, one per line.
print(iris.target_names, iris.feature_names, iris.data.shape, sep="\n")

In [None]:
# Split the samples by class index (0, 1, 2).
# NOTE: the variable name `verisicolor` (sic) is kept unchanged so that any
# other cell referring to it keeps working; only the legend text is corrected.
setosa, verisicolor, virginica = [iris.data[iris.target == i] for i in range(3)]

# Legend labels come from iris.target_names, indexed the same way as
# iris.target, so the displayed species names cannot be misspelled by hand.
species_groups = (setosa, verisicolor, virginica)
species_markers = ("+", ".", "x")
for samples, name, marker in zip(species_groups, iris.target_names, species_markers):
    plt.scatter(samples[:, 0], samples[:, 2], label=name, marker=marker)
plt.legend()
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[2])
plt.show()

In [None]:
fig, ax = plt.subplots()

# Cluster on every column of iris.data; only two of them are drawn below.
model = SpectralClustering(n_clusters=3)
clusters = model.fit_predict(iris.data)

# The title names the estimator actually used here (SpectralClustering).
ax.set_title("Spectral clustering")

# Use one pair of column indices for both the data and the axis labels, so
# the labels always describe the columns that are really plotted.
x_col, y_col = 0, 2
ax.scatter(iris.data[:, x_col], iris.data[:, y_col], c=clusters)
ax.set(xlabel=iris.feature_names[x_col], ylabel=iris.feature_names[y_col])

# Optional overlay of the true species (uncomment to compare with the clusters):
# plt.scatter(setosa[:, 0], setosa[:, 2], label="setosa", marker="+", c="k")
# plt.scatter(
#     verisicolor[:, 0], verisicolor[:, 2], label="versicolor", marker=".", c="k"
# )
# plt.scatter(virginica[:, 0], virginica[:, 2], label="virginica", marker="x", c="k")
# plt.legend()
plt.show()

## Code for corner plot:

In [None]:
import scipy.stats as stats


def corner_plot(data, labels, title, colours=None, legends=None):
    """Draw a corner plot: pairwise scatter plots with histograms on the diagonal.

    Only the lower triangle of the grid is drawn; panels above the diagonal
    are hidden. Each diagonal panel shows a density-normalised histogram with
    a Gaussian KDE curve, titled with the variable's mean and sample standard
    deviation (ddof=1).

    Parameters
    ----------
    data : array-like, shape (n_samples, n_variables)
        One column per variable; column ``k`` is described by ``labels[k]``.
    labels : sequence of str
        Axis labels. Its length sets the number of rows/columns of the grid.
    title : str
        Figure-level title.
    colours : array-like, optional
        Passed as ``c`` to every scatter panel (e.g. one group id per sample).
    legends : sequence of str, optional
        Legend entries, paired in order with the handles returned by
        ``legend_elements()`` of a scatter panel. Ignored when no scatter
        panel exists (a single variable).

    Returns
    -------
    fig : matplotlib.figure.Figure
    axs : numpy.ndarray of Axes, shape (n_variables, n_variables)
    """
    data = np.asarray(data)
    num_variables = len(labels)

    # squeeze=False keeps `axs` two-dimensional even for a single variable,
    # so the axs[i, j] indexing below always works.
    fig, axs = plt.subplots(
        nrows=num_variables, ncols=num_variables, figsize=(7, 7), squeeze=False
    )

    # Handle of the most recent scatter panel; used to build the legend.
    scatter = None

    for i in range(num_variables):
        for j in range(num_variables):
            ax = axs[i, j]

            # Only plot unique lower triangle
            if j > i:
                ax.set_visible(False)
                continue

            if i != j:
                # Off-diagonal: scatter of variable j (x) against variable i (y)
                scatter = ax.scatter(data[:, j], data[:, i], c=colours)
            else:
                # Diagonal: x against itself would just be a straight line,
                # so it's common to plot a histogram (or box plot, etc.)
                column = data[:, i]
                ax.hist(column, density=True, alpha=0.4)
                x = np.linspace(column.min(), column.max(), 100)
                # Evaluate the KDE once and reuse it for line and fill.
                density = stats.gaussian_kde(column)(x)
                ax.plot(x, density, "b")
                ax.fill_between(x, density, alpha=0.6)
                ax.set_yticks([])

                # Title (mean and standard deviation) above diagonal elements
                mean = np.mean(column)
                sd = np.std(column, ddof=1)
                ax.set_title(f"{labels[j]}\n{mean:.1f}$\\pm${sd:.1f}", fontsize=8)

            # Add x labels only to last row
            if i == num_variables - 1:
                ax.set_xlabel(labels[j], fontsize=8)
            else:
                ax.set_xticks([])

            # Add y labels only to first column
            if j == 0:
                ax.set_ylabel(labels[i], fontsize=8)
            else:
                ax.set_yticks([])

    # Title the figure that was created here, not whichever is "current".
    fig.suptitle(title)
    fig.align_ylabels(axs[:, 0])
    # A legend needs a scatter panel to take its handles from.
    if legends is not None and scatter is not None:
        fig.legend(scatter.legend_elements()[0], legends, loc=(0.75, 0.75))
    return fig, axs

In [None]:
# Uncoloured corner plot of every iris feature column.
fig, axs = corner_plot(
    data=iris.data, labels=iris.feature_names, title="Fisher's Iris data"
)
plt.show()

In [None]:
# Corner plot coloured by the labels currently held in `clusters`.
fig, axs = corner_plot(
    iris.data,
    iris.feature_names,
    "Fisher's Iris data: Clusters",
    colours=clusters,
    legends=["Cluster 1", "Cluster 2", "Cluster 3"],
)
plt.show()

# Same grid, coloured by the recorded species of each sample for comparison.
fig, axs = corner_plot(
    iris.data,
    iris.feature_names,
    "Fisher's Iris data: Actual species",
    colours=iris.target,
    legends=iris.target_names.tolist(),
)
plt.show()