In [None]:
import os
import sys
from functools import partial

import ase
import numpy as np
from ase.io import read, write
from matplotlib import pyplot as plt
import matplotlib as mpl
from openTSNE import TSNE
from rascal.representations import SphericalInvariants as SOAP
from skcosmo.preprocessing import StandardFlexibleScaler
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.preprocessing import normalize
import sklearn.cluster as cluster
import seaborn as sns
from tqdm.auto import tqdm
import hdbscan

from utils import set_mpl_fonts, set_cmap

# Plot styling helpers imported from the local `utils` module.
# NOTE(review): `utils` is not visible here — presumably `set_mpl_fonts()`
# adjusts matplotlib's font settings and `set_cmap()` returns the colormap
# used by the figures below; confirm against utils.py.
set_mpl_fonts()
cmap = set_cmap()

### SOAP descriptors 

In [None]:
# Subsampling stride: only every `n_skip`-th row of the large arrays is kept
# so that the clustering cells below finish in reasonable time.
n_skip = 5

# Labels of the five reference conformers.
# NOTE(review): presumably in the same order as the rows of the `*_conf`
# arrays — confirm against `Dimensionality_Reduction_Cyclohexanes.ipynb`.
names = [
    'chair',
    'twist-boat',
    'boat',
    'half-chair',
    'planar',
]

# Archive written by `Dimensionality_Reduction_Cyclohexanes.ipynb`.
data = np.load('./cyclohexane_data/soap_vectors.npz')
conf_mean_soaps = data['conf_mean_soaps']  # kept whole (not subsampled)
mean_soaps = data['mean_soaps'][slice(None, None, n_skip)]

# Cell output: shapes of the two arrays that were just loaded.
(mean_soaps.shape, conf_mean_soaps.shape)

### Load Embeddings

In [None]:
def _load_embedding(name, directory='cyclohexane_data'):
    """Load one embedding archive saved by `Dimensionality_Reduction_Cyclohexanes.ipynb`.

    Parameters
    ----------
    name : str
        Base name of the embedding. The file read is ``{directory}/{name}.npz``
        and the arrays read from it are ``name`` and ``f'{name}_conf'``.
    directory : str
        Folder that holds the ``.npz`` archives.

    Returns
    -------
    tuple
        ``(archive, embedding, conf_embedding)`` where ``archive`` is the
        object returned by ``np.load``, ``embedding`` is ``archive[name]``
        keeping every ``n_skip``-th row (so it lines up with ``mean_soaps``),
        and ``conf_embedding`` is ``archive[f'{name}_conf']`` unsliced.
    """
    archive = np.load(f'{directory}/{name}.npz')
    return archive, archive[name][::n_skip], archive[f'{name}_conf']


# All four archives share the same layout, so one helper replaces four
# copy-pasted load stanzas; the resulting global names are unchanged.
pca_data, t_pca, t_pca_conf = _load_embedding('pca')
tsne_data, t_tsne, t_tsne_conf = _load_embedding('tsne')
umap_data, t_umap, t_umap_conf = _load_embedding('umap')
pcovr_data, t_pcovr, t_pcovr_conf = _load_embedding('pcovr')

In [None]:
# Overview figure: one panel per embedding, plotting column 0 against column 1.
fig, axes = plt.subplots(1, 4, figsize=(12, 3))

for ax, rep, rep_name in zip(axes, [t_pca, t_tsne, t_umap, t_pcovr],
                             ['PCA', 't-SNE', 'UMAP', "PCovR"]):

    # No per-point values (`c=`) are supplied, so there is nothing for a
    # colormap to map: passing `cmap` here was ignored by matplotlib and only
    # produced a "No data for colormapping provided" warning.
    ax.scatter(rep[:, 0], rep[:, 1], s=1)

    ax.set_title(rep_name, fontsize=12)
    # The embedding coordinates are plotted without tick marks.
    ax.set_xticks([])
    ax.set_yticks([])

# Leave head-room for the figure-level title.
fig.subplots_adjust(top=0.8)
fig.suptitle('Mappings from `Dimensionality_Reduction_Cyclohexanes.ipynb`')
plt.show()

## Clustering

In [None]:
def _cluster_color_scale(labels):
    """Build a discrete color scale with one bin per integer cluster label.

    The scale spans ``min(labels)`` to ``max(labels)``, so a negative label
    (e.g. the -1 that DBSCAN/HDBSCAN assign to noise) gets its own color
    instead of falling below the normalization range.

    Returns
    -------
    tuple
        ``(cmap, norm, ticks)`` suitable for ``scatter`` and ``colorbar``.
    """
    lowest, highest = int(np.min(labels)), int(np.max(labels))
    n_bins = highest - lowest + 1

    if n_bins < 10:
        label_cmap = plt.get_cmap('tab10')
    elif n_bins < 20:
        label_cmap = plt.get_cmap('tab20')
    else:
        label_cmap = plt.get_cmap('magma')

    if n_bins < 256:
        # Half-integer edges put every integer label at the center of a bin.
        boundaries = np.arange(lowest - 0.5, highest + 1.5)
    else:
        # BoundaryNorm cannot have more bins than colors, so neighbouring
        # labels share a bin while the scale still covers the full range.
        boundaries = np.linspace(lowest - 0.5, highest + 0.5, label_cmap.N)

    norm = mpl.colors.BoundaryNorm(boundaries, label_cmap.N)
    ticks = list(range(lowest, highest + 1)[::max(1, n_bins // 10)])
    return label_cmap, norm, ticks


def plot_clusters(
    algorithms,
    algorithm_names=[''],
    reps=[mean_soaps, t_pca, t_tsne, t_umap, t_pcovr],
    rep_names=[
        'SOAP Vectors\n(Plotted along PC1&2)', 'PCA', 't-SNE', 'UMAP', "PCovR"
    ],
    algorithm_kw={},
):
    """Cluster each representation with each algorithm and plot the result.

    Every (algorithm, representation) pair gets one scatter panel colored by
    cluster label, with a horizontal colorbar underneath.

    Parameters
    ----------
    algorithms : sequence of classes
        Clusterer classes; each is instantiated as ``cls(**kwargs)`` and must
        provide ``fit_predict``.
    algorithm_names : sequence of str
        Display names, one per algorithm. Missing entries fall back to the
        class ``__name__``.
    reps : sequence of arrays
        Data passed to ``fit_predict``. The defaults are evaluated once, when
        this function is defined, from the arrays loaded earlier.
    rep_names : sequence of str
        Display names, one per representation. A representation whose name
        contains ``'SOAP'`` is clustered on its own values but drawn at the
        ``t_pca`` coordinates.
    algorithm_kw : dict or sequence of dict
        Constructor keyword arguments. A sequence gives one dict per
        algorithm (missing entries mean "no arguments"); a single dict is
        applied to every algorithm.

    Returns
    -------
    dict
        Maps ``(rep_name, algorithm_name)`` to the label array returned by
        ``fit_predict``, unmodified (noise labels such as -1 are kept).
    """
    n_alg, n_rep = len(algorithms), len(reps)

    # Normalize the per-algorithm options to exactly one entry per algorithm.
    # Previously the `{}` default (or a short list) made `zip` stop early, so
    # algorithms were silently skipped and the figure stayed empty. The default
    # arguments themselves are never mutated.
    if isinstance(algorithm_kw, dict):
        algorithm_kw = [dict(algorithm_kw) for _ in range(n_alg)]
    else:
        algorithm_kw = list(algorithm_kw)
        algorithm_kw += [{} for _ in range(n_alg - len(algorithm_kw))]

    algorithm_names = list(algorithm_names)
    algorithm_names += [
        getattr(alg, '__name__', str(alg))
        for alg in algorithms[len(algorithm_names):]
    ]

    all_labels = {}

    if n_rep > 1:
        fig, axes = plt.subplots(n_alg,
                                 n_rep,
                                 figsize=(n_rep * 5, n_alg * 5))
    elif n_alg > 1:
        # Single representation: lay the algorithms out side by side.
        fig, axes = plt.subplots(1, n_alg, figsize=(n_alg * 5, 5))
    else:
        fig, axes = plt.subplots(1, 1, figsize=(6, 8))
        axes = np.array([axes])

    # One row of axes per algorithm, one column per representation.
    axes = np.reshape(axes, (n_alg, n_rep))

    for ax_row, clustering_algorithm, cam_name, kwds in zip(
            axes, algorithms, algorithm_names, algorithm_kw):
        for ax, rep, rep_name in zip(ax_row, reps, rep_names):
            # The labels are stored exactly as returned. The former
            # `np.maximum(labels, 0)` discarded its result and was removed;
            # clamping would have merged noise points into cluster 0.
            labels = clustering_algorithm(**kwds).fit_predict(rep)
            all_labels[(rep_name, cam_name)] = labels

            label_cmap, norm, ticks = _cluster_color_scale(labels)

            # High-dimensional SOAP vectors are drawn at their PCA coordinates.
            coords = t_pca if 'SOAP' in rep_name else rep
            ax.scatter(
                coords[:, 0],
                coords[:, 1],
                s=2,
                c=labels,
                cmap=label_cmap,
                norm=norm,
                alpha=0.5,
            )

            fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=label_cmap),
                         ax=ax,
                         orientation='horizontal',
                         ticks=ticks)

            if n_alg == 1:
                ax.set_title(rep_name, fontsize=12)
                fig.suptitle(algorithm_names[0])
            elif n_rep == 1:
                ax.set_title(cam_name, fontsize=12)
                fig.suptitle(rep_name)
            else:
                # A tuple passed to set_title would be rendered as its repr.
                ax.set_title(f'{rep_name}, {cam_name}', fontsize=12)

            ax.set_xticks([])
            ax.set_yticks([])

    fig.subplots_adjust(top=0.8)
    return all_labels

### K-Means on all Embeddings

In [None]:
# K-Means starts from random centroids; fixing `random_state` makes the
# cluster assignment (and therefore the figure) identical on every run.
km_labels = plot_clusters(
    [cluster.KMeans],
    ['KMeans'],
    algorithm_kw=[dict(n_clusters=5, random_state=0)],
)

In [None]:
# Agglomerative clustering and DBSCAN, both run on the PCA embedding only.
temp_labels = plot_clusters(
    algorithms=[cluster.AgglomerativeClustering, cluster.DBSCAN],
    algorithm_names=['Agglomerative Clustering', "DBSCAN"],
    reps=[t_pca],
    rep_names=["PCA"],
    algorithm_kw=[
        dict(n_clusters=5),
        dict(eps=0.1, min_samples=2),
    ],
)

In [None]:
# `silhouette_score` is otherwise only imported in a later cell, so on a fresh
# kernel (Restart & Run All) this cell raised NameError.
from sklearn.metrics import silhouette_score

# Silhouette of the two PCA-based labelings, evaluated on `mean_soaps`.
# Cell output: (Agglomerative Clustering, DBSCAN).
(
    silhouette_score(
        mean_soaps,
        temp_labels[('PCA', 'Agglomerative Clustering')].flatten(),
    ),
    silhouette_score(
        mean_soaps,
        temp_labels[('PCA', 'DBSCAN')].flatten(),
    ),
)

### Comparing Clusterings of the t-SNE Embedding

From here on, we'll just work with the t-SNE map, but you can swap it out to see other results!

For the sake of demonstration, we will also assume a "ground truth" clustering, which we take to be the HDBSCAN clustering of the t-SNE embedding.

In [None]:
# Reference ("ground truth") labels for the external metrics computed below:
# HDBSCAN run directly on the t-SNE embedding.
# NOTE(review): HDBSCAN marks points it considers noise with label -1, so
# `truth_labels` may contain -1 alongside the cluster ids — confirm that the
# downstream scores are meant to treat noise as its own class.
truth_labels = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=3).fit_predict(t_tsne)

In [None]:
# Four clusterings of the t-SNE embedding, compared in the score table below.
# K-Means is seeded so that the table is reproducible across runs; the HDBSCAN
# settings match the ones used for `truth_labels`.
all_labels = plot_clusters(
    algorithms=[
        cluster.KMeans,
        cluster.AgglomerativeClustering,
        cluster.DBSCAN,
        hdbscan.HDBSCAN,
    ],
    algorithm_names=[
        "K-Means",
        'Agglomerative Clustering',
        "DBSCAN",
        "HDBSCAN",
    ],
    reps=[t_tsne],
    rep_names=['t-SNE'],
    algorithm_kw=[
        dict(n_clusters=5, random_state=0),
        dict(n_clusters=5),
        dict(eps=3, min_samples=3),
        dict(min_cluster_size=10, min_samples=3),
    ],
)

In [None]:
from sklearn.metrics import (davies_bouldin_score, silhouette_score,
                             rand_score, fowlkes_mallows_score, jaccard_score,
                             f1_score)
# The six metric functions imported above, gathered in import order.
scorings = (
    davies_bouldin_score,
    silhouette_score,
    rand_score,
    fowlkes_mallows_score,
    jaccard_score,
    f1_score,
)
import pandas as pd

In [None]:
# One row per clustering in `all_labels`, one column per metric.
# The first two metrics score the labels against `mean_soaps`; the remaining
# four compare them with `truth_labels`.
# NOTE(review): jaccard_score and f1_score compare label values literally, so
# they depend on how each algorithm happens to number its clusters — confirm
# this is intended for a clustering comparison.
scores = np.array([
    [
        davies_bouldin_score(mean_soaps, flat_labels),
        silhouette_score(mean_soaps, flat_labels),
        rand_score(truth_labels, flat_labels),
        fowlkes_mallows_score(truth_labels, flat_labels),
        jaccard_score(truth_labels, flat_labels, average='macro'),
        f1_score(truth_labels, flat_labels, average='macro'),
    ]
    # Flatten each labeling once instead of once per metric.
    for flat_labels in (np.array(labels).flatten()
                        for labels in all_labels.values())
])

In [None]:
# Score table: rows are keyed by the (representation, algorithm) pairs of
# `all_labels`, columns follow the metric order used to build `scores`.
df = pd.DataFrame(
    data=scores,
    index=all_labels.keys(),
    columns=["DB", "Silhouette", "Rand", "FM", "Jaccard", "F"],
)

# Cell output: the table rendered with two decimals.
df.style.format(precision=2)

From here, we can see that HDBSCAN, K-Means, and Agglomerative Clustering best capture the shape of the data (seen in their low Davies-Bouldin and high silhouette scores). Because we are using HDBSCAN as our "ground truth", its external metrics (Rand, Fowlkes-Mallows, Jaccard, and F-scores) are the highest, with DBSCAN coming in second place.