This is the **5th** notebook in the clustering pipeline. It takes the clusters from the randomly matched datasets and computes an ensemble solution for each algorithm, as well as for all algorithms combined.

We use the ClusterEnsembles package by burtonrj on GitHub. The repo can be found at <u>https://github.com/burtonrj/ClusterEnsembles/tree/main</u>.

Use the <u>***pappas_tadam***</u> virtual environment.

In [None]:
# Set this to whatever directory GoodCopy is in

import os

home_dir = "/home/l/lungboy/tadam/Project/"

# Later cells build file paths by plain string concatenation
# (home_dir + "GoodCopy/..."), so guarantee a trailing path separator
# even if the directory above is entered without one.
home_dir = os.path.join(home_dir, "")

# Importing Packages and Data

In [None]:
# Importing packages and functions

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

sys.path.append(home_dir + 'GoodCopy/Functions')

import FunctionsOOPGood as func

In [None]:
# Importing DataSet object

# Start from an empty DataSet, then open the saved object into it
data = func.DataSet(empty=True)
data_object_path = home_dir + "GoodCopy/Objects/data_object"
data.open_DataSet(data_object_path)

In [None]:
# Importing MatchedDataSet object

# Start from an empty MatchedDataSet, then open the saved object into it
data_matched = func.MatchedDataSet(empty=True)
matched_data_path = home_dir + "GoodCopy/Objects/matched_data_saved"
data_matched.open_MatchedDataSet(matched_data_path)

In [None]:
# Importing Visualizations

# Saved .npz archive holding one UMAP projection per key
vis = np.load(home_dir + "GoodCopy/Objects/UMAP_projections.npz")

# Pull each projection out of the archive under a variable of the same name
UMAP_gower, UMAP_biodata, UMAP_snf, UMAP_onehot = (
    vis[key]
    for key in ("UMAP_gower", "UMAP_biodata", "UMAP_snf", "UMAP_onehot")
)

In [None]:
# Importing labels

# Saved .npz archive holding one label array per clustering algorithm
loaded_labels = np.load(home_dir + "GoodCopy/Objects/labels_matched.npz")

# Pull each label array out of the archive under a variable of the same name
kmedoids_labels, hdb_labels, agglo_labels, snf_hdb_labels, spec_labels = (
    loaded_labels[key]
    for key in ("kmedoids_labels", "hdb_labels", "agglo_labels", "snf_hdb_labels", "spec_labels")
)

# Ensemble Clustering Workflow

This is the workflow I used to combine the matched labels using ensemble clustering.

In [None]:
# Importing packages

import ensembleclustering as CE
import itertools

In [None]:
# Kmedoids

# One ensemble solution per requested number of classes (2 through 9)
for n_classes in range(2, 10):
    # Ensemble labels built from the kmedoids label sets
    kmedoids_input = np.asarray(kmedoids_labels)
    kmedoids_clusters = CE.cluster_ensembles(kmedoids_input, nclass=n_classes)
    print("done kmed", n_classes)

    # Write this solution to its own CSV, named after the class count
    kmedoids_out_path = home_dir + "GoodCopy/EnsembleResults/{}_kmedoids_labels.csv".format(n_classes)
    kmedoids_frame = pd.DataFrame(np.asarray(kmedoids_clusters))
    kmedoids_frame.to_csv(kmedoids_out_path)

In [None]:
# HDBSCAN using gower distance matrix

# One ensemble solution per requested number of classes (2 through 9)
for n_classes in range(2, 10):
    # Ensemble labels built from the HDBSCAN label sets
    hdb_input = np.asarray(hdb_labels)
    hdb_clusters = CE.cluster_ensembles(hdb_input, nclass=n_classes)
    print("done hdb", n_classes)

    # Write this solution to its own CSV, named after the class count
    hdb_out_path = home_dir + "GoodCopy/EnsembleResults/{}_hdb_labels.csv".format(n_classes)
    hdb_frame = pd.DataFrame(np.asarray(hdb_clusters))
    hdb_frame.to_csv(hdb_out_path)

In [None]:
# Agglomerative Clustering

# One ensemble solution per requested number of classes (2 through 9)
for n_classes in range(2, 10):
    # Ensemble labels built from the agglomerative clustering label sets
    agglo_input = np.asarray(agglo_labels)
    agglo_clusters = CE.cluster_ensembles(agglo_input, nclass=n_classes)
    print("done agglo", n_classes)

    # Write this solution to its own CSV, named after the class count
    agglo_out_path = home_dir + "GoodCopy/EnsembleResults/{}_agglo_labels.csv".format(n_classes)
    agglo_frame = pd.DataFrame(np.asarray(agglo_clusters))
    agglo_frame.to_csv(agglo_out_path)

In [None]:
# Kmedoids + HDBSCAN (gower) + Agglomerative Clustering

# Combining all non-snf labels. The pooled list does not depend on the
# requested number of classes, so it is built once rather than on every pass.
all_labels = list(itertools.chain(kmedoids_labels, agglo_labels, hdb_labels))

# Iterating through each max number of labels from 2 to 9
for i in range(2, 10):
    # Generating ensemble labels from all non-snf clusters.
    # np.asarray stays inside the loop so every call still receives a freshly
    # built array, exactly as before the list construction was hoisted.
    all_clusters = CE.cluster_ensembles(np.asarray(all_labels), nclass=i)
    print("done all", i)

    # Save this solution to its own CSV, named after the class count
    pd.DataFrame(np.asarray(all_clusters)).to_csv(home_dir + "GoodCopy/EnsembleResults/{}_all_labels.csv".format(i))

In [None]:
# HDBSCAN using SNF distance matrix

# One ensemble solution per requested number of classes (2 through 9)
for n_classes in range(2, 10):
    # Ensemble labels built from the HDBSCAN snf label sets
    snf_hdb_input = np.asarray(snf_hdb_labels)
    snf_hdb_clusters = CE.cluster_ensembles(snf_hdb_input, nclass=n_classes)
    print("done SNF HDBSCAN", n_classes)

    # Write this solution to its own CSV, named after the class count
    snf_hdb_out_path = home_dir + "GoodCopy/EnsembleResults/{}_snf_hdb_labels.csv".format(n_classes)
    snf_hdb_frame = pd.DataFrame(np.asarray(snf_hdb_clusters))
    snf_hdb_frame.to_csv(snf_hdb_out_path)

In [None]:
# Spectral Clustering with SNF

# One ensemble solution per requested number of classes (2 through 9)
for n_classes in range(2, 10):
    # Ensemble labels built from the Spectral snf label sets
    spectral_input = np.asarray(spec_labels)
    snf_spectral_clusters = CE.cluster_ensembles(spectral_input, nclass=n_classes)
    print("done Spectral", n_classes)

    # Write this solution to its own CSV, named after the class count
    spectral_out_path = home_dir + "GoodCopy/EnsembleResults/{}_snf_spectral_labels.csv".format(n_classes)
    spectral_frame = pd.DataFrame(np.asarray(snf_spectral_clusters))
    spectral_frame.to_csv(spectral_out_path)

In [None]:
# Kmedoids + HDBSCAN (gower) + Agglomerative Clustering + HDBSCAN (SNF) + Spectral Clustering

# Combining all labels, including snf. The pooled list does not depend on the
# requested number of classes, so it is built once rather than on every pass.
all_plus_snf_labels = list(itertools.chain(kmedoids_labels, agglo_labels, hdb_labels, snf_hdb_labels, spec_labels))

# Iterating through each max number of labels from 2 to 9
for i in range(2, 10):
    # Generating ensemble labels for all clusters.
    # np.asarray stays inside the loop so every call still receives a freshly
    # built array, exactly as before the list construction was hoisted.
    all_plus_snf_clusters = CE.cluster_ensembles(np.asarray(all_plus_snf_labels), nclass=i)
    print("done all plus", i)

    # Save this solution to its own CSV, named after the class count
    pd.DataFrame(np.asarray(all_plus_snf_clusters)).to_csv(home_dir + "GoodCopy/EnsembleResults/{}_all_plus_snf_labels.csv".format(i))