This is the **4th** notebook in the clustering pipeline. It takes the MatchedDataSet object and clusters every matched dataset with five methods: KMedoids; HDBSCAN using the gower-daisy distance matrix; Agglomerative Clustering; HDBSCAN using the SNF (of risk_factor and biomarker data) distance matrix; and Spectral clustering using the SNF (of risk_factor and biomarker data) affinity matrix.

Note that, to combat the stochastic behaviour of some clustering algorithms, I have implemented seeding in all functions. The base seed is 42.

Use <u>***pappas_tadam***</u> virtual environment.

In [None]:
# Set this to whatever directory GoodCopy is in.
# Keep the trailing slash: the cells below build every path by plain string
# concatenation (home_dir + "GoodCopy/...").

home_dir = "/home/l/lungboy/tadam/Project/"

# Importing Packages and Data

In [None]:
# Importing packages and functions

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

sys.path.append(home_dir + 'GoodCopy/Functions')

import FunctionsOOPGood as func

In [None]:
# Load the saved DataSet object: create an empty instance, then open the
# object stored under GoodCopy/Objects into it.

data_object_path = home_dir + "GoodCopy/Objects/data_object"

data = func.DataSet(empty=True)
data.open_DataSet(data_object_path)

In [None]:
# Load the saved MatchedDataSet object: create an empty instance, then open
# the object stored under GoodCopy/Objects into it.

matched_object_path = home_dir + "GoodCopy/Objects/matched_data_saved"

data_matched = func.MatchedDataSet(empty=True)
data_matched.open_MatchedDataSet(matched_object_path)

In [None]:
# Importing Visualizations
#
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open until it is closed. Opening it in a `with` block reads each UMAP
# projection into memory and then releases the file handle.
# NOTE: `vis` is closed once this block finishes; use the UMAP_* arrays below
# rather than indexing `vis` again.

with np.load(home_dir + "GoodCopy/Objects/UMAP_projections.npz") as vis:
    UMAP_gower = vis["UMAP_gower"]
    UMAP_biodata = vis["UMAP_biodata"]
    UMAP_snf = vis["UMAP_snf"]
    UMAP_onehot = vis["UMAP_onehot"]

# Matched Clustering Workflow

This is the workflow I used to generate the matched clustering labels that are later used for ensemble clustering.

#### Generating Labels

In [None]:
# KMedoids labels: one run per cluster count (2 through 9) on every matched dataset

kmedoids_labels = []

for n_clusters in range(2, 10):

    # One label array per matched dataset at this cluster count
    per_dataset_labels = [
        matched.kMedoids_func(n_clusters=n_clusters).labels_
        for matched in data_matched.matched_data
    ]

    # Extend the labels to the rest of the dataset; patients that are not part
    # of the MatchedDataSet are filled in with NA values
    expanded = func.MatchedDataSet.PrepForSemiSupervised_matched(
        ds=data_matched, labels=per_dataset_labels
    )
    kmedoids_labels.extend(expanded)

print("Done Kmedoids")

In [None]:
# HDBSCAN labels: a single run (precomputed=True) on every matched dataset

per_dataset_labels = [
    matched.HDBSCAN_func(precomputed=True).labels_
    for matched in data_matched.matched_data
]

# Map the per-dataset labels back onto the full dataset
hdb_labels = func.MatchedDataSet.PrepForSemiSupervised_matched(data_matched, per_dataset_labels)

print("Done HDBSCAN")

In [None]:
# Agglomerative clustering labels (average linkage, precomputed=True):
# one run per cluster count (2 through 9) on every matched dataset

agglo_labels = []

for n_clusters in range(2, 10):

    # One label array per matched dataset at this cluster count
    per_dataset_labels = [
        matched.AgglomerativeClustering_func(
            n_clusters=n_clusters, precomputed=True, linkage="average"
        ).labels_
        for matched in data_matched.matched_data
    ]

    # Map the per-dataset labels back onto the full dataset and collect them
    expanded = func.MatchedDataSet.PrepForSemiSupervised_matched(data_matched, per_dataset_labels)
    agglo_labels.extend(expanded)

print("Done Agglo")

In [None]:
# HDBSCAN labels on SNF: each matched dataset is clustered on its own
# snf_dist matrix, passed as precomputed data

snf_per_dataset_labels = [
    matched.HDBSCAN_func(data=matched.snf_dist, precomputed=True).labels_
    for matched in data_matched.matched_data
]

# Map the per-dataset labels back onto the full dataset
snf_hdb_labels = func.MatchedDataSet.PrepForSemiSupervised_matched(data_matched, snf_per_dataset_labels)

In [None]:
# Spectral clustering labels on SNF: one run per cluster count (2 through 9)
# on every matched dataset
# NOTE(review): no `data=` argument is passed here, unlike the HDBSCAN SNF
# cell; presumably SpectralClustering_func uses the SNF affinity matrix by
# default when precomputed=True. Confirm against FunctionsOOPGood.

spec_labels = []

for n_clusters in range(2, 10):

    # One label array per matched dataset at this cluster count
    per_dataset_labels = [
        matched.SpectralClustering_func(n_clusters=n_clusters, precomputed=True).labels_
        for matched in data_matched.matched_data
    ]

    # Map the per-dataset labels back onto the full dataset and collect them
    expanded = func.MatchedDataSet.PrepForSemiSupervised_matched(data_matched, per_dataset_labels)
    spec_labels.extend(expanded)

#### Saving Labels

In [None]:
# Save every label set into a single .npz archive, keyed by its variable name.
# np.savez appends the ".npz" extension because the path does not end in it.
label_sets = {
    "kmedoids_labels": kmedoids_labels,
    "hdb_labels": hdb_labels,
    "agglo_labels": agglo_labels,
    "snf_hdb_labels": snf_hdb_labels,
    "spec_labels": spec_labels,
}

np.savez(home_dir + "GoodCopy/Objects/labels_matched", **label_sets)