In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import confusion_matrix

In [None]:
# Parameters: edit these values to choose which clustering run is compared.
# They are used below to build the label file paths and the figure file names.
correlation_threshold = 0.8
dim_red_method = "umap"  # out of {"umap", "pca", "tsne"}
dim_red_method_upper = dim_red_method.upper()
perplexity = 50  # t-SNE only
group = "all"  # out of {"male", "female", "all"}
best_n_components = 3  # found in GMM and HAC clustering
survey_category = "stress"  # out of {"stress", "depression", "needs"}

# Fail fast on typos: later cells branch on these strings with a catch-all
# `else`, so a misspelled value would otherwise silently pick the wrong files
# or the wrong relabeling instead of raising an error.
assert dim_red_method in {"umap", "pca", "tsne"}, (
    f"unknown dim_red_method: {dim_red_method!r}"
)
assert group in {"male", "female", "all"}, f"unknown group: {group!r}"
assert survey_category in {"stress", "depression", "needs"}, (
    f"unknown survey_category: {survey_category!r}"
)

In [None]:
# Load the GMM and HAC cluster labels of the configured run and join them.

# Only t-SNE runs carry the perplexity in their label file names.
if dim_red_method == "tsne":
    run_suffix = f"_perplexity_{perplexity}_{group}_{survey_category}.csv"
else:
    run_suffix = f"_{group}_{survey_category}.csv"

# Both label files share one naming scheme and differ only in the algorithm prefix.
label_path_gmm, label_path_hac = (
    f"working_data/cluster_labels/"
    f"{algorithm}_labels_{best_n_components}_clusters_on_"
    f"{dim_red_method_upper}_correlation_threshold_{correlation_threshold}"
    f"{run_suffix}"
    for algorithm in ("GMM", "HAC")
)

# Give each label column a method-specific name so both survive the merge.
df_gmm = pd.read_csv(label_path_gmm).rename(columns={'cluster_label': 'cluster_label_gmm'})
df_hac = pd.read_csv(label_path_hac).rename(columns={'cluster_label': 'cluster_label_hac'})

# NOTE(review): the join keys are hard-coded to UMAP_1/UMAP_2 - confirm the
# label files use these column names when dim_red_method is "pca" or "tsne".
# validate='many_to_many' is accepted by pandas but performs no check.
merge_keys = ['USER_ID', 'WEEK_START', 'UMAP_1', 'UMAP_2']
df = pd.merge(df_gmm, df_hac, on=merge_keys, how='inner', validate='many_to_many')

# Show the columns of both inputs and of the merged frame, then the merged row count.
for frame in (df_gmm, df_hac, df):
    print(frame.columns)

print(len(df))

In [None]:
# Switch HAC cluster labels so they can be compared with the GMM labels.
# One relabeling map per survey category; an empty map leaves the labels as loaded.
# Previously tried but disabled maps: stress {1: 2, 2: 1}, needs {2: 3, 3: 2}.
hac_relabeling = {
    "stress": {},
    "depression": {2: 3, 3: 2},
    "needs": {},
}

# Keep the labels as loaded in a separate column and always relabel from that
# copy. Swapping `cluster_label_hac` in place would undo itself every second
# time this cell is run.
if 'cluster_label_hac_raw' not in df.columns:
    df['cluster_label_hac_raw'] = df['cluster_label_hac']

# Categories without an entry are left unchanged, like the former catch-all branch.
label_map = hac_relabeling.get(survey_category, {})
if label_map:
    df['cluster_label_hac'] = df['cluster_label_hac_raw'].replace(label_map)
else:
    df['cluster_label_hac'] = df['cluster_label_hac_raw']

In [None]:
# Draw one scatter plot per clustering method and save each one as a PNG.

coords = df[['UMAP_1', 'UMAP_2']].to_numpy()
labels_dict = {
    'GMM': df['cluster_label_gmm'].to_numpy(),
    'HAC': df['cluster_label_hac'].to_numpy(),
}

# The perplexity is only part of the file name for t-SNE runs; it is the same
# for every method, so it is worked out once before the loop.
perplexity_part = f"_and_perplexity_{perplexity}" if dim_red_method == "tsne" else ""

for method, labels in labels_dict.items():
    fig, ax = plt.subplots()
    scatter = ax.scatter(coords[:, 0], coords[:, 1], c=labels, cmap='turbo', s=1, alpha=0.2)
    ax.set_xlabel(f"{dim_red_method_upper} Component 1")
    ax.set_ylabel(f"{dim_red_method_upper} Component 2")
    #ax.set_title(f"{method} with {best_n_components} clusters on {dim_red_method_upper} ({group})")

    # One legend entry per cluster id, coloured exactly as the scatter maps that id.
    patches = [
        mpatches.Patch(
            color=scatter.cmap(scatter.norm(cluster_id)),
            label=f"Cluster {cluster_id}",
        )
        for cluster_id in np.unique(labels)
    ]
    ax.legend(handles=patches, title="Cluster", loc='best')

    fname = (
        f"clustering_plots/{method}_with_{best_n_components}_clusters_on_"
        f"{dim_red_method_upper}_correlation_threshold_{correlation_threshold}"
        f"{perplexity_part}_{group}_{survey_category}.png"
    )

    fig.savefig(
        fname,
        dpi=300,
        bbox_inches='tight',
        pad_inches=0.1,
        facecolor='white',
    )
    plt.show()

In [None]:
# Pull the two label columns out of the merged frame for the metric cells below
# and preview the first rows of each.
labels_gmm, labels_hac = df['cluster_label_gmm'], df['cluster_label_hac']

print(labels_gmm.head(), labels_hac.head(), sep="\n")

In [None]:
# External validation metrics: how closely do the GMM and HAC labelings agree?

ari = adjusted_rand_score(labels_gmm, labels_hac)
print(f"Adjusted rand index: {ari}")

nmi = normalized_mutual_info_score(labels_gmm, labels_hac)
print(f"Normalized mutual information score: {nmi}")

# Homogeneity and completeness are not symmetric: the GMM labels are passed as
# the reference (first argument) and the HAC labels as the prediction.
h = homogeneity_score(labels_gmm, labels_hac)
c = completeness_score(labels_gmm, labels_hac)
v = v_measure_score(labels_gmm, labels_hac)
print(f"Homogeneity Score: {h:.6f}")
print(f"Completeness Score: {c:.6f}")
print(f"V-Measure Score:    {v:.6f}")

# Confusion matrix with the actual cluster ids on both axes
# (rows: GMM labels, columns: HAC labels) instead of an anonymous 0..k-1 index.
cm_labels = np.union1d(labels_gmm, labels_hac)
cm = confusion_matrix(labels_gmm, labels_hac, labels=cm_labels)
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(cm_labels, name="GMM"),
    columns=pd.Index(cm_labels, name="HAC"),
)

# Element-wise disagreement; only meaningful when both methods use the same ids
# for the same clusters.
num_differences = (labels_gmm != labels_hac).sum()
print(f"Number of differences: {num_differences}")

# Must be the last expression of the cell: a bare expression in the middle of a
# cell is evaluated and discarded, so the matrix was never displayed before.
cm_df

In [None]:
# create overlap clusters if clustering methods disagree on cluster label

def resolve_combined_cluster(row, n_clusters=3):
    """Combine the GMM and HAC label of one row into a single cluster id.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'cluster_label_gmm' and 'cluster_label_hac'.
    n_clusters : int, default 3
        Number of clusters per method; valid labels are 1..n_clusters.
        The default reproduces the overlap ids 4, 5 and 6 for three clusters.

    Returns
    -------
    The shared label if both methods agree. If they disagree on two valid
    labels, an "overlap" id greater than n_clusters: the unordered pairs
    (1, 2), (1, 3), ..., (n_clusters - 1, n_clusters) are numbered
    n_clusters + 1, n_clusters + 2, ... in that order. -1 if the labels
    differ and at least one of them is not a valid label.
    """
    gmm = row['cluster_label_gmm']
    hac = row['cluster_label_hac']

    if gmm == hac:
        return gmm

    # Anything outside 1..n_clusters (including NaN) cannot form an overlap pair.
    valid_labels = range(1, n_clusters + 1)
    if gmm not in valid_labels or hac not in valid_labels:
        return -1

    # Order the pair so that (a, b) and (b, a) map to the same overlap id.
    low, high = (int(gmm), int(hac)) if gmm < hac else (int(hac), int(gmm))

    # Number of pairs whose smaller label is below `low`:
    # sum over i in 1..low-1 of (n_clusters - i).
    pairs_before = (low - 1) * n_clusters - (low - 1) * low // 2
    return n_clusters + pairs_before + (high - low)

# Evaluate the helper once per row (axis=1) and store the result as a new column.
combined_cluster_ids = df.apply(resolve_combined_cluster, axis=1)
df['cluster_label_combined'] = combined_cluster_ids

In [None]:
# Plot the combined labeling: rows where GMM and HAC agree keep their cluster
# id, rows where they disagree carry the overlap ids assigned above.

X_plot = df[['UMAP_1', 'UMAP_2']].to_numpy()
combined_labels = df['cluster_label_combined'].to_numpy()

# Explicit figure/axes so the plot never lands on a figure left open earlier.
fig, ax = plt.subplots()
scatter = ax.scatter(X_plot[:, 0], X_plot[:, 1], c=combined_labels, cmap='turbo', s=1, alpha=0.2)
ax.set_xlabel(f"{dim_red_method_upper} Component 1")
ax.set_ylabel(f"{dim_red_method_upper} Component 2")

# One legend entry per combined id, coloured exactly as the scatter maps that id.
unique_labels = np.unique(combined_labels)
colors = [scatter.cmap(scatter.norm(label)) for label in unique_labels]
legend_patches = [
    mpatches.Patch(color=color, label=f"Cluster {label}")
    for label, color in zip(unique_labels, colors)
]
ax.legend(handles=legend_patches, title="Combined Cluster")

# t-SNE runs are additionally identified by their perplexity. Without it in the
# file name, runs with different perplexities would overwrite the same PNG.
# For the other methods the file name is unchanged.
perplexity_part = f"_and_perplexity_{perplexity}" if dim_red_method == "tsne" else ""
filename = (
    f"clustering_plots/clustering_combined_GMM_HAC_{dim_red_method_upper}"
    f"_correlation_{correlation_threshold}{perplexity_part}"
    f"_survey_{group}_{survey_category}.png"
)
fig.savefig(filename, dpi=300, bbox_inches='tight', pad_inches=0.1, facecolor='white')
plt.show()

In [None]:
# Report how many datapoints fall into each cluster for GMM, HAC and the
# combined labeling.

for heading, column in (
    ("GMM", 'cluster_label_gmm'),
    ("HAC", 'cluster_label_hac'),
    ("Combined", 'cluster_label_combined'),
):
    print(f"{heading}: number of datapoints per cluster")
    print(df[column].value_counts())