In [None]:
from datetime import datetime
from pathlib import Path

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

In [None]:
# Parameters to set before running the notebook.
# Together they select which feature file is loaded from working_data/ and are
# embedded in the file names of the saved plots and cluster-label CSVs.

correlation_threshold = 0.8 # correlation threshold encoded in the input/output file names
dim_red_method = "umap" # out of {"umap", "pca", "tsne"}
perplexity = 50 # only used (in file names) when dim_red_method == "tsne"
standardization = True # True -> "with_standardization" input file, False -> "without_standardization"
group = "all" # out of {"male", "female", "all"}
survey_category = "stress" # out of {"stress", "depression", "needs"}
seed = 1 # for reproducibility: random_state of the train/test split and of KFold

In [None]:
# Load the dimensionality-reduced feature table from working_data/.
# The file name is assembled once from the configured parameters; the
# "_perplexity_<p>" segment is present only for t-SNE feature files.
dim_red_method_upper = dim_red_method.upper()  # reused in axis labels and output file names

perplexity_part = f"_perplexity_{perplexity}" if dim_red_method == "tsne" else ""
standardization_part = "with_standardization" if standardization else "without_standardization"
features_path = (
    f"working_data/{dim_red_method}_features_correlation_threshold_{correlation_threshold}"
    f"{perplexity_part}_{standardization_part}_{group}_{survey_category}.csv"
)
df = pd.read_csv(features_path)

# Column layout: the first two columns are identifiers, everything after is
# treated as the numeric feature matrix to cluster.
X = df.iloc[:, 2:].to_numpy()
ids = df.iloc[:, :2] # USER_ID and WEEK_START as identifiers

In [None]:
# Resampling-based selection of the number of clusters for Ward HAC with a
# k-nearest-neighbour connectivity constraint.
#
# AgglomerativeClustering has no predict(), so nothing is scored on held-out
# points. For every candidate n and every fold, HAC is refit on the fold's
# training subset and two scores are recorded on that subset:
#   * silhouette of the fold clustering, and
#   * ARI between the fold clustering and a reference clustering fitted on the
#     whole train+val split (restricted to the same rows), i.e. a stability score.
# best_n is the candidate with the highest mean silhouette across folds.

X_trainval, X_test = train_test_split(X, test_size=0.20, random_state=seed)

# Scaler is fitted on train+val only and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_trainval)
X_trainval_sc = scaler.transform(X_trainval)
X_test_sc = scaler.transform(X_test)

components_range = range(2, 11)  # candidate cluster counts 2..10
mean_silhouettes, mean_aris = [], []

kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Connectivity graph for the reference model (independent of n, so built once).
# NOTE(review): the reference graph uses 90 neighbours while the fold graphs use
# 30 -- confirm this difference is intentional.
base_graph = kneighbors_graph(X_trainval_sc, n_neighbors=90, include_self=False, metric='euclidean')

# With an integer random_state the splitter yields the same folds on every call,
# so the fold indices and their connectivity graphs do not depend on n.
# Build them once here instead of once per candidate n.
cv_folds = []
for tr_idx, _ in kf.split(X_trainval_sc):
    # sparse graph restricted to training fold
    fold_graph = kneighbors_graph(X_trainval_sc[tr_idx], n_neighbors=30,
                                  include_self=False, metric='euclidean')
    cv_folds.append((tr_idx, fold_graph))

for n in components_range:
    # reference model on full train+val
    hac_ref = AgglomerativeClustering(n_clusters=n, linkage='ward',
                                      connectivity=base_graph).fit(X_trainval_sc)
    lbl_ref_all = hac_ref.labels_

    fold_sil, fold_ari = [], []
    for tr_idx, cv_graph in cv_folds:
        X_tr = X_trainval_sc[tr_idx]

        hac_cv = AgglomerativeClustering(n_clusters=n, linkage='ward',
                                         connectivity=cv_graph).fit(X_tr)
        lbl_tr_cv = hac_cv.labels_

        # cluster quality on the fold's training subset
        fold_sil.append(silhouette_score(X_tr, lbl_tr_cv))
        # agreement with the reference labels of the same rows
        fold_ari.append(adjusted_rand_score(lbl_ref_all[tr_idx], lbl_tr_cv))

    mean_silhouettes.append(np.mean(fold_sil))
    mean_aris.append(np.mean(fold_ari))

    print(f"n={n:2d} | Silhouette={mean_silhouettes[-1]:.3f} | ARI={mean_aris[-1]:.3f}")

best_n = components_range[np.argmax(mean_silhouettes)]
print(f"Chosen n_clusters (by CV silhouette): {best_n}")

In [None]:
def _plot_cv_metric(scores, ylabel, legend_label, marker, color, file_prefix):
    """Plot one CV metric against the candidate cluster counts and save it as PNG.

    Reads the notebook-level names ``components_range``, ``best_n`` and the
    configuration parameters. The chosen ``best_n`` is marked with a dashed
    vertical line. No title is set on the figure.

    Parameters
    ----------
    scores : sequence of float
        One mean score per entry of ``components_range``.
    ylabel : str
        Y-axis label.
    legend_label : str
        Legend entry for the curve.
    marker, color : str
        Matplotlib marker and colour of the curve.
    file_prefix : str
        Leading part of the output file name (e.g. ``"silhouette"``).
    """
    plt.figure(figsize=(10, 4))
    plt.plot(components_range, scores, label=legend_label, marker=marker, color=color)
    plt.axvline(best_n, ls='--', c='gray')
    plt.xlabel('Number of Clusters')
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid()

    # The perplexity segment appears in the file name only for t-SNE features.
    perplexity_part = f"_perplexity_{perplexity}" if dim_red_method == "tsne" else ""
    plot_path = (
        f'clustering_plots/{file_prefix}_HAC_on_{dim_red_method_upper}'
        f'_correlation_{correlation_threshold}{perplexity_part}_{group}_{survey_category}.png'
    )
    # savefig raises if the target directory is missing, so create it first.
    Path(plot_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(plot_path, dpi=300, bbox_inches='tight', pad_inches=0.1, facecolor='white')

    plt.show()


# Silhouette
_plot_cv_metric(mean_silhouettes, 'Silhouette Score', 'CV Silhouette', 'o', 'tab:red', 'silhouette')

# ARI
_plot_cv_metric(mean_aris, 'Adjusted Rand Index', 'CV ARI', 'v', 'tab:green', 'ari')

In [None]:
# evaluation on test set
#
# The held-out split is clustered on its own with the selected number of
# clusters, and the silhouette of that clustering is reported. Labels are shifted
# to start at 1 for display.

best_n_components = best_n
#best_n_components = 3  # uncomment to override the CV choice manually

# Model refit on the full train+val split with the chosen n.
# NOTE(review): final_hac is not used by the test evaluation below (HAC cannot
# assign labels to new points) -- confirm whether it is still needed.
final_graph = kneighbors_graph(X_trainval_sc, n_neighbors=90, include_self=False, metric='euclidean')
final_hac = AgglomerativeClustering(n_clusters=best_n_components, linkage='ward', connectivity=final_graph).fit(X_trainval_sc)

# connectivity for test set
# With include_self=False, kneighbors_graph needs n_neighbors < n_samples; the
# test split is only 20% of the data, so clamp to avoid a ValueError on small inputs.
n_test_neighbors = min(90, X_test_sc.shape[0] - 1)
test_graph = kneighbors_graph(X_test_sc, n_neighbors=n_test_neighbors, include_self=False, metric='euclidean')
test_hac = AgglomerativeClustering(n_clusters=best_n_components, linkage='ward', connectivity=test_graph).fit(X_test_sc)

labels_test = test_hac.labels_ + 1
test_silhouette = silhouette_score(X_test_sc, labels_test)

print(f"Best n_clusters: {best_n_components}")
print(f"Silhouette score on test set: {test_silhouette:.3f}")

# Scatter of the first two (standardized) feature columns, coloured by cluster.
fig, ax = plt.subplots()
scatter = ax.scatter(X_test_sc[:, 0], X_test_sc[:, 1], c=labels_test, cmap='turbo', s=1, alpha=0.2)
ax.set_xlabel(f"{dim_red_method_upper} Component 1")
ax.set_ylabel(f"{dim_red_method_upper} Component 2")

# Build the legend by hand: one patch per cluster, coloured exactly as the
# scatter's colormap/normalisation maps that label.
unique_labels = np.unique(labels_test)
legend_patches = [
    mpatches.Patch(color=scatter.cmap(scatter.norm(label)), label=f"Cluster {label}")
    for label in unique_labels
]
ax.legend(handles=legend_patches)

# The perplexity segment appears in the file name only for t-SNE features.
perplexity_part = f"_and_perplexity_{perplexity}" if dim_red_method == "tsne" else ""
plot_path = (
    f'clustering_plots/HAC_with_{best_n_components}_clusters_on_{dim_red_method_upper}'
    f'_with_correlation_threshold_{correlation_threshold}{perplexity_part}_{group}_{survey_category}.png'
)
# savefig raises if the target directory is missing, so create it first.
Path(plot_path).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(plot_path, dpi=300, bbox_inches='tight', pad_inches=0.1, facecolor='white')

plt.show()

In [None]:
# compute HAC labels on all data
#
# Refit Ward HAC on every row of X (scaled with the scaler fitted on train+val)
# using the selected number of clusters, then export one 1-based cluster label
# per row together with its identifiers and first two feature columns.

X_all_sc = scaler.transform(X)  # scale once; reused for both the graph and the fit
# metric is spelled out for consistency with the other cells; 'euclidean'
# matches kneighbors_graph's default (minkowski with p=2).
all_graph = kneighbors_graph(X_all_sc, n_neighbors=90, include_self=False, metric='euclidean')
labels_all = AgglomerativeClustering(
    n_clusters=best_n_components,
    linkage='ward',
    connectivity=all_graph,
).fit_predict(X_all_sc) + 1

hac_labels_df = df.iloc[:, :2].copy()  # USER_ID and WEEK_START
# Unscaled values of the first two feature columns, as loaded from the feature file.
hac_labels_df[f"{dim_red_method_upper}_1"] = X[:, 0]
hac_labels_df[f"{dim_red_method_upper}_2"] = X[:, 1]
hac_labels_df["cluster_label"] = labels_all

# The perplexity segment appears in the file name only for t-SNE features.
perplexity_part = f"_perplexity_{perplexity}" if dim_red_method == "tsne" else ""
label_path = (
    f"working_data/cluster_labels/HAC_labels_{best_n_components}_clusters_on_"
    f"{dim_red_method_upper}_correlation_threshold_{correlation_threshold}"
    f"{perplexity_part}_{group}_{survey_category}.csv"
)

# to_csv raises if the target directory is missing, so create it first.
Path(label_path).parent.mkdir(parents=True, exist_ok=True)
hac_labels_df.to_csv(label_path, index=False)