In [None]:
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
import itertools

In [None]:
# Notebook configuration -- edit these values to select which clustering run is analysed.

# Clustering run
clustering_method = "HAC"  # one of {"GMM", "HAC"}
best_n_components = 3  # number of clusters, as found in the GMM and HAC clustering

# Dimensionality reduction the clustering was computed on
dim_red_method = "umap"  # one of {"umap", "pca", "tsne"}
dim_red_method_upper = str.upper(dim_red_method)  # upper-case form used in file names
perplexity = 50  # only relevant when dim_red_method == "tsne"

# Data subset and feature selection
group = "all"  # one of {"male", "female", "all"}
survey_category = "stress"  # one of {"stress", "depression", "needs"}
correlation_threshold = 0.8  # threshold embedded in the input file names

In [None]:
# Read the cluster assignments and the uncorrelated weekly features from disk.

# Identifier columns present in both tables.
id_cols = ["USER_ID", "WEEK_START"]

# Only t-SNE label files carry the perplexity in their name; every other
# reduction method uses the same template without that fragment.
perplexity_part = f"_perplexity_{perplexity}" if dim_red_method == "tsne" else ""
label_path = (
    f"working_data/cluster_labels/"
    f"{clustering_method}_labels_{best_n_components}_clusters_on_"
    f"{dim_red_method_upper}_correlation_threshold_{correlation_threshold}"
    f"{perplexity_part}_{group}_{survey_category}.csv"
)
features_path = (
    f"working_data/mhs_sleep_weekly_uncorr_features_correlation_threshold_"
    f"{correlation_threshold}_{group}_{survey_category}.csv"
)

# Cluster labels, with WEEK_START parsed to datetime.
df_clusters = pd.read_csv(label_path).assign(
    WEEK_START=lambda frame: pd.to_datetime(frame['WEEK_START'])
)

# Features: rows with any missing value are discarded before WEEK_START is parsed.
df_features = (
    pd.read_csv(features_path)
    .dropna()
    .assign(WEEK_START=lambda frame: pd.to_datetime(frame['WEEK_START']))
)

In [None]:
# HAC cluster ids are renumbered so that they line up with the GMM ids
# (see the file external_validation_GMM_HAC for the reference and visualization).
#
# Label swap to apply per survey category. Categories without an entry keep
# their labels unchanged. Swaps that were tried and are currently disabled:
#   stress: {1: 2, 2: 1}
#   needs:  {2: 3, 3: 2}
hac_label_swaps = {
    "depression": {2: 3, 3: 2},
}

if clustering_method == "HAC":
    swap = hac_label_swaps.get(survey_category)
    if swap is not None:
        df_clusters['cluster_label'] = df_clusters['cluster_label'].replace(swap)

In [None]:
# Merge cluster labels onto the weekly features (inner join on the id columns).
# NOTE: validate='many_to_many' is accepted by pandas but performs no
# key-uniqueness check, so duplicated (USER_ID, WEEK_START) keys are not detected here.
df_corr = df_features.merge(df_clusters, on=id_cols, how='inner', validate='many_to_many')

# Remove the two embedding coordinate columns from the merged frame.
# The column prefix is assumed to follow the selected reduction method
# (e.g. "PCA_1", "TSNE_1") -- TODO confirm against the notebook that writes the
# label files. "UMAP" is kept as a fallback because those are the names this
# cell originally hard-coded.
for embedding_prefix in dict.fromkeys([dim_red_method_upper, "UMAP"]):
    embedding_cols = [f"{embedding_prefix}_1", f"{embedding_prefix}_2"]
    if set(embedding_cols).issubset(df_corr.columns):
        df_corr = df_corr.drop(columns=embedding_cols)
        break
else:
    # Same exception type the hard-coded drop raised when the columns were missing.
    raise KeyError(
        f"No embedding columns found for prefix {dim_red_method_upper!r} "
        f"(or 'UMAP') in the merged frame"
    )

In [None]:
# Permutation-based association between cluster membership and each feature.
#
# Cluster labels are nominal, so there is no natural numeric order. For every
# feature, each assignment of the numbers 1, 2, 3 to the three clusters is
# tried, the Pearson correlation between the feature and the encoded label is
# computed, and the assignment with the highest correlation is kept.
# Mirroring an assignment (1 <-> 3) negates r, so the best r is never negative.

# Give the clusters neutral textual names so they are not confused with the
# numeric codes assigned below.
df_corr['cluster_label'] = df_corr['cluster_label'].replace({1: 'x', 2: 'y', 3: 'z'})
labels = ['x', 'y', 'z']

# Any label outside the mapping would become NaN in the encoding and its rows
# would silently drop out of every correlation, so fail loudly instead.
unexpected_labels = df_corr.loc[~df_corr['cluster_label'].isin(labels), 'cluster_label'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected cluster labels {unexpected_labels.tolist()}; expected only 1, 2, 3"
    )

feature_cols = [c for c in df_corr.columns if c not in ['USER_ID', 'WEEK_START', 'cluster_label']]

perms = list(itertools.permutations([1, 2, 3]))

# The encoded label column depends only on the permutation, not on the feature,
# so build it once per permutation instead of once per (feature, permutation).
encodings = []
for p in perms:
    m = dict(zip(labels, p))
    encodings.append((m, df_corr['cluster_label'].map(m)))

records = []

for feat in feature_cols:
    # NaN / None mean that no defined correlation was found, e.g. for a
    # constant feature.
    best_corr, best_map = np.nan, None
    for m, encoded in encodings:
        r = df_corr[feat].corr(encoded)  # Pearson r
        if pd.isna(r):
            continue
        if best_map is None or r > best_corr:
            best_corr, best_map = r, m
    records.append({
        'feature': feat,
        'best_corr': best_corr,
        'best_map': best_map
    })

# Explicit columns keep the sort working even when there are no feature columns.
results = (
    pd.DataFrame(records, columns=['feature', 'best_corr', 'best_map'])
    .sort_values(by='best_corr', ascending=False)
    .reset_index(drop=True)
)
print(results)