In [None]:
from statsmodels.stats.multitest import multipletests
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.stats import kruskal, chi2_contingency, pearsonr

from scipy.stats import shapiro, levene, probplot, wilcoxon
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# parameters
# These values are interpolated into the names of the cluster-label and
# feature CSV files that the cells below load, so they must match the
# settings that were used when those files were written.
clustering_method = "HAC" # out of {"GMM", "HAC"}
correlation_threshold = 0.8 # part of the cluster-label and feature file names
dim_red_method = "umap" # out of {"umap", "pca", "tsne"}
dim_red_method_upper = dim_red_method.upper() # upper-case form used in the cluster-label file names
perplexity = 50 # t-SNE only (only enters the label path when dim_red_method == "tsne")
group = "all" # "male", "female", "all"
best_n_components = 3 # number of clusters in the label file name; found in GMM and HAC clustering

In [None]:
# load clustering data with cluster labels for each category as well as demographic and survey data
# The label files share one path prefix; only t-SNE runs carry the perplexity in it.
if dim_red_method != "tsne":
    label_path = (
        f"working_data/cluster_labels/"
        f"{clustering_method}_labels_{best_n_components}_clusters_on_"
        f"{dim_red_method_upper}_correlation_threshold_{correlation_threshold}"
        f"_{group}_"
    )
else:
    label_path = (
        f"working_data/cluster_labels/"
        f"{clustering_method}_labels_{best_n_components}_clusters_on_"
        f"{dim_red_method_upper}_correlation_threshold_{correlation_threshold}"
        f"_perplexity_{perplexity}_{group}_"
    )


def _read_cluster_labels(file_name):
    """Read one cluster-label CSV below ``label_path`` and parse WEEK_START as datetime."""
    labels = pd.read_csv(label_path + file_name)
    labels['WEEK_START'] = pd.to_datetime(labels['WEEK_START'])
    return labels


df_clusters_stress = _read_cluster_labels("stress.csv")
df_clusters_depression = _read_cluster_labels("depression.csv")
df_clusters_needs = _read_cluster_labels("needs.csv")

# load survey data
df_survey = pd.read_csv("working_data/mhs_survey_sorted_without_nan.csv")

# Map every submission to the Monday of its week. The result is normalised to
# midnight: without that, a SUBMITDATE that carries a time of day would give a
# WEEK_START such as "Monday 14:23", which compares as later than a
# midnight-aligned week start of the same week and would be dropped by the
# `SURVEY_WEEK <= CLUSTER_WEEK` filter in filter_survey_weeks.
# NOTE(review): presumably the cluster-label WEEK_START values are
# midnight-aligned dates — confirm. For date-only SUBMITDATE values the
# normalisation is a no-op.
submit_time = pd.to_datetime(df_survey['SUBMITDATE'])
df_survey['WEEK_START'] = (
    submit_time - pd.to_timedelta(submit_time.dt.weekday, unit='D')
).dt.normalize()
# reassign instead of dropping in place so that re-running the cell is safe
df_survey = df_survey.drop(columns=['SUBMITDATE'])

# load demographics data and calculate age
df_demographics = pd.read_csv("original_data/mhs_demographics_sorted.csv")

for date_column in ("FIRST_SUBMISSION_DATE", "LAST_SUBMISSION_DATE", "BIRTHDAY"):
    df_demographics[date_column] = pd.to_datetime(df_demographics[date_column])

# age is evaluated at the midpoint between a user's first and last submission
first_submission = df_demographics["FIRST_SUBMISSION_DATE"]
last_submission = df_demographics["LAST_SUBMISSION_DATE"]
df_demographics["MIDPOINT_DATE"] = pd.to_datetime(
    first_submission + (last_submission - first_submission) / 2
)


def _age_at_midpoint(row):
    """Return the age in completed years at MIDPOINT_DATE for one demographics row."""
    midpoint = row["MIDPOINT_DATE"]
    birthday = row["BIRTHDAY"]
    # True (counts as 1) while this year's birthday has not been reached yet
    birthday_still_ahead = (midpoint.month, midpoint.day) < (birthday.month, birthday.day)
    return midpoint.year - birthday.year - birthday_still_ahead


df_demographics["AGE"] = df_demographics.apply(_age_at_midpoint, axis=1)
df_demographics = df_demographics.rename(columns={"WHOOP_BMI": "BMI"})
df_demographics = df_demographics[["USER_ID", "AGE", "GENDER", "BMI"]].copy()

In [None]:
# Survey item columns, grouped by questionnaire. The strings are used further
# down to select columns from df_survey, so each one must match a column name
# of the survey CSV exactly.

# Stress items.
# NOTE(review): these ten items look like the Perceived Stress Scale (PSS-10) — confirm.
# NOTE(review): the items about feeling confident, things going your way,
# controlling irritations and being on top of things are positively worded,
# while add_total_score takes a plain row sum — confirm that these items are
# already reverse-coded in the survey file.
stress = [
    'HOW OFTEN HAVE YOU BEEN UPSET BECAUSE OF SOMETHING THAT HAPPENED UNEXPECTEDLY?',
    'HOW OFTEN HAVE YOU FELT THAT YOU WERE UNABLE TO CONTROL THE IMPORTANT THINGS IN YOUR LIFE?',
    'HOW OFTEN HAVE YOU FELT NERVOUS AND STRESSED?',
    'HOW OFTEN HAVE YOU FELT CONFIDENT ABOUT YOUR ABILITY TO HANDLE YOUR PERSONAL PROBLEMS?',
    'HOW OFTEN HAVE YOU FELT THAT THINGS WERE GOING YOUR WAY?',
    'HOW OFTEN HAVE YOU FOUND THAT YOU COULD NOT COPE WITH ALL THE THINGS THAT YOU HAD TO DO?',
    'HOW OFTEN HAVE YOU BEEN ABLE TO CONTROL IRRITATIONS IN YOUR LIFE?',
    'HOW OFTEN HAVE YOU FELT THAT YOU WERE ON TOP OF THINGS?',
    'HOW OFTEN HAVE YOU BEEN ANGERED BECAUSE OF THINGS THAT WERE OUTSIDE OF YOUR CONTROL?',
    'HOW OFTEN HAVE YOU FELT DIFFICULTIES WERE PILING UP SO HIGH THAT YOU COULD NOT OVERCOME THEM?'
]

# Depression items.
# NOTE(review): the first two items read like depression screening questions
# and the last two like anxiety screening questions (possibly PHQ-4) — confirm
# that summing all four under the label "depression" is intended.
depression = [
    'HOW OFTEN HAVE YOU HAD LITTLE INTEREST OR PLEASURE IN DOING THINGS?',
    'HOW OFTEN HAVE YOU FELT DOWN, DEPRESSED OR HOPELESS?',
    'HOW OFTEN HAVE YOU FELT NERVOUS, ANXIOUS OR ON EDGE?',
    'HOW OFTEN HAVE YOU NOT BEEN ABLE TO STOP OR CONTROL WORRYING?'    
]

# Psychological-needs items.
# NOTE(review): appears to be three items each on relatedness, competence and
# autonomy — confirm against the questionnaire definition.
needs = [
    'I FELT A SENSE OF CONTACT WITH PEOPLE WHO CARE FOR ME, AND WHOM I CARE FOR',
    'I FELT CLOSE AND CONNECTED WITH OTHER PEOPLE WHO ARE IMPORTANT TO ME',
    'I FELT A STRONG SENSE OF INTIMACY WITH THE PEOPLE I SPENT TIME WITH',
    'I FELT THAT I WAS SUCCESSFULLY COMPLETING DIFFICULT TASKS AND PROJECTS',
    'I FELT THAT I WAS TAKING ON AND MASTERING HARD CHALLENGES',
    'I FELT VERY CAPABLE IN WHAT I DID',
    'I FELT THAT MY CHOICES WERE BASED ON MY TRUE INTERESTS AND VALUES',
    'I FELT FREE TO DO THINGS MY OWN WAY',
    'I FELT MY CHOICES EXPRESSED MY “TRUE SELF”'
]

In [None]:
# build data frames for each survey category
# Each frame keeps the two identifier columns plus the item columns of one questionnaire.

id_columns = ['USER_ID', 'WEEK_START']

df_survey_stress = df_survey.loc[:, id_columns + stress].copy()
df_survey_depression = df_survey.loc[:, id_columns + depression].copy()
df_survey_needs = df_survey.loc[:, id_columns + needs].copy()

In [None]:
# function to merge survey and cluster subsets and add number of weeks to include before survey week

def filter_survey_weeks(df_survey_subset, df_clusters_subset, n_weeks):
    """Pair each cluster week with the surveys of the same user inside a look-back window.

    Parameters
    ----------
    df_survey_subset : DataFrame with USER_ID, WEEK_START and the survey columns.
    df_clusters_subset : DataFrame with USER_ID, WEEK_START and the cluster columns.
    n_weeks : how many weeks before the cluster week a survey may lie
        (0 keeps only surveys from the cluster week itself).

    Returns
    -------
    DataFrame with one row per (cluster week, survey week) pair inside the
    window. WEEK_START holds the cluster week, SURVEY_WEEK the survey week.
    Neither input frame is modified.
    """
    surveys = df_survey_subset.copy()
    surveys['WEEK_START'] = pd.to_datetime(surveys['WEEK_START'])
    surveys = surveys.rename(columns={'WEEK_START': 'SURVEY_WEEK'})
    cluster_weeks = df_clusters_subset.rename(columns={'WEEK_START': 'CLUSTER_WEEK'})

    # every cluster week of a user is combined with every survey week of that user
    pairs = cluster_weeks.merge(surveys, on='USER_ID', how='inner')

    # keep surveys from the cluster week itself or from up to n_weeks before it
    earliest_allowed = pairs['CLUSTER_WEEK'] - pd.Timedelta(weeks=n_weeks)
    not_after_cluster_week = pairs['SURVEY_WEEK'] <= pairs['CLUSTER_WEEK']
    inside_lookback = pairs['SURVEY_WEEK'] >= earliest_allowed
    in_window = pairs[not_after_cluster_week & inside_lookback]

    return in_window.rename(columns={'CLUSTER_WEEK': 'WEEK_START'})

In [None]:
# merge survey and cluster subsets
# now we have cluster_labels and all survey question answers in one frame for each category
# The last argument of filter_survey_weeks is the look-back window in weeks.

df_stress = (
    filter_survey_weeks(df_survey_stress, df_clusters_stress, 3)
    .sort_values(by=['USER_ID', 'SURVEY_WEEK'], ascending=[True, False])
)
print(f"Number of entries in stress: {len(df_stress)}")

df_depression = (
    filter_survey_weeks(df_survey_depression, df_clusters_depression, 1)
    .sort_values(by=['USER_ID', 'SURVEY_WEEK'], ascending=[True, False])
)
print(f"Number of entries in depression: {len(df_depression)}")

df_needs = (
    filter_survey_weeks(df_survey_needs, df_clusters_needs, 0)
    .sort_values(by=['USER_ID', 'SURVEY_WEEK'], ascending=[True, False])
)
print(f"Number of entries in needs: {len(df_needs)}")

In [None]:
# find total score of survey question of category

def add_total_score(df, question_cols=None):
    """Return a copy of ``df`` with a ``total_score`` column (row-wise sum of the items).

    Parameters
    ----------
    df : DataFrame holding the survey item columns next to identifier,
        embedding and cluster-label columns.
    question_cols : optional iterable of column names to sum. When omitted,
        every column that is not a known non-question column is treated as a
        survey item.

    Returns
    -------
    A new DataFrame; the input frame is left untouched.

    Notes
    -----
    ``total_score`` itself is excluded from the sum, so calling the function
    again on its own result (for example when a notebook cell is re-run)
    leaves the score unchanged instead of adding the old total to the new one.
    """
    if question_cols is None:
        # columns to exclude from the sum
        # NOTE(review): only the UMAP embedding columns are listed here; if the
        # label files of another dimensionality-reduction method use different
        # column names, pass question_cols explicitly so they are not summed.
        non_questions = {
            'USER_ID', 'WEEK_START', 'SURVEY_WEEK',
            'UMAP_1', 'UMAP_2', 'cluster_label',
            'total_score',
        }
        # every other column is assumed to be a survey question
        question_cols = [c for c in df.columns if c not in non_questions]
    else:
        question_cols = list(question_cols)

    # row-wise sum
    df = df.copy()
    df['total_score'] = df[question_cols].sum(axis=1)
    return df

# attach the total score to the frame of every survey category
df_stress, df_depression, df_needs = (
    add_total_score(category_frame)
    for category_frame in (df_stress, df_depression, df_needs)
)

In [None]:
# function to create table with median and interquartile range of a variable per cluster

def make_summary_table(df, label, variable):
    """Print the per-cluster median and interquartile range of ``variable``.

    Parameters
    ----------
    df : DataFrame with a ``cluster_label`` column and the column ``variable``.
    label : title of the table; printed in upper case.
    variable : name of the numeric column to summarise.

    The table is printed and nothing is returned. Statistics are shown as
    whole numbers (fractions are cut off by ``int``). Missing values in
    ``variable`` are skipped for the median and for both quartiles; a cluster
    without any valid value is shown as "NA" instead of raising.
    """
    def _whole(value):
        # int() cannot convert NaN, so missing statistics get a placeholder
        return "NA" if pd.isna(value) else str(int(value))

    # Series.quantile skips NaN just like 'median' does; np.percentile would
    # return NaN for the whole group as soon as one value is missing.
    summary = df.groupby('cluster_label')[variable].agg(
        median_score='median',
        q1=lambda x: x.quantile(0.25),
        q3=lambda x: x.quantile(0.75)
    ).reset_index()

    summary['Interquartile range'] = [
        f"{_whole(q1)}–{_whole(q3)}"
        for q1, q3 in zip(summary['q1'], summary['q3'])
    ]
    summary['Median score'] = summary['median_score'].map(_whole)

    final = summary[['cluster_label', 'Median score', 'Interquartile range']].copy()
    final.columns = ['Cluster', f'Median {variable} Score', 'Interquartile range']

    print(f"--- {label.upper()} Summary Table ---")
    print(final.to_string(index=False))

In [None]:
# show summary tables

for category_frame, category_title in (
    (df_stress, "Stress"),
    (df_depression, "Depression"),
    (df_needs, "Needs"),
):
    make_summary_table(category_frame, category_title, 'total_score')

In [None]:
# create data frames of demographics and cluster labels of survey category
# NOTE(review): the cluster frames are loaded with a WEEK_START column, so these
# merges presumably yield one row per user and week; users with more weeks
# would then weigh more in the summaries below — confirm this is intended.
df_demo_cluster_stress = pd.merge(df_demographics, df_clusters_stress, on='USER_ID', how="inner")
df_demo_cluster_depression = pd.merge(df_demographics, df_clusters_depression, on='USER_ID', how="inner")
df_demo_cluster_needs = pd.merge(df_demographics, df_clusters_needs, on='USER_ID', how="inner")

# (table label, gender headline, merged frame) for every survey category
demo_cluster_frames = (
    ("Stress", "==== Stress ====", df_demo_cluster_stress),
    ("Depression", "==== Depression ====", df_demo_cluster_depression),
    ("Needs", "==== Needs ====", df_demo_cluster_needs),
)

# show summary tables of demographics
for demo_variable in ('AGE', 'BMI'):
    for table_label, _, demo_frame in demo_cluster_frames:
        make_summary_table(demo_frame, table_label, demo_variable)

# gender summary table
for _, headline, demo_frame in demo_cluster_frames:
    gender_counts = demo_frame.groupby(['cluster_label', 'GENDER']).size().unstack(fill_value=0)
    # share of each gender within a cluster (every row sums to 1)
    gender_ratios = gender_counts.div(gender_counts.sum(axis=1), axis=0)
    print(headline)
    print("Gender ratios per cluster:")
    print(gender_ratios.round(3))

In [None]:
# function to check feature correlation with survey total scores and adjust for confounding variables

def feature_correlation(df_category, df_features, df_demographics, category,
                        confounders=('AGE', 'GENDER', 'BMI'),
                        categorical_confounders=('GENDER',)):
    """Print confounder-adjusted Pearson correlations of every feature with the total score.

    For each cluster, the total score and each feature are regressed on the
    confounders with OLS. The two sets of residuals are then correlated with
    Pearson's r.

    Parameters
    ----------
    df_category : DataFrame with USER_ID, WEEK_START, cluster_label and total_score.
    df_features : DataFrame with USER_ID, WEEK_START and one column per feature.
    df_demographics : DataFrame with USER_ID and the confounder columns.
    category : name printed in the headline.
    confounders : columns of ``df_demographics`` to adjust for. The regression
        formulas are built from this argument.
    categorical_confounders : confounders that are wrapped in ``C(...)`` and
        therefore treated as categorical.

    Prints one table per cluster and returns nothing. ``r_adj`` and ``p_adj``
    are adjusted for the confounders only; the p-values are not corrected for
    multiple comparisons.
    """
    confounders = list(confounders)
    categorical = set(categorical_confounders)

    id_cols = ['USER_ID', 'WEEK_START']
    df_category = df_category[id_cols + ['cluster_label', 'total_score']]

    # only keep confounders in df_demographics
    df_demographics = df_demographics[['USER_ID'] + confounders]

    # merge survey category and demographics
    # (validate='many_to_many' performs no uniqueness check; kept as in the original call)
    df_category = df_category.merge(df_demographics, on='USER_ID', how='left', validate='many_to_many')

    # merge everything into one data frame
    df = df_category.merge(df_features, on=id_cols, how='inner', validate='many_to_many')

    exclude_cols = set(id_cols + ['cluster_label', 'total_score'] + confounders)
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    # Right-hand side shared by all regressions. Q(...) quotes the column name
    # so that names which are not valid Python identifiers still work.
    rhs_terms = [
        f"C(Q({name!r}))" if name in categorical else f"Q({name!r})"
        for name in confounders
    ]
    rhs = " + ".join(rhs_terms) if rhs_terms else "1"

    def _residuals(column, data):
        # part of `column` that the confounders do not explain
        return smf.ols(f"Q({column!r}) ~ {rhs}", data=data).fit().resid

    clusters = sorted(df['cluster_label'].unique())
    results = {}

    for cluster in clusters:
        df_cluster = df[df['cluster_label'] == cluster].dropna()

        # pearsonr needs at least two observations; report NaN instead of
        # failing when a cluster has fewer complete rows
        if len(df_cluster) < 2:
            results[cluster] = pd.DataFrame([
                {'Variable': feature, 'r_adj': np.nan, 'p_adj': np.nan}
                for feature in feature_cols
            ])
            continue

        # regress out confounding variables from total score and create residuals
        resid_t = _residuals('total_score', df_cluster)

        corrs = []

        for feature in feature_cols:
            # regress out confounding variables from features and create residuals
            resid_f = _residuals(feature, df_cluster)

            try:
                r, p = pearsonr(resid_t, resid_f)
            except ValueError:
                # only input-validation errors are mapped to NaN; anything
                # else is a real problem and should surface
                r, p = np.nan, np.nan

            corrs.append({
                'Variable': feature,
                'r_adj': round(r, 4),
                'p_adj': p
            })

        results[cluster] = pd.DataFrame(corrs)

    print(f"=== {category} (adjusted) ===")

    for cluster_id, df_corr in results.items():
        print(f"--- Pearson Correlations for Cluster {cluster_id} ---")
        print(df_corr.to_string(index=False))

In [None]:
# load features for each category

def _read_feature_table(csv_path):
    """Read a weekly feature CSV, drop rows with missing values and parse WEEK_START."""
    table = pd.read_csv(csv_path).dropna()
    table['WEEK_START'] = pd.to_datetime(table['WEEK_START'])
    return table


df_features_stress = _read_feature_table(f"working_data/mhs_sleep_weekly_uncorr_features_correlation_threshold_{correlation_threshold}_{group}_stress.csv")
df_features_depression = _read_feature_table(f"working_data/mhs_sleep_weekly_uncorr_features_correlation_threshold_{correlation_threshold}_{group}_depression.csv")
df_features_needs = _read_feature_table(f"working_data/mhs_sleep_weekly_uncorr_features_correlation_threshold_{correlation_threshold}_{group}_needs.csv")

# adjusted feature/score correlations, one report per survey category
for score_frame, feature_frame, category_title in (
    (df_stress, df_features_stress, "Stress"),
    (df_depression, df_features_depression, "Depression"),
    (df_needs, df_features_needs, "Needs"),
):
    feature_correlation(score_frame, feature_frame, df_demographics, category_title)