In [None]:
import os
from datetime import datetime

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap.umap_ as umap
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Absolute pairwise correlation at or above which two features are treated as
# redundant. Passed to process_dataframe and embedded in the output CSV file names.

threshold = 0.8

In [None]:
# filter features for high correlation and low variance
def process_dataframe(df_input, group_name, threshold=0.8, variance_threshold=0.01):
    """Remove redundant (highly correlated) and low-variance feature columns.

    Steps, in order:
      1. Compute the absolute pairwise correlation matrix of every column
         except ``USER_ID`` / ``WEEK_START`` and save it as a heatmap PNG in
         ``clustering_plots/``.
      2. Print all feature pairs whose absolute correlation is >= ``threshold``.
      3. Greedily drop one feature of each such pair.
      4. Fit scikit-learn's ``VarianceThreshold(threshold=variance_threshold)``
         on the remaining features and drop the ones it does not support.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Feature table; ``USER_ID`` and ``WEEK_START`` are treated as
        identifiers if present, every other column as a feature.
    group_name : str
        Label used in the plot title, the PNG file name and the printed output.
    threshold : float, default 0.8
        Absolute correlation at or above which two features count as redundant.
    variance_threshold : float, default 0.01
        Threshold handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``df_input`` without the dropped feature columns; identifier columns
        are kept. ``df_input`` itself is not modified.
    """
    exclude_cols = ['USER_ID', 'WEEK_START']

    # drop non-feature columns
    correlation_df = df_input.drop(columns=exclude_cols, errors='ignore')

    # correlation matrix (absolute values: the sign does not matter for redundancy)
    corr_matrix = correlation_df.corr().abs()

    # heatmap; make sure the target folder exists so savefig cannot fail on it
    os.makedirs("clustering_plots", exist_ok=True)
    plt.figure(figsize=(18, 14))
    sns.heatmap(corr_matrix, cmap='coolwarm', center=0, square=True, linewidths=0.5)
    plt.title(f'Correlation Heatmap of Weekly Variances ({group_name})')
    plt.tight_layout()
    plt.savefig(f"clustering_plots/correlation_matrix_with_variance_threshold_{threshold}_{group_name}.png", dpi=300)
    plt.close()

    # upper triangle matrix: each unordered pair appears once, at (earlier row, later column)
    upper_triangle_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper_triangle = corr_matrix.where(upper_triangle_mask)

    # highly correlated pairs
    high_corr_pairs = (
        upper_triangle.stack()
        .reset_index()
        .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
        .query(f"Correlation >= {threshold}")
        .sort_values(by="Correlation", ascending=False)
    )

    print(f"High correlation pairs for {group_name}:")
    print(high_corr_pairs)

    # drop one variable of each correlated pair
    # A column's partners in the upper triangle are always EARLIER columns, so
    # the columns are walked from last to first. Only in that order can a column
    # already be marked for removal when it is reached, which lets the skip
    # below prevent chain removals (A~B and B~C must not drop both A and B).
    # For an isolated pair the earlier feature is dropped and the later one kept.
    to_drop = set()
    for col in upper_triangle.columns[::-1]:
        # skip if this column is already marked for removal
        if col in to_drop:
            continue
        # find partners that are highly correlated with col
        partners = upper_triangle.index[upper_triangle[col] >= threshold]
        to_drop.update(partners)

    df_reduced = df_input.drop(columns=list(to_drop), errors='ignore')

    # variance threshold (fitted on feature columns only)
    df_filtered = df_reduced.drop(columns=exclude_cols, errors='ignore')

    selector = VarianceThreshold(threshold=variance_threshold)
    selector.fit(df_filtered)

    low_variance_features = df_filtered.columns[~selector.get_support()]

    print(f"Low variance features for {group_name}:")
    print(low_variance_features.tolist())

    # drop low variance features
    df_reduced = df_reduced.drop(columns=low_variance_features, errors='ignore')

    # note: this count includes the identifier columns that are still present
    print(f"Final number of features for {group_name}: {len(df_reduced.columns)}")

    return df_reduced

In [None]:
# weekly sleep features, one row per user and week
df = pd.read_csv("working_data/mhs_sleep_weekly_features.csv")
print(df.shape[1])

# attach each user's gender; the inner join keeps only users found in both files
df_gender = pd.read_csv("working_data/demographics_with_age.csv", usecols=['USER_ID', 'GENDER'])
df_gender = df.merge(df_gender, on='USER_ID', how='inner')

# gender-specific subsets, without the helper column
df_m = df_gender.loc[df_gender['GENDER'] == 'male'].drop(columns=['GENDER'])
df_f = df_gender.loc[df_gender['GENDER'] == 'female'].drop(columns=['GENDER'])

In [None]:
# correlation / variance filtering on the full cohort
df_all_processed = process_dataframe(df, group_name="all", threshold=threshold)
#df_male_processed = process_dataframe(df_m, "male", threshold)
#df_female_processed = process_dataframe(df_f, "female", threshold)

In [None]:
# compute sum of survey questions
def add_total_score(df):
    """Return a copy of ``df`` with an added ``total_score`` column.

    ``total_score`` is the row-wise sum over every column that is not one of
    ``USER_ID``, ``WEEK_START`` or ``SURVEY_WEEK``; all of those remaining
    columns are treated as survey questions. The input frame is left untouched.
    """
    metadata_cols = ['USER_ID', 'WEEK_START', 'SURVEY_WEEK']

    # everything that is not metadata counts as a survey answer
    answer_cols = df.columns[~df.columns.isin(metadata_cols)]

    scored = df.copy()
    scored['total_score'] = scored[answer_cols].sum(axis=1)
    return scored

def filter_survey_weeks(df_survey_subset, df_features, n_weeks):
    """Pair each feature week with the surveys from a trailing window of weeks.

    Both frames are joined on ``USER_ID``; a (feature week, survey week) pair is
    kept when the survey week lies in
    ``[feature week - n_weeks weeks, feature week]`` (both ends inclusive).

    Parameters
    ----------
    df_survey_subset : pandas.DataFrame
        Survey rows with ``USER_ID`` and ``WEEK_START`` columns.
    df_features : pandas.DataFrame
        Feature rows with ``USER_ID`` and ``WEEK_START`` columns.
    n_weeks : int
        How many weeks before the feature week a survey may lie; ``0`` keeps
        only surveys from the same week.

    Returns
    -------
    pandas.DataFrame
        The feature columns followed by the survey columns. ``WEEK_START`` is
        the feature week and ``SURVEY_WEEK`` the matched survey week, both as
        datetimes. A feature week appears once per matching survey.

    Neither input frame is modified.
    """
    # assign() returns new frames, so the callers' data (in particular the
    # WEEK_START column of df_features) is never overwritten
    surveys = df_survey_subset.assign(
        WEEK_START=pd.to_datetime(df_survey_subset['WEEK_START'])
    ).rename(columns={'WEEK_START': 'SURVEY_WEEK'})
    features = df_features.assign(
        WEEK_START=pd.to_datetime(df_features['WEEK_START'])
    ).rename(columns={'WEEK_START': 'FEATURE_WEEK'})

    # merge on USER_ID to pair every survey week with every feature week of a user
    df_merged = features.merge(surveys, on='USER_ID', how='inner')

    # keep surveys from the feature week itself or up to n_weeks weeks before it
    window_start = df_merged['FEATURE_WEEK'] - pd.Timedelta(weeks=n_weeks)
    in_window = (
        (df_merged['SURVEY_WEEK'] <= df_merged['FEATURE_WEEK'])
        & (df_merged['SURVEY_WEEK'] >= window_start)
    )

    return df_merged[in_window].rename(columns={'FEATURE_WEEK': 'WEEK_START'})

In [None]:
# Survey question texts, used verbatim as column names of the survey table.
# Each list defines the items that are summed into one category score.

# Items of the stress score.
# NOTE(review): the wording resembles the 10-item Perceived Stress Scale, where
# the positively worded items are usually reverse-scored before summing.
# The items are summed as-is in this notebook; confirm that any reverse
# coding has already been applied to the input file.
stress = [
    'HOW OFTEN HAVE YOU BEEN UPSET BECAUSE OF SOMETHING THAT HAPPENED UNEXPECTEDLY?',
    'HOW OFTEN HAVE YOU FELT THAT YOU WERE UNABLE TO CONTROL THE IMPORTANT THINGS IN YOUR LIFE?',
    'HOW OFTEN HAVE YOU FELT NERVOUS AND STRESSED?',
    'HOW OFTEN HAVE YOU FELT CONFIDENT ABOUT YOUR ABILITY TO HANDLE YOUR PERSONAL PROBLEMS?',
    'HOW OFTEN HAVE YOU FELT THAT THINGS WERE GOING YOUR WAY?',
    'HOW OFTEN HAVE YOU FOUND THAT YOU COULD NOT COPE WITH ALL THE THINGS THAT YOU HAD TO DO?',
    'HOW OFTEN HAVE YOU BEEN ABLE TO CONTROL IRRITATIONS IN YOUR LIFE?',
    'HOW OFTEN HAVE YOU FELT THAT YOU WERE ON TOP OF THINGS?',
    'HOW OFTEN HAVE YOU BEEN ANGERED BECAUSE OF THINGS THAT WERE OUTSIDE OF YOUR CONTROL?',
    'HOW OFTEN HAVE YOU FELT DIFFICULTIES WERE PILING UP SO HIGH THAT YOU COULD NOT OVERCOME THEM?'
]

# Items of the depression score.
# NOTE(review): the last two items ask about anxiety / worrying rather than
# depressed mood; confirm that grouping them under "depression" is intended.
depression = [
    'HOW OFTEN HAVE YOU HAD LITTLE INTEREST OR PLEASURE IN DOING THINGS?',
    'HOW OFTEN HAVE YOU FELT DOWN, DEPRESSED OR HOPELESS?',
    'HOW OFTEN HAVE YOU FELT NERVOUS, ANXIOUS OR ON EDGE?',
    'HOW OFTEN HAVE YOU NOT BEEN ABLE TO STOP OR CONTROL WORRYING?'    
]

# Items of the needs score (presumably relatedness, competence and autonomy
# items, three each; verify against the questionnaire).
# The curly quotes in the last item are part of the column name; keep them.
needs = [
    'I FELT A SENSE OF CONTACT WITH PEOPLE WHO CARE FOR ME, AND WHOM I CARE FOR',
    'I FELT CLOSE AND CONNECTED WITH OTHER PEOPLE WHO ARE IMPORTANT TO ME',
    'I FELT A STRONG SENSE OF INTIMACY WITH THE PEOPLE I SPENT TIME WITH',
    'I FELT THAT I WAS SUCCESSFULLY COMPLETING DIFFICULT TASKS AND PROJECTS',
    'I FELT THAT I WAS TAKING ON AND MASTERING HARD CHALLENGES',
    'I FELT VERY CAPABLE IN WHAT I DID',
    'I FELT THAT MY CHOICES WERE BASED ON MY TRUE INTERESTS AND VALUES',
    'I FELT FREE TO DO THINGS MY OWN WAY',
    'I FELT MY CHOICES EXPRESSED MY “TRUE SELF”'
]

In [None]:
# create total score of survey by category

df_survey = pd.read_csv("working_data/mhs_survey_sorted_without_nan.csv")

df_survey['SUBMITDATE'] = pd.to_datetime(df_survey['SUBMITDATE'])
# Monday of the submission week, at midnight. normalize() strips a time of day
# that SUBMITDATE may carry (TODO confirm against the file); without it
# WEEK_START would inherit that time, and the inclusive <= / >= week comparison
# in filter_survey_weeks would misplace surveys at the edges of the window.
df_survey['WEEK_START'] = (
    df_survey['SUBMITDATE'] - pd.to_timedelta(df_survey['SUBMITDATE'].dt.weekday, unit='D')
).dt.normalize()
df_survey = df_survey.drop(columns=["SUBMITDATE"])


def _windowed_total_score(df_category, df_features, n_weeks):
    """Sum one category's survey items and pair the totals with feature weeks.

    ``df_category`` holds ``USER_ID``, ``WEEK_START`` and the item columns of a
    single survey category. The items are summed into ``total_score``, and each
    feature week in ``df_features`` is paired with the surveys from that week or
    up to ``n_weeks`` weeks earlier. Rows are ordered by user and then by survey
    week, newest first; the survey week column itself is dropped at the end.
    """
    df_scored = add_total_score(df_category)[['USER_ID', 'WEEK_START', 'total_score']]
    return (
        filter_survey_weeks(df_scored, df_features, n_weeks)
        .sort_values(by=['USER_ID', 'SURVEY_WEEK'], ascending=[True, False])
        .drop(columns=["SURVEY_WEEK"])
    )


# stress: surveys from the feature week and up to 3 weeks before it
columns_to_keep = ['USER_ID', 'WEEK_START'] + stress
df_survey_stress = df_survey[columns_to_keep].copy()
df_stress = _windowed_total_score(df_survey_stress, df_all_processed, n_weeks=3)
print(f"Number of entries in stress: {len(df_stress)}")

# depression: surveys from the feature week and up to 1 week before it
columns_to_keep = ['USER_ID', 'WEEK_START'] + depression
df_survey_depression = df_survey[columns_to_keep].copy()
df_depression = _windowed_total_score(df_survey_depression, df_all_processed, n_weeks=1)
print(f"Number of entries in depression: {len(df_depression)}")

# needs: surveys from the feature week only
columns_to_keep = ['USER_ID', 'WEEK_START'] + needs
df_survey_needs = df_survey[columns_to_keep].copy()
df_needs = _windowed_total_score(df_survey_needs, df_all_processed, n_weeks=0)
print(f"Number of entries in needs: {len(df_needs)}")

In [None]:
def filter_features_by_survey_correlation(df, threshold=0.1):
    """Keep only the features whose correlation with ``total_score`` is strong enough.

    Every column other than ``USER_ID``, ``WEEK_START`` and ``total_score`` is
    treated as a feature. The absolute correlation of each feature with
    ``total_score`` is printed (strongest first), and the returned frame holds
    the two identifier columns plus the features whose absolute correlation is
    strictly greater than ``threshold``, in their original column order.
    """
    key_columns = ['USER_ID', 'WEEK_START']
    reserved = set(key_columns) | {'total_score'}

    # candidate features: everything that is neither an identifier nor the score
    candidates = [name for name in df.columns if name not in reserved]

    # strength of the linear relationship, ignoring its direction
    strength = df[candidates].corrwith(df['total_score']).abs()

    print("Correlation with total_score for each feature:")
    ranked = strength.sort_values(ascending=False)
    for name, value in ranked.items():
        print(f"{name:<40}: {value:.3f}")

    # features that clear the threshold
    selected = strength[strength > threshold].index.tolist()

    return df[key_columns + selected]

In [None]:
# stress
# select the features that correlate with the stress total score and export them
survey_threshold = 0.05

df_selected = filter_features_by_survey_correlation(df_stress, threshold=survey_threshold)

# keep selected columns in features dataset (all rows of the processed features)
selected_cols = df_selected.columns.tolist()
df_all_filtered = df_all_processed[selected_cols].copy()
print(len(df_all_filtered.columns), "columns kept:")
print(df_all_filtered.columns.tolist())

# number of selected features, i.e. the columns without USER_ID and WEEK_START
# (len() of the frame itself would be the row count, not the column count)
print(len(df_all_filtered.columns) - 2)

group_name = "all"

df_all_filtered.to_csv(f"working_data/mhs_sleep_weekly_uncorr_features_correlation_threshold_{threshold}_{group_name}_stress.csv", index=False)

In [None]:
# depression
# select the features that correlate with the depression total score and export them
survey_threshold = 0.05

df_selected = filter_features_by_survey_correlation(df_depression, threshold=survey_threshold)

# keep selected columns in features dataset (all rows of the processed features)
selected_cols = df_selected.columns.tolist()
df_all_filtered = df_all_processed[selected_cols].copy()
print(len(df_all_filtered.columns), "columns kept:")
print(df_all_filtered.columns.tolist())

# number of selected features, i.e. the columns without USER_ID and WEEK_START
# (len() of the frame itself would be the row count, not the column count)
print(len(df_all_filtered.columns) - 2)

group_name = "all"

df_all_filtered.to_csv(f"working_data/mhs_sleep_weekly_uncorr_features_correlation_threshold_{threshold}_{group_name}_depression.csv", index=False)

In [None]:
# needs
# select the features that correlate with the needs total score and export them
survey_threshold = 0.05

df_selected = filter_features_by_survey_correlation(df_needs, threshold=survey_threshold)

# keep selected columns in features dataset (all rows of the processed features)
selected_cols = df_selected.columns.tolist()
df_all_filtered = df_all_processed[selected_cols].copy()
print(len(df_all_filtered.columns), "columns kept:")
print(df_all_filtered.columns.tolist())

# number of selected features, i.e. the columns without USER_ID and WEEK_START
# (len() of the frame itself would be the row count, not the column count)
print(len(df_all_filtered.columns) - 2)

group_name = "all"

df_all_filtered.to_csv(f"working_data/mhs_sleep_weekly_uncorr_features_correlation_threshold_{threshold}_{group_name}_needs.csv", index=False)