In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import pickle
import os
from datetime import datetime
from pathlib import Path

# Treat the directory two levels above the current working directory as the
# project root and put it at the front of sys.path before the `analysis.*`
# imports that follow.
# NOTE(review): this depends on the kernel's working directory — presumably the
# notebook's own folder; confirm before running from elsewhere.
PROJECT_ROOT = Path.cwd().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from analysis.load_data import load_dataset
from analysis.utils.preprocessing import prepare_clustering_features, compute_gower_distance
from analysis.utils.visualization import plot_cluster_distribution
from analysis.clustering.clustering_utils import run_hdbscan_clustering, evaluate_clustering
from analysis.config.model_config import CLUSTERING_CONFIG

In [2]:
# Load the tech survey extract, then report its row count and column names
tech_survey_df = load_dataset('tech_survey')
print(
    f"Loaded {len(tech_survey_df)} survey responses",
    f"Columns: {list(tech_survey_df.columns)}",
    sep="\n",
)
tech_survey_df.head(5)

INFO:botocore.credentials:Found credentials in environment variables.
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.16.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
  df = pd.read_sql(query, conn)
INFO:analysis.load_data:Loaded 1227 rows from tech_survey_extract.sql


Loaded 1227 survey responses
Columns: ['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed', 'family_history', 'treatment', 'work_interfere', 'no_employees', 'remote_work', 'tech_company', 'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave', 'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview', 'mental_vs_physical', 'obs_consequence', 'survey_date']


Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,survey_date
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,2014-08-27 11:29:31
1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,2014-08-27 11:29:37
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,2014-08-27 11:29:44
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,2014-08-27 11:29:46
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,2014-08-27 11:30:22


In [3]:
# EDA: dtypes, null counts, and number of distinct values for every column
print("Data types:", tech_survey_df.dtypes, sep="\n")
print("\nMissing values:", tech_survey_df.isnull().sum(), sep="\n")
print("\nUnique values per column:")
# Iterating a DataFrame yields its column labels
for col in tech_survey_df:
    distinct_count = tech_survey_df[col].nunique()
    print(f"{col}: {distinct_count} unique values")

Data types:
Timestamp                    object
Age                           int64
Gender                       object
Country                      object
state                        object
self_employed                object
family_history               object
treatment                    object
work_interfere               object
no_employees                 object
remote_work                  object
tech_company                 object
benefits                     object
care_options                 object
wellness_program             object
seek_help                    object
anonymity                    object
leave                        object
mental_health_consequence    object
phys_health_consequence      object
coworkers                    object
supervisor                   object
mental_health_interview      object
phys_health_interview        object
mental_vs_physical           object
obs_consequence              object
survey_date                  object
dtype: object

M

In [3]:
# Choose the clustering features: take the column names listed in the
# preprocessing config and keep only those that exist in the loaded survey.
config = CLUSTERING_CONFIG['preprocessing']
categorical_cols, numeric_cols = (
    [name for name in config[key] if name in tech_survey_df.columns]
    for key in ('categorical_columns', 'numeric_columns')
)

print(
    f"Categorical features: {categorical_cols}",
    f"Numeric features: {numeric_cols}",
    f"Total features for clustering: {len(categorical_cols) + len(numeric_cols)}",
    sep="\n",
)

Categorical features: ['Gender', 'Country', 'treatment', 'work_interfere', 'family_history', 'remote_work', 'mental_health_consequence', 'benefits']
Numeric features: ['Age']
Total features for clustering: 9


In [4]:
# Build the clustering feature matrix; the helper also hands back the fitted
# label encoders and scaler, which are persisted with the model later on.
features_df, label_encoders, scaler = prepare_clustering_features(
    tech_survey_df, categorical_cols, numeric_cols
)

print(f"Features prepared: {features_df.shape}")
print("Sample after preprocessing:", features_df.head(), sep="\n")

Features prepared: (968, 9)
Sample after preprocessing:
   Gender  Country  treatment  work_interfere  family_history  remote_work  \
0       0       37          1               1               0            0   
1       1       37          0               2               0            0   
2       1        6          0               2               0            0   
3       1       36          1               1               1            0   
4       1       37          0               0               0            1   

   mental_health_consequence  benefits       Age  
0                          1         2  0.626188  
1                          0         0  1.574144  
2                          1         1 -0.050923  
3                          2         1 -0.186346  
4                          1         2 -0.186346  


In [5]:
# Compute Gower distance
# Look up each categorical column's real position in features_df instead of
# assuming the categorical columns occupy the first len(categorical_cols)
# slots. If a column is missing this raises KeyError rather than silently
# flagging the wrong columns as categorical.
categorical_indices = [features_df.columns.get_loc(col) for col in categorical_cols]
distance_matrix = compute_gower_distance(features_df, categorical_indices)
# NOTE(review): the cast is presumably needed because the downstream clusterer
# expects double precision for a precomputed matrix — confirm.
distance_matrix = distance_matrix.astype(np.float64)

print(f"Distance matrix shape: {distance_matrix.shape}")
print("Distance matrix sample (first 5x5):")
print(distance_matrix[:5, :5])

Distance matrix shape: (968, 968)
Distance matrix sample (first 5x5):
[[0.         0.56995887 0.56584364 0.56790125 0.45679012]
 [0.56995887 0.         0.35802469 0.69341564 0.4711934 ]
 [0.56584364 0.35802469 0.         0.55761319 0.44650206]
 [0.56790125 0.69341564 0.55761319 0.         0.77777779]
 [0.45679012 0.4711934  0.44650206 0.77777779 0.        ]]


In [6]:
# Run HDBSCAN on the precomputed distance matrix using the configured sizes
clustering_config = CLUSTERING_CONFIG['hdbscan']
clusterer, cluster_labels = run_hdbscan_clustering(
    distance_matrix,
    min_cluster_size=clustering_config['min_cluster_size'],
    min_samples=clustering_config['min_samples'],
)

# Summarise the labelling; label -1 marks noise and is not counted as a cluster
unique, counts = np.unique(cluster_labels, return_counts=True)
n_clusters = sum(1 for cluster_label in unique if cluster_label != -1)
n_noise = int(np.count_nonzero(np.asarray(cluster_labels) == -1))

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print("Cluster labels distribution:")
for label, count in zip(unique, counts):
    print(f"  Cluster {label}: {count} points")

Number of clusters: 3
Number of noise points: 54
Cluster labels distribution:
  Cluster -1: 54 points
  Cluster 0: 31 points
  Cluster 1: 852 points
  Cluster 2: 31 points


In [7]:
# Evaluate clustering quality
# Bind silhouette_avg up front so later cells that read it (the save cell) do
# not hit a NameError, or pick up a stale value from a previous run, when there
# are too few clusters to score.
silhouette_avg = None
if n_clusters >= 2:
    silhouette_avg = evaluate_clustering(distance_matrix, cluster_labels)
    print(f"Average silhouette score: {silhouette_avg:.3f}")

    # Per-cluster persistence values exposed by the fitted clusterer
    print(f"Cluster persistence scores: {clusterer.cluster_persistence_}")
else:
    print("Insufficient clusters for evaluation")

Average silhouette score: 0.089
Cluster persistence scores: [0.         0.38620409 0.        ]


In [8]:
# Recover the untransformed survey rows that went into clustering (selected by
# features_df's index) and attach each row's cluster label as a new column.
original_clustered = tech_survey_df.loc[features_df.index].assign(cluster=cluster_labels)

In [9]:
# Analyze cluster characteristics

# (label printed in the report, column name in original_clustered)
PROFILE_FIELDS = [
    ("Gender", "Gender"),
    ("Country", "Country"),
    ("Currently in treatment", "treatment"),
    ("Work interference", "work_interfere"),
    ("Family history", "family_history"),
    ("Remote work", "remote_work"),
    ("Mental health consequence", "mental_health_consequence"),
    ("Benefits offered", "benefits"),
]


def _mode_share(values):
    """Return the most frequent value of a Series and its share of rows.

    Parameters
    ----------
    values : pandas.Series
        Column restricted to one cluster.

    Returns
    -------
    tuple
        ``(top_value, percent)``, where ``percent`` is the number of rows equal
        to ``top_value`` divided by the total number of rows (nulls included in
        the denominator) times 100. Returns ``(None, 0.0)`` when the Series has
        no mode (e.g. every entry is null) instead of raising.
    """
    modes = values.mode()
    if modes.empty:
        return None, 0.0
    top_value = modes.iloc[0]
    return top_value, (values == top_value).sum() / len(values) * 100


print("CLUSTER CHARACTERISTICS")
print("="*60)

n_total = len(original_clustered)
for cluster_id in sorted(original_clustered['cluster'].unique()):
    cluster_data = original_clustered[original_clustered['cluster'] == cluster_id]
    cluster_share = len(cluster_data) / n_total * 100

    # Label -1 is the noise bucket: report its size only, no profile
    if cluster_id == -1:
        print(f"\nNoise Points: {len(cluster_data)} samples ({cluster_share:.1f}%)")
        continue

    print(f"\n{'='*60}")
    print(f"Cluster {cluster_id}: {len(cluster_data)} samples ({cluster_share:.1f}%)")
    print(f"{'='*60}")

    # Show characteristics with actual (decoded) labels
    for field_label, column in PROFILE_FIELDS:
        top_value, percent = _mode_share(cluster_data[column])
        print(f"  {field_label}: {top_value} ({percent:.0f}%)")
    print(f"  Average Age: {cluster_data['Age'].mean():.1f} years")

CLUSTER CHARACTERISTICS

Noise Points: 54 samples (5.6%)

Cluster 0: 31 samples (3.2%)
  Gender: Male (68%)
  Country: Canada (16%)
  Currently in treatment: Yes (65%)
  Work interference: Often (39%)
  Family history: Yes (55%)
  Remote work: Yes (55%)
  Mental health consequence: No (39%)
  Benefits offered: No (55%)
  Average Age: 29.8 years

Cluster 1: 852 samples (88.0%)
  Gender: Male (82%)
  Country: United States (68%)
  Currently in treatment: Yes (65%)
  Work interference: Sometimes (49%)
  Family history: No (55%)
  Remote work: No (72%)
  Mental health consequence: Maybe (38%)
  Benefits offered: Yes (44%)
  Average Age: 32.5 years

Cluster 2: 31 samples (3.2%)
  Gender: Male (52%)
  Country: United States (19%)
  Currently in treatment: Yes (52%)
  Work interference: Sometimes (35%)
  Family history: No (74%)
  Remote work: No (68%)
  Mental health consequence: Maybe (39%)
  Benefits offered: Don't know (42%)
  Average Age: 32.0 years


In [13]:
# Save clustering results

# Anchor on PROJECT_ROOT (set in the imports cell) so the output location does
# not move if the kernel's working directory has changed since startup.
models_dir = PROJECT_ROOT / 'analysis' / 'models' / 'saved_models' / 'clustering'
models_dir.mkdir(parents=True, exist_ok=True)

# Take the date stamp once so all three files share it even if the cell runs
# across midnight. Re-running on the same day overwrites that day's files.
run_stamp = datetime.now().strftime('%Y%m%d')

# Save the trained clusterer
model_filename = models_dir / f"hdbscan_clusterer_{run_stamp}.pkl"
with open(model_filename, 'wb') as f:
    pickle.dump(clusterer, f)

# Save preprocessing artifacts
preprocessing_artifacts = {
    'label_encoders': label_encoders,
    'scaler': scaler,
    'categorical_cols': categorical_cols,
    'numeric_cols': numeric_cols,
    'feature_columns': list(features_df.columns)
}

artifacts_filename = models_dir / f"clustering_preprocessing_{run_stamp}.pkl"
with open(artifacts_filename, 'wb') as f:
    pickle.dump(preprocessing_artifacts, f)

# Save cluster results and metadata
cluster_results = {
    'labels': cluster_labels,
    'indices': features_df.index.tolist(),
    'n_clusters': n_clusters,
    'n_noise': n_noise,
    # The silhouette score is only computed when there are at least two
    # clusters; the conditional avoids touching an unbound or stale name.
    'silhouette_score': silhouette_avg if n_clusters >= 2 else None,
    # Record the parameters actually passed to HDBSCAN rather than literals
    # that can drift from the config file.
    'config': {
        'min_cluster_size': clustering_config['min_cluster_size'],
        'min_samples': clustering_config['min_samples']
    },
    # NOTE(review): these names were written by hand against the cluster ids of
    # one particular run; re-check them whenever the data or config changes.
    'cluster_profiles': {
        0: "Remote Workers with Severe Impact",
        1: "Mainstream Tech Workers", 
        2: "Uninformed/Uncertain Group"
    }
}

results_filename = models_dir / f"cluster_results_{run_stamp}.pkl"
with open(results_filename, 'wb') as f:
    pickle.dump(cluster_results, f)

print(f"Model saved to: {model_filename}")
print(f"Preprocessing artifacts saved to: {artifacts_filename}")
print(f"Cluster results saved to: {results_filename}")
print("\nTotal files saved: 3")
print(f"Location: {models_dir}")

Model saved to: /Users/Andrew/Desktop/Computer Science/Mental_Health_Project/analysis/models/saved_models/clustering/hdbscan_clusterer_20251027.pkl
Preprocessing artifacts saved to: /Users/Andrew/Desktop/Computer Science/Mental_Health_Project/analysis/models/saved_models/clustering/clustering_preprocessing_20251027.pkl
Cluster results saved to: /Users/Andrew/Desktop/Computer Science/Mental_Health_Project/analysis/models/saved_models/clustering/cluster_results_20251027.pkl

Total files saved: 3
Location: /Users/Andrew/Desktop/Computer Science/Mental_Health_Project/analysis/models/saved_models/clustering


In [14]:
# Export a human-readable CSV: the decoded survey answers (not the encoded
# feature matrix) together with each row's cluster assignment.
results_df = original_clustered.copy()

output_dir = Path.cwd().parent.parent.joinpath('analysis', 'outputs', 'results', 'clustering')
output_dir.mkdir(parents=True, exist_ok=True)

output_filename = output_dir / f"tech_survey_clustered_{datetime.now().strftime('%Y%m%d')}.csv"
# index=False: the original row index is not written to the file
results_df.to_csv(output_filename, index=False)

print(
    f"Clustered results saved to: {output_filename}",
    f"  Contains {len(results_df)} samples with original labels + cluster assignments",
    sep="\n",
)

Clustered results saved to: /Users/Andrew/Desktop/Computer Science/Mental_Health_Project/analysis/outputs/results/clustering/tech_survey_clustered_20251027.csv
  Contains 968 samples with original labels + cluster assignments
