In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import pickle
import os
from datetime import datetime
from pathlib import Path

# Take the directory two levels above the kernel's current working directory as
# the project root and put it first on sys.path so the `analysis.*` imports
# below can be resolved.
# NOTE(review): this depends on where the kernel was started; it assumes the
# notebook lives two directories below the project root - confirm.
PROJECT_ROOT = Path.cwd().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from analysis.load_data import load_dataset
from analysis.utils.preprocessing import prepare_clustering_features, compute_gower_distance
from analysis.utils.visualization import plot_cluster_distribution
from analysis.clustering.clustering_utils import run_hdbscan_clustering, evaluate_clustering, generate_cluster_profiles
from analysis.config.model_config import CLUSTERING_CONFIG

In [2]:
# Load the tech survey dataset and report its size and schema
tech_survey_df = load_dataset('tech_survey')

n_responses = len(tech_survey_df)
column_names = list(tech_survey_df.columns)
print(f"Loaded {n_responses} survey responses")
print(f"Columns: {column_names}")

# Last expression of the cell: preview of the first rows
tech_survey_df.head()

INFO:botocore.credentials:Found credentials in environment variables.
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.18.0, Python Version: 3.11.13, Platform: macOS-15.4.1-x86_64-i386-64bit
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
  df = pd.read_sql(query, conn)
INFO:analysis.load_data:Loaded 1227 rows from tech_survey_extract.sql


Loaded 1227 survey responses
Columns: ['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed', 'family_history', 'treatment', 'work_interfere', 'no_employees', 'remote_work', 'tech_company', 'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave', 'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview', 'mental_vs_physical', 'obs_consequence', 'survey_date']


Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,survey_date
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,2014-08-27 11:29:31
1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,2014-08-27 11:29:37
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,2014-08-27 11:29:44
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,2014-08-27 11:29:46
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,2014-08-27 11:30:22


In [3]:
# EDA: column dtypes, null counts per column, and number of distinct values per column
print("Data types:")
print(tech_survey_df.dtypes)

print("\nMissing values:")
print(tech_survey_df.isna().sum())

print("\nUnique values per column:")
# DataFrame.nunique() yields one count per column, in column order
for col_name, n_distinct in tech_survey_df.nunique().items():
    print(f"{col_name}: {n_distinct} unique values")

Data types:
Timestamp                    object
Age                           int64
Gender                       object
Country                      object
state                        object
self_employed                object
family_history               object
treatment                    object
work_interfere               object
no_employees                 object
remote_work                  object
tech_company                 object
benefits                     object
care_options                 object
wellness_program             object
seek_help                    object
anonymity                    object
leave                        object
mental_health_consequence    object
phys_health_consequence      object
coworkers                    object
supervisor                   object
mental_health_interview      object
phys_health_interview        object
mental_vs_physical           object
obs_consequence              object
survey_date                  object
dtype: object

M

In [4]:
# Choose the clustering features: configured columns that are actually present in the data
config = CLUSTERING_CONFIG['preprocessing']
available_columns = tech_survey_df.columns

categorical_cols = [name for name in config['categorical_columns'] if name in available_columns]
numeric_cols = [name for name in config['numeric_columns'] if name in available_columns]
n_features = len(categorical_cols) + len(numeric_cols)

print(f"Categorical features: {categorical_cols}")
print(f"Numeric features: {numeric_cols}")
print(f"Total features for clustering: {n_features}")

Categorical features: ['Gender', 'Country', 'treatment', 'work_interfere', 'family_history', 'remote_work', 'mental_health_consequence', 'benefits']
Numeric features: ['Age']
Total features for clustering: 9


In [5]:
# Build the clustering feature frame. The helper also returns the label encoders,
# the scaler, and a filtered copy of the original frame, all reused by later cells.
prepared = prepare_clustering_features(tech_survey_df, categorical_cols, numeric_cols)
features_df, label_encoders, scaler, filtered_original_df = prepared

print(f"Features prepared: {features_df.shape}")
print("Sample after preprocessing:")
print(features_df.head())

Features prepared: (968, 9)
Sample after preprocessing:
   Gender  Country  treatment  work_interfere  family_history  remote_work  \
0       0       37          1               1               0            0   
1       1       37          0               2               0            0   
2       1        6          0               2               0            0   
3       1       36          1               1               1            0   
4       1       37          0               0               0            1   

   mental_health_consequence  benefits       Age  
0                          1         2  0.626188  
1                          0         0  1.574144  
2                          1         1 -0.050923  
3                          2         1 -0.186346  
4                          1         2 -0.186346  


In [6]:
# Compute Gower distance
# The categorical positions are looked up by column name in features_df rather
# than assumed to be the first len(categorical_cols) columns, so a change in the
# column order produced by preprocessing cannot silently mislabel a numeric
# column as categorical (or vice versa).
missing_categoricals = [name for name in categorical_cols if name not in features_df.columns]
if missing_categoricals:
    raise ValueError(
        f"Categorical columns missing from features_df: {missing_categoricals}"
    )

categorical_names = set(categorical_cols)
categorical_indices = [
    position
    for position, column in enumerate(features_df.columns)
    if column in categorical_names
]

distance_matrix = compute_gower_distance(features_df, categorical_indices)
# Cast to float64 before clustering.
# NOTE(review): presumably the clusterer needs double precision for a
# precomputed distance matrix - confirm in run_hdbscan_clustering.
distance_matrix = distance_matrix.astype(np.float64)

print(f"Distance matrix shape: {distance_matrix.shape}")
print("Distance matrix sample (first 5x5):")
print(distance_matrix[:5, :5])

Distance matrix shape: (968, 968)
Distance matrix sample (first 5x5):
[[0.         0.56995887 0.56584364 0.56790125 0.45679012]
 [0.56995887 0.         0.35802469 0.69341564 0.4711934 ]
 [0.56584364 0.35802469 0.         0.55761319 0.44650206]
 [0.56790125 0.69341564 0.55761319 0.         0.77777779]
 [0.45679012 0.4711934  0.44650206 0.77777779 0.        ]]


In [7]:
# Run HDBSCAN on the distance matrix with the configured hyperparameters
clustering_config = CLUSTERING_CONFIG['hdbscan']
hdbscan_kwargs = {
    'min_cluster_size': clustering_config['min_cluster_size'],
    'min_samples': clustering_config['min_samples'],
    'cluster_selection_epsilon': clustering_config.get('cluster_selection_epsilon', 0.0),
}
clusterer, cluster_labels = run_hdbscan_clustering(distance_matrix, **hdbscan_kwargs)

# Summarise the labelling; label -1 is excluded from the cluster count and
# tallied separately as noise
has_noise = -1 in cluster_labels
n_clusters = len(set(cluster_labels)) - (1 if has_noise else 0)
n_noise = sum(1 for lbl in cluster_labels if lbl == -1)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print("Cluster labels distribution:")
unique, counts = np.unique(cluster_labels, return_counts=True)
for cluster_id, size in zip(unique, counts):
    print(f"  Cluster {cluster_id}: {size} points")

Number of clusters: 3
Number of noise points: 54
Cluster labels distribution:
  Cluster -1: 54 points
  Cluster 0: 31 points
  Cluster 1: 852 points
  Cluster 2: 31 points


  warn(


In [8]:
# Evaluate clustering quality
# Metrics are only computed when at least two clusters were found. Otherwise
# placeholders are bound so that the later cell which stores and prints
# `silhouette_avg` / `persistence_scores` does not fail with NameError
# (NaN formats cleanly with the `:.3f` spec used there).
if n_clusters >= 2:
    evaluation_results = evaluate_clustering(clusterer, distance_matrix, cluster_labels)
    silhouette_avg = evaluation_results['silhouette_score']
    persistence_scores = evaluation_results['persistence_scores']

    print(f"\nEvaluation Metrics:")
    print(f"Silhouette Score: {silhouette_avg:.3f}")
    print(f"Cluster Persistence Scores: {persistence_scores}")
else:
    silhouette_avg = float('nan')
    persistence_scores = []
    print(f"\nSkipping evaluation: only {n_clusters} cluster(s) found (need at least 2).")

INFO:analysis.clustering.clustering_utils:Cluster persistence scores: [0.0, 0.3862040862386426, 0.0]



Evaluation Metrics:
Silhouette Score: 0.089
Cluster Persistence Scores: [0.0, 0.3862040862386426, 0.0]


In [9]:
# Generate cluster profiles
# Profiles are built from the filtered original frame returned by
# prepare_clustering_features together with the cluster labels.
# NOTE(review): assumes filtered_original_df rows are in the same order as
# cluster_labels - confirm in generate_cluster_profiles.
cluster_profiles = generate_cluster_profiles(filtered_original_df, cluster_labels)

In [10]:
# Persist the clustering run: fitted clusterer, preprocessing artifacts,
# results dictionary (with full profiles), and a human-readable CSV export.


def _save_pickle(obj, path):
    """Serialize `obj` to `path` with pickle; the file is closed on exit."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# One timestamp shared by every artifact of this run so they can be matched up
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Create comprehensive results dictionary
cluster_results = {
    # Model outputs
    'labels': cluster_labels,

    # Evaluation metrics
    'silhouette_score': silhouette_avg,
    'cluster_persistence': persistence_scores,
    'n_clusters': len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0),
    'n_noise': int(np.sum(cluster_labels == -1)),

    # Full cluster profiles
    'cluster_profiles': cluster_profiles,

    # Metadata
    'timestamp': timestamp,
    'model_params': CLUSTERING_CONFIG['hdbscan']
}

# Save all artifacts
models_dir = PROJECT_ROOT / 'analysis/models/saved_models/clustering'
models_dir.mkdir(parents=True, exist_ok=True)

# Save clusterer
clusterer_path = models_dir / f'hdbscan_clusterer_{timestamp}.pkl'
_save_pickle(clusterer, clusterer_path)
print(f"  Saved clusterer to {clusterer_path}")

# Save preprocessing artifacts.
# Record the column lists that were actually passed to
# prepare_clustering_features (the configured columns filtered to those present
# in the data), so the saved lists match the fitted encoders/scaler.
preprocessing_path = models_dir / f'clustering_preprocessing_{timestamp}.pkl'
preprocessing_artifacts = {
    'label_encoders': label_encoders,
    'scaler': scaler,
    'categorical_columns': list(categorical_cols),
    'numeric_columns': list(numeric_cols)
}
_save_pickle(preprocessing_artifacts, preprocessing_path)
print(f"  Saved preprocessing to {preprocessing_path}")

# Save results with profiles
results_path = models_dir / f'cluster_results_{timestamp}.pkl'
_save_pickle(cluster_results, results_path)
print(f"  Saved cluster results with FULL PROFILES to {results_path}")

# Also save human-readable CSV: the filtered original rows plus their cluster label
output_dir = PROJECT_ROOT / 'analysis/outputs/results/clustering'
output_dir.mkdir(parents=True, exist_ok=True)

clustered_data = filtered_original_df.copy()
clustered_data['cluster'] = cluster_labels
csv_path = output_dir / f'tech_survey_clustered_{timestamp}.csv'
clustered_data.to_csv(csv_path, index=False)
print(f"  Saved clustered data to {csv_path}")

# Final summary; guard the percentage against an empty label array
n_samples = len(cluster_labels)
noise_pct = cluster_results['n_noise'] / n_samples * 100 if n_samples else 0.0

print(f"\n{'='*80}")
print("CLUSTERING COMPLETE!")
print(f"{'='*80}")
print(f"Total samples: {n_samples}")
print(f"Clusters found: {cluster_results['n_clusters']}")
print(f"Noise points: {cluster_results['n_noise']} ({noise_pct:.1f}%)")
print(f"Silhouette score: {silhouette_avg:.3f}")
print(f"\nArtifacts saved to: {models_dir}")

  Saved clusterer to /Users/Andrew/Desktop/Computer Science/MindPulseAI/analysis/models/saved_models/clustering/hdbscan_clusterer_20260104_014538.pkl
  Saved preprocessing to /Users/Andrew/Desktop/Computer Science/MindPulseAI/analysis/models/saved_models/clustering/clustering_preprocessing_20260104_014538.pkl
  Saved cluster results with FULL PROFILES to /Users/Andrew/Desktop/Computer Science/MindPulseAI/analysis/models/saved_models/clustering/cluster_results_20260104_014538.pkl
  Saved clustered data to /Users/Andrew/Desktop/Computer Science/MindPulseAI/analysis/outputs/results/clustering/tech_survey_clustered_20260104_014538.csv

CLUSTERING COMPLETE!
Total samples: 968
Clusters found: 3
Noise points: 54 (5.6%)
Silhouette score: 0.089

Artifacts saved to: /Users/Andrew/Desktop/Computer Science/MindPulseAI/analysis/models/saved_models/clustering
