In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import pickle
import os
from datetime import datetime
sys.path.append('..')

from load_data import load_dataset
from utils.preprocessing import prepare_clustering_features, compute_gower_distance
from clustering.clustering_utils import run_hdbscan_clustering, evaluate_clustering
from config.model_config import CLUSTERING_CONFIG

ModuleNotFoundError: No module named 'pipeline'

In [None]:
# Load the tech survey responses via the project loader
# (presumably a pandas DataFrame — the code below relies on .columns and .head()).
tech_survey_df = load_dataset('tech_survey')

# Short summary: number of responses and the available columns
n_responses = len(tech_survey_df)
column_names = list(tech_survey_df.columns)
print(f"Loaded {n_responses} survey responses")
print(f"Columns: {column_names}")

# Last expression of the cell: preview of the first rows
tech_survey_df.head()

In [None]:
# EDA: per-column dtypes, missing-value counts and number of distinct values
column_dtypes = tech_survey_df.dtypes
print("Data types:")
print(column_dtypes)

missing_counts = tech_survey_df.isnull().sum()
print("\nMissing values:")
print(missing_counts)

# One line per column with its distinct-value count
print("\nUnique values per column:")
for col in tech_survey_df.columns:
    n_unique = tech_survey_df[col].nunique()
    print(f"{col}: {n_unique} unique values")

In [None]:
# Define features for clustering
# The candidate columns come from CLUSTERING_CONFIG['preprocessing']; only those
# that actually exist in tech_survey_df are kept.
config = CLUSTERING_CONFIG['preprocessing']
categorical_cols = [col for col in config['categorical_columns'] if col in tech_survey_df.columns]
numeric_cols = [col for col in config['numeric_columns'] if col in tech_survey_df.columns]

# Configured columns that are absent from the data. They are still ignored (same
# selection as before), but reported so a config typo or a schema change does not
# silently shrink the feature set.
missing_cols = [
    col
    for col in list(config['categorical_columns']) + list(config['numeric_columns'])
    if col not in tech_survey_df.columns
]

print(f"Categorical features: {categorical_cols}")
print(f"Numeric features: {numeric_cols}")
print(f"Total features for clustering: {len(categorical_cols) + len(numeric_cols)}")
if missing_cols:
    print(f"WARNING: configured columns missing from the data (ignored): {missing_cols}")

In [None]:
# Prepare features for clustering
# The helper's result is unpacked into three objects: the feature frame, the
# label encoders and the scaler (the latter two are pickled by a later cell).
prepared = prepare_clustering_features(tech_survey_df, categorical_cols, numeric_cols)
features_df, label_encoders, scaler = prepared

# Quick look at the prepared features
feature_shape = features_df.shape
print(f"Features prepared: {feature_shape}")
print(f"Sample after preprocessing:")
print(features_df.head())

In [None]:
# Compute Gower distance
# NOTE(review): the positional indices 0..len(categorical_cols)-1 assume the
# categorical columns are the leading columns of features_df — confirm against
# prepare_clustering_features.
n_categorical = len(categorical_cols)
categorical_indices = [position for position in range(n_categorical)]
distance_matrix = compute_gower_distance(features_df, categorical_indices)

# Report the matrix shape and show its top-left 5x5 corner
matrix_shape = distance_matrix.shape
print(f"Distance matrix shape: {matrix_shape}")
print(f"Distance matrix sample (first 5x5):")
print(distance_matrix[:5, :5])

In [None]:
# Run HDBSCAN clustering on the precomputed distance matrix,
# with the hyper-parameters read from CLUSTERING_CONFIG['hdbscan']
clustering_config = CLUSTERING_CONFIG['hdbscan']
hdbscan_params = {
    'min_cluster_size': clustering_config['min_cluster_size'],
    'min_samples': clustering_config['min_samples'],
}
clusterer, cluster_labels = run_hdbscan_clustering(distance_matrix, **hdbscan_params)

# Summarise the labelling from a single pass over the labels.
# The label -1 is treated as noise and is not counted as a cluster.
unique, counts = np.unique(cluster_labels, return_counts=True)
noise_mask = unique == -1
n_clusters = len(unique) - (1 if noise_mask.any() else 0)
n_noise = int(counts[noise_mask].sum())

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print(f"Cluster labels distribution:")
for label, count in zip(unique, counts):
    print(f"  Cluster {label}: {count} points")

In [None]:
# Evaluate clustering quality (skipped when fewer than two clusters were found)
if n_clusters < 2:
    print("Insufficient clusters for evaluation")
else:
    # NOTE(review): whether evaluate_clustering excludes noise points (-1)
    # is not visible from this notebook — confirm in clustering_utils.
    silhouette_avg = evaluate_clustering(distance_matrix, cluster_labels)
    print(f"Average silhouette score: {silhouette_avg:.3f}")

    # Per-cluster persistence values exposed by the fitted clusterer
    persistence_scores = clusterer.cluster_persistence_
    print(f"Cluster persistence scores: {persistence_scores}")

In [None]:
# Visualize clustering results (only when at least one cluster exists)
if n_clusters >= 1:
    # New frame: the survey data plus a 'cluster' column holding the labels
    clustered_df = tech_survey_df.assign(cluster=cluster_labels)

    # Points per label, ordered by label value
    cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()

    # Bar chart of cluster sizes on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(10, 6))
    cluster_counts.plot(kind='bar', ax=ax)
    ax.set_title('Cluster Size Distribution')
    ax.set_xlabel('Cluster Label (-1 = Noise)')
    ax.set_ylabel('Number of Points')
    # Keep the tick labels horizontal
    plt.setp(ax.get_xticklabels(), rotation=0)
    plt.show()

In [None]:
# Save clustering results
# Writes two pickle files into ../models/saved_models: the fitted clusterer and
# the preprocessing artifacts needed to rebuild the same feature matrix.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.

# One date stamp for the whole cell, so the model file and its preprocessing
# artifacts always share the same suffix (two separate datetime.now() calls
# could disagree if the cell runs across midnight).
# The stamp has day resolution and files are opened with 'wb', so re-running
# on the same day overwrites that day's files.
run_stamp = datetime.now().strftime('%Y%m%d')

# Create models directory if it doesn't exist
save_dir = '../models/saved_models'
os.makedirs(save_dir, exist_ok=True)

# Save the trained clusterer
model_filename = f"{save_dir}/hdbscan_clusterer_{run_stamp}.pkl"
with open(model_filename, 'wb') as f:
    pickle.dump(clusterer, f)

# Save preprocessing artifacts
preprocessing_artifacts = {
    'label_encoders': label_encoders,
    'scaler': scaler,
    'categorical_cols': categorical_cols,
    'numeric_cols': numeric_cols,
    'feature_columns': list(features_df.columns)
}

artifacts_filename = f"{save_dir}/clustering_preprocessing_{run_stamp}.pkl"
with open(artifacts_filename, 'wb') as f:
    pickle.dump(preprocessing_artifacts, f)

print(f"Model saved to: {model_filename}")
print(f"Preprocessing artifacts saved to: {artifacts_filename}")

In [None]:
# Save results for analysis

# Survey data with the cluster label attached as a 'cluster' column
results_df = tech_survey_df.assign(cluster=cluster_labels)

# Make sure the output directory exists, then write a date-stamped CSV
os.makedirs('../outputs/results', exist_ok=True)
date_stamp = datetime.now().strftime('%Y%m%d')
output_filename = f"../outputs/results/tech_survey_clustered_{date_stamp}.csv"
results_df.to_csv(output_filename, index=False)

print(f"Clustered results saved to: {output_filename}")
print(f"Ready for cluster profiling analysis")