
# High-Speed Clustering Grid Search with Auto-Save

**Algorithms**: K-Means, K-Medoids (CLARA), and DBSCAN.
**Speed Optimization**: Uses sub-sampling for metric calculation and parallel processing.
**Safety**: Saves results to CSV after every iteration.


In [None]:

## 0. Imports and Setup


import os
import pandas as pd
import numpy as np
import time
from datetime import datetime
import warnings

# Algorithms
from sklearn.cluster import KMeans, DBSCAN
from sklearn_extra.cluster import KMedoids
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample

# Metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

warnings.filterwarnings('ignore')

# Configuration
DATA_PATH = "../../data/processed/Fire/final.csv"
RESULTS_FILE = "clustering_grid_results.csv"
SAMPLE_SIZE_METRICS = 5000  # <--- HUGE SPEEDUP: Calculate metrics on this many points
RANDOM_STATE = 42


✅ Libraries loaded. Metrics will be calculated on samples for speed.


In [4]:

## 1. Data Loading & Preprocessing

def prepare_clustering_data(filepath):
    df = pd.read_csv(filepath)
    # Remove non-feature columns (adjust names if they differ in your file)
    exclude = ["fire", "latitude", "longitude", "Lat", "Lon"]
    cols = [c for c in df.columns if c not in exclude]
    
    X = df[cols].values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled

X_scaled = prepare_clustering_data(DATA_PATH)
print(f"Data ready: {X_scaled.shape}")

# Pre-calculate a fixed sample for consistent evaluation speed
X_eval_sample = resample(X_scaled, n_samples=min(SAMPLE_SIZE_METRICS, len(X_scaled)), 
                         random_state=RANDOM_STATE)


Data ready: (104372, 15)


In [5]:

## 2. Robust Clustering Grid Search

from sklearn.model_selection import ParameterGrid

def run_clustering_search(model_class, param_grid, model_name, X_train, X_eval, output_file):
    print(f"\n" + "="*60)
    print(f"Starting Search: {model_name}")
    
    grid = list(ParameterGrid(param_grid))
    
    # Initialize CSV if not exists
    if not os.path.exists(output_file):
        pd.DataFrame(columns=['timestamp', 'model', 'params', 'silhouette', 'db_index', 'ch_score', 'n_clusters', 'time']).to_csv(output_file, index=False)

    for i, params in enumerate(grid):
        start_time = time.time()
        try:
            # 1. Initialize and Fit
            model = model_class(**params)
            
            # For CLARA/KMedoids, we often fit on a sample but predict on the eval set
            # For KMeans/DBSCAN, we fit on the whole or a large sample
            labels = model.fit_predict(X_train)
            
            # 2. Evaluation (On the fixed evaluation sample for speed)
            # We must predict labels for the evaluation sample specifically
            if hasattr(model, 'predict'):
                eval_labels = model.predict(X_eval)
            else:
                # For DBSCAN, predict doesn't exist, we must fit on the eval sample or find nearest
                eval_labels = model.fit_predict(X_eval)

            n_clusters = len(set(eval_labels)) - (1 if -1 in eval_labels else 0)
            
            # Metrics (only if we have more than 1 cluster and not all noise)
            if n_clusters > 1:
                sil = silhouette_score(X_eval, eval_labels)
                db = davies_bouldin_score(X_eval, eval_labels)
                ch = calinski_harabasz_score(X_eval, eval_labels)
            else:
                sil, db, ch = -1, 999, 0

        except Exception as e:
            print(f"Error at {params}: {e}")
            sil, db, ch, n_clusters = np.nan, np.nan, np.nan, 0

        elapsed = time.time() - start_time
        
        # 3. Save result
        res = {
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'model': model_name,
            'params': str(params),
            'silhouette': sil,
            'db_index': db,
            'ch_score': ch,
            'n_clusters': n_clusters,
            'time': elapsed
        }
        pd.DataFrame([res]).to_csv(output_file, mode='a', header=False, index=False)
        
        print(f"[{i+1}/{len(grid)}] Clusters: {n_clusters} | Sil: {sil:.3f} | Params: {params}")


In [6]:
## 3. Define Fast Grids & Execute

# K-MEANS GRID
kmeans_grid = {
    'n_clusters': range(2, 11),
    'n_init': [5],        # Reduced from 10 for speed
    'max_iter': [100],    # Reduced from 300 for speed
    'random_state': [42]
}

# K-MEDOIDS (CLARA approach)
# Note: We fit on a sample inside the loop for extreme speed
kmedoids_grid = {
    'n_clusters': range(2, 11),
    'method': ['pam'],
    'init': ['k-medoids++'],
    'random_state': [42]
}

# DBSCAN GRID
dbscan_grid = {
    'eps': [0.05, 0.1, 0.15, 0.2, 0.3, 0.5],
    'min_samples': [5, 10, 20, 50],
    'n_jobs': [-1] # Use all cores
}

In [7]:
# Execution
# For K-Medoids, we pass a smaller training sample to mimic CLARA's speed
X_train_small = resample(X_scaled, n_samples=10000, random_state=42)

In [8]:

run_clustering_search(KMeans, kmeans_grid, "K-Means", X_scaled, X_eval_sample, RESULTS_FILE)


Starting Search: K-Means
[1/9] Clusters: 2 | Sil: 0.478 | Params: {'max_iter': 100, 'n_clusters': 2, 'n_init': 5, 'random_state': 42}
[2/9] Clusters: 3 | Sil: 0.414 | Params: {'max_iter': 100, 'n_clusters': 3, 'n_init': 5, 'random_state': 42}
[3/9] Clusters: 4 | Sil: 0.330 | Params: {'max_iter': 100, 'n_clusters': 4, 'n_init': 5, 'random_state': 42}
[4/9] Clusters: 5 | Sil: 0.352 | Params: {'max_iter': 100, 'n_clusters': 5, 'n_init': 5, 'random_state': 42}
[5/9] Clusters: 6 | Sil: 0.347 | Params: {'max_iter': 100, 'n_clusters': 6, 'n_init': 5, 'random_state': 42}
[6/9] Clusters: 7 | Sil: 0.352 | Params: {'max_iter': 100, 'n_clusters': 7, 'n_init': 5, 'random_state': 42}
[7/9] Clusters: 8 | Sil: 0.351 | Params: {'max_iter': 100, 'n_clusters': 8, 'n_init': 5, 'random_state': 42}
[8/9] Clusters: 9 | Sil: 0.351 | Params: {'max_iter': 100, 'n_clusters': 9, 'n_init': 5, 'random_state': 42}
[9/9] Clusters: 10 | Sil: 0.300 | Params: {'max_iter': 100, 'n_clusters': 10, 'n_init': 5, 'random_sta

In [10]:

run_clustering_search(KMedoids, kmedoids_grid, "K-Medoids", X_train_small, X_eval_sample, RESULTS_FILE)


Starting Search: K-Medoids
[1/9] Clusters: 2 | Sil: 0.329 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 2, 'random_state': 42}
[2/9] Clusters: 3 | Sil: 0.411 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 3, 'random_state': 42}
[3/9] Clusters: 4 | Sil: 0.326 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 4, 'random_state': 42}
[4/9] Clusters: 5 | Sil: 0.348 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 5, 'random_state': 42}
[5/9] Clusters: 6 | Sil: 0.348 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 6, 'random_state': 42}
[6/9] Clusters: 7 | Sil: 0.322 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 7, 'random_state': 42}
[7/9] Clusters: 8 | Sil: 0.317 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 8, 'random_state': 42}
[8/9] Clusters: 9 | Sil: 0.272 | Params: {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 9, 'random_state': 42}
[9/9] Clusters: 10 |

In [11]:

run_clustering_search(DBSCAN, dbscan_grid, "DBSCAN", X_scaled, X_eval_sample, RESULTS_FILE)


Starting Search: DBSCAN
[1/24] Clusters: 83 | Sil: 0.040 | Params: {'eps': 0.05, 'min_samples': 5, 'n_jobs': -1}
[2/24] Clusters: 41 | Sil: 0.015 | Params: {'eps': 0.05, 'min_samples': 10, 'n_jobs': -1}
[3/24] Clusters: 19 | Sil: 0.003 | Params: {'eps': 0.05, 'min_samples': 20, 'n_jobs': -1}
[4/24] Clusters: 6 | Sil: -0.172 | Params: {'eps': 0.05, 'min_samples': 50, 'n_jobs': -1}
[5/24] Clusters: 66 | Sil: 0.167 | Params: {'eps': 0.1, 'min_samples': 5, 'n_jobs': -1}
[6/24] Clusters: 37 | Sil: 0.178 | Params: {'eps': 0.1, 'min_samples': 10, 'n_jobs': -1}
[7/24] Clusters: 26 | Sil: 0.138 | Params: {'eps': 0.1, 'min_samples': 20, 'n_jobs': -1}
[8/24] Clusters: 7 | Sil: 0.122 | Params: {'eps': 0.1, 'min_samples': 50, 'n_jobs': -1}
[9/24] Clusters: 58 | Sil: 0.216 | Params: {'eps': 0.15, 'min_samples': 5, 'n_jobs': -1}
[10/24] Clusters: 35 | Sil: 0.205 | Params: {'eps': 0.15, 'min_samples': 10, 'n_jobs': -1}
[11/24] Clusters: 21 | Sil: 0.221 | Params: {'eps': 0.15, 'min_samples': 20, 'n_jo

In [14]:

## 4. View Best Results (Optimized for Davies-Bouldin)



results = pd.read_csv(RESULTS_FILE)

# IMPORTANT: We filter out cases where db_index is 999 (failed/single cluster) 
# and sort ASCENDING because for DB Index, lower is better.
valid_results = results[results['db_index'] < 999]

# Get the best (minimum) DB Index per model
best = valid_results.sort_values('db_index', ascending=True).groupby('model').head(1)

print("Best Parameters Found (Minimizing Davies-Bouldin Index):")
print("-" * 80)
# We include Silhouette just for comparison, but optimize for DB Index
print(best[['model', 'db_index', 'silhouette', 'n_clusters', 'params']])

# Detailed breakdown
for index, row in best.iterrows():
    print(f"\nBest for {row['model']}:")
    print(f"   • Davies-Bouldin Index: {row['db_index']:.4f}")
    print(f"   • Silhouette Score:     {row['silhouette']:.4f}")
    print(f"   • Number of Clusters:   {row['n_clusters']}")
    print(f"   • Optimal Parameters:    {row['params']}")


Best Parameters Found (Minimizing Davies-Bouldin Index):
--------------------------------------------------------------------------------
        model  db_index  silhouette  n_clusters  \
11  K-Medoids  0.852171    0.411218           3   
1     K-Means  0.933918    0.413515           3   
24     DBSCAN  0.999089    0.177614          37   

                                               params  
11  {'init': 'k-medoids++', 'method': 'pam', 'n_cl...  
1   {'max_iter': 100, 'n_clusters': 3, 'n_init': 5...  
24      {'eps': 0.1, 'min_samples': 10, 'n_jobs': -1}  

Best for K-Medoids:
   • Davies-Bouldin Index: 0.8522
   • Silhouette Score:     0.4112
   • Number of Clusters:   3
   • Optimal Parameters:    {'init': 'k-medoids++', 'method': 'pam', 'n_clusters': 3, 'random_state': 42}

Best for K-Means:
   • Davies-Bouldin Index: 0.9339
   • Silhouette Score:     0.4135
   • Number of Clusters:   3
   • Optimal Parameters:    {'max_iter': 100, 'n_clusters': 3, 'n_init': 5, 'random_state': 4