In [12]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

def optimize_kmeans(df, k_range=(2, 10), batch_size=100):
    """
    Apply and optimize K-means clustering on a given DataFrame.
    
    Parameters:
    - df: DataFrame, data for clustering
    - k_range: tuple, range of k values to try (inclusive)
    
    Returns:
    - dict, containing optimal k and metrics
    """
    # Initialize variables to store metrics
    k_values = []
    inertias = []
    silhouette_scores = []
    davies_bouldin_scores = []
    
    # Loop through different values of k to find the optimal one
    for k in range(k_range[0], k_range[1]):
        # Fit K-means model
        kmeans = MiniBatchKMeans(n_clusters=k, batch_size=batch_size, random_state=42, n_init = "auto").fit(df)
        
        # Get cluster labels
        labels = kmeans.labels_
        
        # Calculate metrics
        inertia = kmeans.inertia_
        silhouette = silhouette_score(df, labels)
        davies_bouldin = davies_bouldin_score(df, labels)
        
        # Store metrics
        k_values.append(k)
        inertias.append(inertia)
        silhouette_scores.append(silhouette)
        davies_bouldin_scores.append(davies_bouldin)
        
    # Finding the optimal k based on metrics
    # Lower inertia and Davies-Bouldin score is better. Higher silhouette score is better.
    optimal_k = k_values[np.argmin(davies_bouldin_scores)]  # Change this based on the metric you prioritize
    
    # Compile metrics
    metrics = {
        'k_values': k_values,
        'inertias': inertias,
        'silhouette_scores': silhouette_scores,
        'davies_bouldin_scores': davies_bouldin_scores,
        'optimal_k': optimal_k
    }
    
    return metrics

In [6]:
df = pd.read_csv("./tmp_df.csv")



In [13]:
optimize_kmeans(df.values)

{'k_values': [2, 3, 4, 5, 6, 7, 8, 9],
 'inertias': [5580215459.068349,
  2980555290.299206,
  2605368384.563317,
  2054319909.7121527,
  1762711123.3814275,
  1427178201.834301,
  1290949649.7131643,
  1131036086.2219656],
 'silhouette_scores': [0.37792773160802917,
  0.44088668752026383,
  0.35303762649102644,
  0.3895008816122427,
  0.33834170844895506,
  0.35558141195832255,
  0.3538421648401396,
  0.35724555573861294],
 'davies_bouldin_scores': [1.0854369679599212,
  0.7749927477959222,
  0.9639580257352263,
  0.79836038460895,
  0.8917744525765444,
  0.862895673919633,
  0.8257441803726653,
  0.8515972610711848],
 'optimal_k': 3}