In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

def optimize_kmeans(df, k_range=(2, 11), batch_size=100, criterion="silhouette"):
    """
    Sweep the number of clusters for mini-batch K-means and pick the best k.

    One MiniBatchKMeans model (random_state=42) is fitted for every k in
    ``range(k_range[0], k_range[1])`` and scored with inertia, silhouette
    score and Davies-Bouldin score.

    Parameters:
    - df: DataFrame or 2-D array, data for clustering
    - k_range: tuple (start, stop), k values to try; half-open like ``range``,
      so the default (2, 11) tries k = 2..10
    - batch_size: int, mini-batch size passed to MiniBatchKMeans
    - criterion: str, metric used to choose ``optimal_k``:
      'silhouette' (default, highest score wins),
      'davies_bouldin' (lowest score wins) or
      'inertia' (lowest value wins)

    Returns:
    - dict with keys 'k_values', 'inertias', 'silhouette_scores',
      'davies_bouldin_scores' (parallel lists, one entry per k tried)
      and 'optimal_k'

    Raises:
    - ValueError: if criterion is unknown, if k_range yields no k values,
      or if it starts below 2 (silhouette and Davies-Bouldin scores need
      at least 2 clusters)
    """
    # Which metric list each criterion reads, and whether high or low wins.
    # NOTE: inertia normally keeps shrinking as k grows, so 'inertia' will
    # almost always select the largest k tried; it is kept only so callers
    # can still request the old selection rule explicitly.
    selectors = {
        'silhouette': ('silhouette_scores', np.argmax),
        'davies_bouldin': ('davies_bouldin_scores', np.argmin),
        'inertia': ('inertias', np.argmin),
    }
    if criterion not in selectors:
        raise ValueError(
            f"criterion must be one of {sorted(selectors)}, got {criterion!r}"
        )

    k_start, k_stop = k_range[0], k_range[1]
    if k_start < 2:
        raise ValueError(f"k_range must start at 2 or more, got {k_start}")
    if k_stop <= k_start:
        raise ValueError(f"k_range {k_range!r} contains no k values to try")

    # Parallel lists: index i of every list belongs to k_values[i]
    metrics = {
        'k_values': [],
        'inertias': [],
        'silhouette_scores': [],
        'davies_bouldin_scores': [],
    }

    # Loop through different values of k, fitting and scoring one model each
    for k in range(k_start, k_stop):
        # Fixed random_state keeps the sweep reproducible between runs
        kmeans = MiniBatchKMeans(
            n_clusters=k, batch_size=batch_size, random_state=42
        ).fit(df)
        labels = kmeans.labels_

        metrics['k_values'].append(k)
        metrics['inertias'].append(kmeans.inertia_)
        metrics['silhouette_scores'].append(silhouette_score(df, labels))
        metrics['davies_bouldin_scores'].append(davies_bouldin_score(df, labels))

    # Pick the winning k according to the requested criterion
    score_key, pick_best = selectors[criterion]
    best_index = int(pick_best(metrics[score_key]))
    metrics['optimal_k'] = metrics['k_values'][best_index]

    return metrics

In [4]:
# Load the clustering input from a CSV file; the path is relative to the
# current working directory.
# NOTE(review): optimize_kmeans is later fed df.values, so presumably every
# column is numeric — confirm against the file.
df = pd.read_csv("./tmp_df.csv")



In [5]:
# Run the k sweep on at most the first 1000 rows of df (as a NumPy array),
# using the function's default k_range and batch_size. As the cell's last
# expression, the returned metrics dict is what gets displayed.
optimize_kmeans(df.values[0:1000])

: 