In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Show up to 50 columns when a DataFrame is rendered
pd.options.display.max_columns = 50

def optimize_kmeans(df, k_range=(2, 10), batch_size=100):
    """
    Apply and optimize K-means clustering on a given DataFrame.

    One KMeans model is fitted for every candidate k. Inertia, silhouette
    score and Davies-Bouldin score are recorded for each fit, and the k with
    the lowest Davies-Bouldin score is reported as the optimal one.

    Parameters:
    - df: DataFrame or array-like, data for clustering
    - k_range: tuple (k_min, k_max), range of k values to try; both bounds
      are inclusive
    - batch_size: int, currently unused; kept in the signature so existing
      calls that pass it keep working

    Returns:
    - dict with the parallel lists 'k_values', 'inertias',
      'silhouette_scores' and 'davies_bouldin_scores' (one entry per k)
      plus the selected 'optimal_k'

    Raises:
    - ValueError: if k_min is smaller than 2 or k_max is smaller than k_min
    """
    k_min, k_max = k_range[0], k_range[1]

    # The silhouette and Davies-Bouldin scores need at least two clusters,
    # so reject k < 2 up front instead of failing inside the metric call.
    if k_min < 2:
        raise ValueError(
            f"k_range must start at 2 or more to compute clustering scores, got {k_range!r}"
        )
    # An empty range would otherwise surface as an opaque np.argmin error.
    if k_max < k_min:
        raise ValueError(
            f"k_range upper bound must be >= lower bound, got {k_range!r}"
        )

    # Initialize variables to store metrics
    k_values = []
    inertias = []
    silhouette_scores = []
    davies_bouldin_scores = []

    # The upper bound is inclusive, as documented, hence the "+ 1".
    for k in range(k_min, k_max + 1):
        # Fit K-means model
        kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(df)
        labels = kmeans.labels_

        # Store metrics for this k
        k_values.append(k)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(df, labels))
        davies_bouldin_scores.append(davies_bouldin_score(df, labels))

    # Lower inertia and Davies-Bouldin score is better; higher silhouette
    # score is better. Davies-Bouldin is the selection criterion here;
    # change this line to prioritise a different metric.
    optimal_k = k_values[np.argmin(davies_bouldin_scores)]

    # Compile metrics
    return {
        'k_values': k_values,
        'inertias': inertias,
        'silhouette_scores': silhouette_scores,
        'davies_bouldin_scores': davies_bouldin_scores,
        'optimal_k': optimal_k
    }

In [3]:
def optimize_dbscan(df, eps_range=(0.1, 1.0, 0.1), min_samples_range=(3, 10)):
    """
    Apply and optimize DBSCAN clustering on a given DataFrame.

    Every (eps, min_samples) pair of the grid is fitted. Pairs that yield at
    least two real clusters are scored with the silhouette and Davies-Bouldin
    metrics, and the pair with the lowest Davies-Bouldin score is reported as
    the optimal one.

    Parameters:
    - df: DataFrame or array-like, data for clustering
    - eps_range: tuple, range of eps values to try (start, stop, step),
      expanded with np.arange, so stop is excluded
    - min_samples_range: tuple, range of min_samples values to try
      (start, stop), expanded with range, so stop is excluded

    Returns:
    - dict with the parallel lists 'eps_values', 'min_samples_values',
      'silhouette_scores' and 'davies_bouldin_scores' (one entry per scored
      configuration) plus 'optimal_eps', 'optimal_min_samples' and
      'optimal_labels' (number of distinct labels of the optimal
      configuration, the noise label included when present)

    Raises:
    - ValueError: if no configuration of the grid produces at least two
      clusters, so that nothing can be scored
    """
    # Initialize variables to store metrics
    eps_values = []
    min_samples_values = []
    silhouette_scores = []
    davies_bouldin_scores = []
    number_of_labels = []

    # Loop through different values of eps and min_samples to find the optimal ones
    for eps in np.arange(*eps_range):
        for min_samples in range(*min_samples_range):
            # Fit DBSCAN model
            dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
            labels = dbscan.fit_predict(df)

            distinct_labels = set(labels)
            # DBSCAN labels noise points -1; that label is not a cluster, so
            # "one cluster plus noise" must not count as two clusters.
            n_clusters = len(distinct_labels - {-1})
            if n_clusters < 2:
                continue

            # Noise points are still passed to the scoring functions under
            # their -1 label; only the eligibility check ignores them.
            silhouette = silhouette_score(df, labels)
            davies_bouldin = davies_bouldin_score(df, labels)

            # Store metrics
            eps_values.append(eps)
            min_samples_values.append(min_samples)
            silhouette_scores.append(silhouette)
            davies_bouldin_scores.append(davies_bouldin)
            number_of_labels.append(len(distinct_labels))

    # Without this guard np.argmin fails with an unhelpful message about an
    # empty sequence when the whole grid was skipped.
    if not davies_bouldin_scores:
        raise ValueError(
            "No DBSCAN configuration produced at least two clusters for "
            f"eps_range={eps_range!r} and min_samples_range={min_samples_range!r}; "
            "widen the search grid or rescale the data."
        )

    # Lower Davies-Bouldin score is better; higher silhouette score is
    # better. Change this line to prioritise a different metric.
    optimal_index = np.argmin(davies_bouldin_scores)
    optimal_eps = eps_values[optimal_index]
    optimal_min_samples = min_samples_values[optimal_index]
    optimal_labels = number_of_labels[optimal_index]

    # Compile metrics
    metrics = {
        'eps_values': eps_values,
        'min_samples_values': min_samples_values,
        'silhouette_scores': silhouette_scores,
        'davies_bouldin_scores': davies_bouldin_scores,
        'optimal_eps': optimal_eps,
        'optimal_min_samples': optimal_min_samples,
        'optimal_labels': optimal_labels
    }

    return metrics

In [4]:
import os

import pandas as pd

# Location of the cleaned dataset. The original workspace path stays the
# default; set the P4_DATA_FILE environment variable to run the notebook
# from a different checkout without editing this cell.
DATA_FILE = os.environ.get(
    "P4_DATA_FILE",
    "/workspaces/OpenClassroom--Machine-Learning-Engineer/P4/data/data_cleaned.csv.gz",
)

df = pd.read_csv(DATA_FILE)

df

Unnamed: 0,CustomerID,UniqueCustomerID,CustomerZipCodePrefix,CustomerCity,CustomerState,OrderStatus,RecencyClass,Recency,Frequency,FrequencyClass,OrderID,PaymentInstallments,TotalPaymentValue,TotalPaymentValueClass,MeanProductPrice,MeanFreightValue,AverageProductWeightG,AverageProductLengthCM,AverageProductHeightCM,AverageProductWidthCM,NumberOfProductsInOrder
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,delivered,Low,2350,1,Low,00e7ee1b050b8499577073aeb2a297a1,2.0,146.87,Low,119.362881,31.543220,8683.0,54.0,64.0,31.0,1.0
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,delivered,Medium,2109,1,Low,29150127e6685892b6eab3eec79f59c7,8.0,335.48,Low,291.185319,41.179787,10150.0,89.0,15.0,40.0,1.0
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,delivered,High,1982,1,Low,b2059ed67ce144a36e2aa97d2c9e9ad2,7.0,157.73,Low,143.958000,42.509000,8267.0,52.0,52.0,17.0,1.0
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,delivered,High,2049,1,Low,951670f92359f4fe4a63112aa7306eba,1.0,173.30,Low,164.758095,37.736667,12160.0,56.0,51.0,28.0,1.0
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,delivered,High,1912,1,Low,6b7d50bd145f6fc7f33cebabd7e49d0f,8.0,252.25,Low,230.000000,35.017500,5200.0,45.0,15.0,35.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98644,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP,delivered,High,2024,1,Low,6760e20addcf0121e9d58f2f1ff14298,6.0,88.78,Low,78.900000,17.278000,611.0,22.0,22.0,23.0,1.0
98645,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP,delivered,High,2028,1,Low,9ec0c8947d973db4f4e8dcf1fbfa8f1b,3.0,129.06,Low,106.914286,24.420000,1211.0,25.0,24.0,22.0,1.0
98646,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE,delivered,High,2023,1,Low,fed4434add09a6f332ea398efd656a5c,5.0,56.04,Low,37.000000,18.050000,870.0,25.0,20.0,18.0,1.0
98647,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,delivered,Medium,2179,1,Low,e31ec91cea1ecf97797787471f98a8c2,2.0,711.07,Low,644.988444,31.671333,710.0,19.0,13.0,14.0,1.0


In [None]:
# Search k for K-means over the numeric order features.
# NOTE(review): the features are passed unscaled although their ranges differ
# widely in the preview of df (e.g. Frequency vs AverageProductWeightG) —
# confirm this is intended for a distance-based model.
kmeans_results = optimize_kmeans(
    df[
        [
            "Recency",
            "Frequency",
            "PaymentInstallments",
            "MeanProductPrice",
            "MeanFreightValue",
            "AverageProductWeightG",
            "AverageProductLengthCM",
            "AverageProductHeightCM",
            "AverageProductWidthCM",
            "NumberOfProductsInOrder",
        ]
    ].to_numpy()
)
kmeans_results

In [None]:
# Grid-search eps / min_samples for DBSCAN over the same numeric features,
# using the default search grid of optimize_dbscan.
# NOTE(review): the features are passed unscaled while the default eps grid
# stays below 1.0 — confirm the grid matches the scale of the data.
dbscan_results = optimize_dbscan(
    df[
        [
            "Recency",
            "Frequency",
            "PaymentInstallments",
            "MeanProductPrice",
            "MeanFreightValue",
            "AverageProductWeightG",
            "AverageProductLengthCM",
            "AverageProductHeightCM",
            "AverageProductWidthCM",
            "NumberOfProductsInOrder",
        ]
    ].to_numpy()
)
dbscan_results

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Turn the metrics dict into a table with one row per k.
# errors="ignore" keeps this cell re-runnable: on a second run kmeans_results
# is already the DataFrame without "optimal_k", and a plain drop would raise
# KeyError.
kmeans_results = pd.DataFrame(kmeans_results).drop(columns="optimal_k", errors="ignore")

# Rescale inertia to [0, 1]. fit_transform returns an (n, 1) array, so it is
# flattened before being written back into the single column.
kmeans_results["inertias"] = MinMaxScaler().fit_transform(
    kmeans_results["inertias"].values.reshape(-1, 1)
).ravel()

In [None]:
# Rescale both score columns to [0, 1] in a single call; MinMaxScaler treats
# every column independently, so each score keeps its own min and max.
kmeans_results[["silhouette_scores", "davies_bouldin_scores"]] = MinMaxScaler().fit_transform(
    kmeans_results[["silhouette_scores", "davies_bouldin_scores"]].to_numpy()
)
kmeans_results

In [None]:
import plotly.express as px

# Reshape to long format (one row per k and metric) so that every metric is
# drawn as its own coloured line against k.
px.line(
    pd.melt(kmeans_results, id_vars="k_values"),
    x="k_values",
    y="value",
    color="variable",
)

In [None]:
from sklearn.decomposition import PCA

# Extract the clustering features once so that the clusterer and the PCA are
# guaranteed to be fitted on exactly the same matrix.
cluster_features = df[
    [
        "Recency",
        "Frequency",
        "PaymentInstallments",
        "MeanProductPrice",
        "MeanFreightValue",
        "AverageProductWeightG",
        "AverageProductLengthCM",
        "AverageProductHeightCM",
        "AverageProductWidthCM",
        "NumberOfProductsInOrder",
    ]
].values

# Two-cluster mini-batch K-means; labels are read from km.labels_ later on.
km = MiniBatchKMeans(n_clusters=2, random_state=42, n_init="auto", batch_size=100).fit(cluster_features)
# Two-component PCA used to project the data for plotting.
pca = PCA(n_components=2, random_state=42).fit(cluster_features)

In [None]:
# Project the feature matrix onto the two fitted principal components.
tmp = pca.transform(
    df[
        [
            "Recency",
            "Frequency",
            "PaymentInstallments",
            "MeanProductPrice",
            "MeanFreightValue",
            "AverageProductWeightG",
            "AverageProductLengthCM",
            "AverageProductHeightCM",
            "AverageProductWidthCM",
            "NumberOfProductsInOrder",
        ]
    ].to_numpy()
)

In [None]:
# Cluster label of every row next to its two PCA coordinates.
tmp_df = pd.DataFrame({"labels": km.labels_}).assign(
    **{"dim 1": tmp[:, 0], "dim 2": tmp[:, 1]}
)
tmp_df

In [None]:
# Cluster ids are categories, not quantities: cast them to strings so the
# points get discrete colours with a legend instead of a continuous colour
# bar. assign() returns a copy, so tmp_df itself is left untouched.
px.scatter(
    tmp_df.assign(labels=tmp_df["labels"].astype(str)),
    x="dim 1",
    y="dim 2",
    color="labels",
    title="Cluster assignments in PCA space",
)