## K-Means Clustering

### Set-Up

In [1]:
import polars as pl
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV

import plotly.graph_objects as go
import plotly.express as px

import pyarrow as pa

In [2]:
# define input path
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# as-is on a machine where the parquet file exists at exactly this location.
input_path = "C:\\Users\\agste\\Angelos Work Projects\\RFM & Clustering Project\\Data_Lake\\rfm_data.parquet"

# load cleaned dataset
print("Loading cleaned dataset...")
df = pl.read_parquet(input_path)  # read with polars, so `df` is a polars DataFrame
print(df.head())  # preview the first rows as a sanity check

Loading cleaned dataset...
shape: (5, 5)
┌────────────┬────────────────────────┬─────────┬───────────┬──────────┐
│ CustomerID ┆ RFM_Segment            ┆ Recency ┆ Frequency ┆ Monetary │
│ ---        ┆ ---                    ┆ ---     ┆ ---       ┆ ---      │
│ i64        ┆ str                    ┆ i64     ┆ u32       ┆ f64      │
╞════════════╪════════════════════════╪═════════╪═══════════╪══════════╡
│ 14175      ┆ Potential_Big_Spenders ┆ 50      ┆ 3         ┆ 3222.3   │
│ 17069      ┆ Champions              ┆ 19      ┆ 9         ┆ 2709.7   │
│ 17962      ┆ Lost Customers         ┆ 133     ┆ 1         ┆ 102.41   │
│ 14044      ┆ Fading_Customers       ┆ 17      ┆ 4         ┆ 646.42   │
│ 13255      ┆ Potential_Big_Spenders ┆ 0       ┆ 2         ┆ 399.51   │
└────────────┴────────────────────────┴─────────┴───────────┴──────────┘


### K-Means Cluster

#### Find Optimal k

In [3]:
def _plot_metric_curve(k_values, values, title, y_title, color, name):
    """Show a line+marker plot of one clustering metric against k."""
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=k_values,
            y=values,
            mode="lines+markers",
            marker=dict(size=8, color=color),
            line=dict(width=2),
            name=name,
        )
    )
    fig.update_layout(
        title=title,
        xaxis_title="Number of Clusters (k)",
        yaxis_title=y_title,
        template="plotly_white",
        hovermode="x",
    )
    fig.show()


def _elbow_index(k_values, inertia):
    """Return the index of the elbow of a decreasing inertia curve.

    The elbow is taken as the point that lies furthest below the straight
    line joining the first and last points, after scaling both axes to [0, 1].
    """
    if len(k_values) < 3:
        # With fewer than three points there is no interior point to bend at.
        return 0

    x = np.asarray(k_values, dtype=float)
    y = np.asarray(inertia, dtype=float)
    x_span = x[-1] - x[0]
    y_span = y[0] - y[-1]
    if x_span <= 0 or y_span <= 0:
        # Flat (or increasing) curve: no elbow, fall back to the smallest k.
        return 0

    x_norm = (x - x[0]) / x_span
    y_norm = (y - y[-1]) / y_span  # 1 at the first point, 0 at the last
    # The chord runs from (0, 1) to (1, 0), i.e. x + y = 1; the gap below it
    # is proportional to the perpendicular distance from the chord.
    gap = 1.0 - x_norm - y_norm
    return int(np.argmax(gap))


def find_optimal_k(
    df,
    id_col="CustomerID",
    features=("Recency", "Frequency", "Monetary"),
    metrics=("inertia", "silhouette_scores"),
    show_plots=True,
    k_range=None,
):
    """
    Determine the number of clusters for KMeans using the Elbow Method and/or
    the Silhouette Score.

    The features are standardized with StandardScaler and one KMeans model
    (random_state=42, n_init=10) is fitted per candidate k.

    Selection rule:
        * If "silhouette_scores" is requested, the k with the highest
          silhouette score is returned (first one on ties).
        * If only "inertia" is requested, the elbow of the inertia curve is
          returned. Inertia always decreases as k grows, so picking its
          minimum would simply return the largest k tested.

    Parameters:
        df (pl.DataFrame | pd.DataFrame): Input DataFrame containing customer data.
        id_col (str): Column name to be used as index.
        features (sequence of str): Columns to use for clustering.
        metrics (sequence of str): Metrics to compute
            (options: "inertia", "silhouette_scores").
        show_plots (bool): Whether to display the plots.
        k_range (iterable of int, optional): Candidate cluster counts.
            Defaults to 4 through 10 inclusive.

    Returns:
        int: Selected number of clusters.

    Raises:
        ValueError: If no supported metric is requested, or if `k_range` is
            empty or contains a value below 2.
    """
    use_inertia = "inertia" in metrics
    use_silhouette = "silhouette_scores" in metrics

    # Fail fast instead of fitting every model before rejecting the input.
    if not (use_inertia or use_silhouette):
        raise ValueError("No valid metric provided for optimal k selection")

    if k_range is None:
        k_values = list(range(4, 11))  # k from 4 to 10 inclusive
    else:
        # Sorted and de-duplicated so the elbow detection sees an ordered curve.
        k_values = sorted({int(k) for k in k_range})
    if not k_values or k_values[0] < 2:
        raise ValueError("k_range must contain at least one value, all >= 2")

    # Set index and select relevant features
    frame = df if isinstance(df, pd.DataFrame) else df.to_pandas()
    # list(...) because pandas would treat a tuple as a single column key
    df_features = frame.set_index(id_col)[list(features)]

    # Standardize the data
    df_scaled = StandardScaler().fit_transform(df_features)

    inertia = []
    silhouette_scores = []

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(df_scaled)

        if use_inertia:
            inertia.append(kmeans.inertia_)  # sum of squared distances

        if use_silhouette:
            silhouette_scores.append(silhouette_score(df_scaled, kmeans.labels_))

    if show_plots and use_inertia:
        _plot_metric_curve(
            k_values,
            inertia,
            title="Elbow Method for Optimal k",
            y_title="Inertia",
            color="blue",
            name="Inertia",
        )

    if show_plots and use_silhouette:
        _plot_metric_curve(
            k_values,
            silhouette_scores,
            title="Silhouette Score Analysis",
            y_title="Silhouette Score",
            color="green",
            name="Silhouette Score",
        )

    # Inertia and silhouette live on unrelated scales, so they are never
    # compared with each other: silhouette decides when it is available,
    # otherwise the elbow of the inertia curve does.
    if use_silhouette:
        return k_values[int(np.argmax(silhouette_scores))]

    return k_values[_elbow_index(k_values, inertia)]

In [4]:
# Evaluate the candidate cluster counts with both metrics. `id_col`, `features`
# and `show_plots` are left at the function defaults ("CustomerID", the three
# RFM columns, plots enabled).
find_optimal_k(df, metrics=["silhouette_scores", "inertia"])

4

In [None]:
def grid_search_kmeans(
    df,
    id_col="CustomerID",
    features=("Recency", "Frequency", "Monetary"),
    param_grid=None,
    cv=5,
):
    """
    Grid-search KMeans hyperparameters, maximizing the silhouette score.

    The features are standardized with StandardScaler. For every parameter
    combination GridSearchCV fits KMeans (random_state=42) on the training
    folds and scores it by the silhouette of the held-out fold under the
    cluster labels predicted by that fitted model.

    Parameters:
        df (pl.DataFrame | pd.DataFrame): Input DataFrame containing customer data.
        id_col (str): Column name to be used as index.
        features (sequence of str): Columns to use for clustering.
        param_grid (dict, optional): Hyperparameter grid for KMeans. When
            None, a default grid over `n_clusters`, `n_init` and `max_iter`
            is used.
        cv (int): Number of cross-validation folds (default=5).

    Returns:
        tuple: `(best_params, best_estimator, max_silhouette_score)` where
            `best_params` is the dict of winning hyperparameters,
            `best_estimator` is the KMeans refitted on the full scaled data,
            and `max_silhouette_score` is that estimator's silhouette score
            on the full scaled data.
    """
    if param_grid is None:
        param_grid = {
            "n_clusters": [4, 5, 6, 7, 8, 9, 10],
            "n_init": [10, 20, 50, 100, 200, 500],
            "max_iter": [250, 500, 1000, 1500],
        }

    # set index and select relevant features
    frame = df if isinstance(df, pd.DataFrame) else df.to_pandas()
    # list(...) because pandas would treat a tuple as a single column key
    df_features = frame.set_index(id_col)[list(features)]

    # standardize the data
    df_scaled = StandardScaler().fit_transform(df_features)

    def silhouette_scorer(estimator, X, y=None):
        # GridSearchCV has already fitted `estimator` on the training folds.
        # Use predict (not fit_predict) so the held-out fold is scored with
        # that fit instead of refitting the model on the validation data.
        labels = estimator.predict(X)
        return silhouette_score(X, labels)

    # perform grid search with the custom scoring function
    grid_search = GridSearchCV(
        KMeans(random_state=42),
        param_grid,
        scoring=silhouette_scorer,
        cv=cv,
        n_jobs=-1,
    )
    grid_search.fit(df_scaled)

    # With the default refit=True, best_estimator_ is already fitted on the
    # full data, so its labels_ can be used directly without another fit.
    best_estimator = grid_search.best_estimator_
    max_silhouette_score = silhouette_score(df_scaled, best_estimator.labels_)

    return grid_search.best_params_, best_estimator, max_silhouette_score

In [6]:
# example of a custom parameter grid to pass as `param_grid` instead of None
# custom_param_grid = {
#    "n_clusters": [3, 4, 5, 6],  # test four cluster counts
#    "n_init": [10, 50],  # try fewer `n_init` values
#    "max_iter": [500, 1000],  # test two `max_iter` values
#    "tol": [1e-3],  # fix the tolerance to a single value
#    "algorithm": ["lloyd", "elkan"],  # compare the "lloyd" and "elkan" algorithms
#    "init": ["k-means++", "random"]  # compare "k-means++" and "random" initialization
# }

In [8]:
# Run the grid search with the function defaults: "CustomerID" as index, the
# three RFM feature columns, the built-in parameter grid (param_grid=None)
# and 5-fold cross-validation.
best_params, best_estimator, best_score = grid_search_kmeans(df)

In [None]:
def fit_kmeans(df, id_col, features, best_params):
    """
    Fits KMeans with the best parameters and returns the dataset with cluster labels.

    The features are standardized with StandardScaler before fitting; the
    returned frame holds the original (unscaled) feature values.

    Parameters:
        df (pl.DataFrame | pd.DataFrame): The dataset.
        id_col (str): The column to set as index.
        features (list): The feature columns to use.
        best_params (dict): Keyword arguments for KMeans. `random_state`
            defaults to 42 unless it is given here.

    Returns:
        pl.DataFrame: `id_col`, the feature columns and a "Cluster" column
            with the KMeans label of each row.
    """
    # select relevant features; copy so adding "Cluster" below writes to an
    # independent frame rather than a column subset of the converted input
    frame = df if isinstance(df, pd.DataFrame) else df.to_pandas()
    df_pandas = frame.set_index(id_col)[list(features)].copy()

    # standardize data
    df_scaled = StandardScaler().fit_transform(df_pandas)

    # Merge instead of passing random_state as a separate keyword: a
    # `random_state` entry in best_params would otherwise raise TypeError
    # (duplicate keyword argument).
    kmeans_kwargs = {"random_state": 42, **best_params}
    kmeans = KMeans(**kmeans_kwargs)
    df_pandas["Cluster"] = kmeans.fit_predict(df_scaled)

    # restore the id column and convert back to polars
    return pl.from_pandas(df_pandas.reset_index())

In [None]:
# fit kmeans clustering algorithm on the best parameters selected from GridSearch
# cluster the customers with the best parameters selected by the grid search
k_means_output = fit_kmeans(
    df=df,
    id_col="CustomerID",
    features=["Recency", "Frequency", "Monetary"],
    best_params=best_params,
)
k_means_output.head()

CustomerID,Recency,Frequency,Monetary,Cluster
i64,i64,u32,f64,i32
14175,50,3,3222.3,2
17069,19,9,2709.7,2
17962,133,1,102.41,2
14044,17,4,646.42,2
13255,0,2,399.51,2


In [None]:
# overview of cluster features to assign cluster names
# per-cluster size and mean Recency/Frequency/Monetary, highest spenders first;
# used to decide which name each cluster gets
k_means_output.group_by("Cluster").agg(
    [
        pl.len().alias("Count"),
        *[
            pl.col(metric).mean().round(2)
            for metric in ("Recency", "Frequency", "Monetary")
        ],
    ]
).sort(by="Monetary", descending=True)

Cluster,Count,Recency,Frequency,Monetary
i32,u32,f64,f64,f64
1,11,5.36,90.36,121248.73
0,196,16.05,21.57,12703.14
2,2988,40.92,3.59,1300.49
3,1098,240.2,1.54,469.58


In [14]:
# define the cluster-to-name mapping
# human-readable name for each numeric cluster label
cluster_names = {
    0: "Elite_Spenders",
    1: "VIP_Customers",
    2: "Steady_Buyers",
    3: "One_Time_Shoppers",
}

# add the readable name as a new column; labels missing from the mapping
# become "Unknown"
k_means_output = k_means_output.with_columns(
    K_Means_Cluster=pl.col("Cluster").map_elements(
        lambda cluster_id: cluster_names.get(cluster_id, "Unknown"),
        return_dtype=pl.Utf8,
    )
)

In [None]:
# create k-means summary table
# K-Means summary table: customer count and mean Recency/Frequency/Monetary
# per named cluster, smallest cluster first
k_means_summary = k_means_output.group_by("K_Means_Cluster").agg(
    [
        pl.len().alias("Count"),
        *[
            pl.col(metric).mean().round(2)
            for metric in ("Recency", "Frequency", "Monetary")
        ],
    ]
).sort(by="Count")
k_means_summary

K_Means_Cluster,Count,Recency,Frequency,Monetary
str,u32,f64,f64,f64
"""VIP_Customers""",11,5.36,90.36,121248.73
"""Elite_Spenders""",196,16.05,21.57,12703.14
"""One_Time_Shoppers""",1098,240.2,1.54,469.58
"""Steady_Buyers""",2988,40.92,3.59,1300.49


In [None]:
# k_means_output.filter(pl.col("K_Means_Cluster").eq("VIP_Customers")).sort("CustomerID")

In [16]:
# Creating the first plot: Count vs. Monetary value per segment
# First plot: mean Monetary value of each K-Means cluster (one colored bar per cluster).
# `Monetary` in k_means_summary is the per-cluster mean, not a total, and the
# bars are colored by cluster name, so the title and axis label say exactly that.
fig1 = px.bar(
    k_means_summary,
    x="K_Means_Cluster",
    y="Monetary",
    color="K_Means_Cluster",
    title="Average Monetary Value per K-Means Cluster",
    labels={
        "Monetary": "Average Monetary Value",
        "K_Means_Cluster": "Customer Segment",
    },
    text="Monetary",
)
fig1.show()

In [17]:
# create new column: average basket size
# average basket value per cluster: mean Monetary divided by mean Frequency
k_means_summary = k_means_summary.with_columns(
    Average_Basket_Value=(pl.col("Monetary") / pl.col("Frequency")).round(2)
)

In [19]:
# Creating the fourth plot: Frequency to Monetary Ratio per segment
# Second plot: average basket value of each K-Means cluster.
# Average_Basket_Value is Monetary divided by Frequency (spend per purchase),
# so the chart is titled as a basket value rather than a frequency-to-monetary ratio.
fig2 = px.bar(
    k_means_summary,
    x="K_Means_Cluster",
    y="Average_Basket_Value",
    color="K_Means_Cluster",
    title="Average Basket Value per K-Means Cluster",
    labels={
        "Average_Basket_Value": "Average Basket Value",
        "K_Means_Cluster": "Customer Segment",
    },
    text="Average_Basket_Value",
)
fig2.show()