In [111]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import hdbscan


file_path = r"C:\Users\Anzer\Desktop\dim_user_features.csv"


In [114]:
df = pd.read_csv(file_path)
df.columns
df = df.fillna(0)

In [115]:

### Defining feature columns

feature_active_cols = [
    # demographics
   "age",
   "gender_male",
   "device_android",

    # geography
    "location_urban",
    "location_rural",
  #  "location_tail",

    # transactions
    "recency_days",
    "total_transactions",
    "median_trx_amount",
   "credit_card_usage",

    # promo behavior
   "promo_used_count",
   "promo_buying_period_ratio",

    # product mix
   "multi_quantity_ratio",
   "male_product_ratio",
   "female_product_ratio",
   "unisex_product_ratio",
   "free_items_ratio",
   "accessories_product_ratio",
   "footwear_product_ratio",
    "sporting_goods_product_ratio",
   "apparel_product_ratio",
   "home_product_ratio",
   "personal_care_product_ratio",

    # behavior
    "avg_events_per_booking",
   "avg_first_event_booking_to_sec",
   "avg_hibrnate_events",
   "avg_search_events",
   "avg_scroll_events",
   "avg_promo_page_events",

    # logistics
    "ship_hours_diff"
]

In [116]:

X = df[feature_active_cols]


X.shape


(100000, 29)

In [117]:
# log transform heavy features (VERY IMPORTANT)
for col in ["total_transactions", "median_trx_amount", "avg_events_per_booking"]:
    if col in X.columns:
        X[col] = np.log1p(X[col])

X = X.fillna(0)

X_scaled = StandardScaler().fit_transform(X)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = np.log1p(X[col])


In [127]:

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=3000,      # try 30–100 depending on dataset size
    min_samples=150,           # controls noise strictness
    metric="euclidean",
    cluster_selection_method="eom"
)

labels = clusterer.fit_predict(X_scaled)

df["cluster_hdbscan"] = labels


In [128]:
print(df["cluster_hdbscan"].value_counts())

cluster_hdbscan
 0    33639
-1    17083
 4    14850
 2    11062
 1     8972
 5     6086
 3     4925
 6     3383
Name: count, dtype: int64


In [129]:
from sklearn.metrics import silhouette_score

labels = df["cluster_hdbscan"].values
mask = labels != -1

silhouette_score(X_scaled[mask], labels[mask])


0.35979345246253747

In [131]:
feature_cols_no_id = [c for c in feature_active_cols if c != "user_id"]

profile = (
    df
    .groupby("cluster_hdbscan")[feature_cols_no_id]
    .mean()
    .round(2)
)

profile


Unnamed: 0_level_0,age,gender_male,device_android,location_urban,location_rural,recency_days,total_transactions,median_trx_amount,credit_card_usage,promo_used_count,...,apparel_product_ratio,home_product_ratio,personal_care_product_ratio,avg_events_per_booking,avg_first_event_booking_to_sec,avg_hibrnate_events,avg_search_events,avg_scroll_events,avg_promo_page_events,ship_hours_diff
cluster_hdbscan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,23.77,0.38,0.74,0.56,0.21,666.89,2.24,506688.86,0.37,0.38,...,0.43,0.0,0.07,12.34,2426113.72,0.0,0.06,0.08,0.04,84.79
0,23.26,0.35,0.78,0.63,0.17,113.65,24.21,341554.76,0.34,0.35,...,0.51,0.0,0.05,13.39,1545734.81,0.0,0.06,0.09,0.03,84.39
1,23.74,0.35,0.77,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,23.76,1.0,1.0,0.74,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,23.67,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,23.76,0.0,0.0,0.74,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,23.54,1.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
