In [20]:
import numpy as np
import hdbscan
from pathlib import Path
import pandas as pd

# Parameters: input and output locations (adapt these to your own layout)
EMBEDDINGS_PATH = Path("../embeddings/vectors.npy")
OUTPUT_LABELS_PATH = Path("../embeddings/cluster_labels.npy")
OUTPUT_INFO_PATH = Path("../embeddings/cluster_info.csv")

# 1) Load the embedding matrix; np.load accepts path-like objects directly.
embeddings = np.load(EMBEDDINGS_PATH)  # expected shape (N, d)


In [21]:
# Sanity check: display the loaded array's shape, expected to be (N, d).
embeddings.shape

(2710806, 384)

In [22]:
# 1.5) Reduce dimensionality (optional)
from sklearn.decomposition import PCA

PCA_COMPONENTS = 10  # target dimensionality d'
PCA_SEED = 42        # pins the randomized SVD solver so reruns give the same projection

pca = PCA(n_components=PCA_COMPONENTS, random_state=PCA_SEED)

# This cell overwrites `embeddings`, so it is guarded: running it a second
# time (or on input that is already low-dimensional) must not project the
# data again.
if embeddings.shape[1] > PCA_COMPONENTS:
    embeddings = pca.fit_transform(embeddings)  # shape (N, d')
# Note: PCA is optional and is skipped when embeddings are already low-dimensional.


In [23]:
# Sanity check: after PCA the second dimension should equal n_components.
embeddings.shape

(2710806, 10)

In [None]:
# 2) Cluster the embeddings with HDBSCAN
# Tunable settings are collected in one place so they are easy to adjust.
# NOTE(review): 'cosine' may not be accepted by hdbscan's default (tree-based)
# algorithms -- verify against the hdbscan docs before switching the metric.
hdbscan_kwargs = dict(
    min_cluster_size=50,             # tune as needed
    metric='euclidean',
    cluster_selection_method='eom',
)
clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)

# One label per row of `embeddings`, shape (N,); -1 is treated as noise downstream.
labels = clusterer.fit_predict(embeddings)


In [None]:

# 3) Save cluster labels
np.save(str(OUTPUT_LABELS_PATH), labels)

# 4) Compute cluster sizes and per-cluster centroids
NOISE_LABEL = -1  # label used for points that belong to no cluster

labels_series = pd.Series(labels, name="cluster_id")
cluster_sizes = labels_series.value_counts().sort_index().rename("size")
# rename_axis names the index explicitly, so reset_index yields a
# "cluster_id" column regardless of how value_counts names its index.
cluster_sizes_df = cluster_sizes.rename_axis("cluster_id").reset_index()

# Centroid (mean embedding) of every real cluster, in ascending cluster_id
# order. Noise has no centroid. The list is kept in memory only.
is_real_cluster = cluster_sizes_df["cluster_id"] != NOISE_LABEL
centroids = [
    embeddings[labels == cid].mean(axis=0)
    for cid in cluster_sizes_df.loc[is_real_cluster, "cluster_id"]
]

# Position of each cluster's centroid inside `centroids`. The noise row gets
# <NA> instead of being dropped, so the column length always matches the
# frame (the previous list was one element short whenever noise was present).
cluster_sizes_df["centroid_index"] = pd.Series(
    pd.NA, index=cluster_sizes_df.index, dtype="Int64"
)
cluster_sizes_df.loc[is_real_cluster, "centroid_index"] = np.arange(len(centroids))

# Save cluster info
cluster_sizes_df.to_csv(str(OUTPUT_INFO_PATH), index=False)

# Display summary: first 10 rows ordered by cluster_id (includes the noise
# row when present). Rendered by the notebook as the cell's last expression.
cluster_sizes_df.head(10)


In [2]:
# Shell escape: list the files in the notebook's working directory.
!ls

clustering.ipynb  tests.ipynb
