In [1]:
import sys
from pathlib import Path

import numpy as np
import plotly.express as px
from datasets import Dataset
from sklearn.cluster import HDBSCAN

# Absolute path of the sibling `src` directory; it is put on sys.path so the
# `embedding` package imported right after this can be resolved.
SRC_PATH = Path("../src").resolve()

# Guard the append: re-running this cell in a live kernel must not keep
# stacking duplicate entries onto sys.path.
if str(SRC_PATH) not in sys.path:
    sys.path.append(str(SRC_PATH))

from embedding.reduction import get_embeddings, pacmap_embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier that selects which saved dataset directory is analysed below.
question_id = 163
data_path = Path(f"../data/{question_id}")
# Read back a dataset that was previously saved to disk at `data_path`.
data = Dataset.load_from_disk(data_path)
# Only the "embeddings" column is used; it is the input to the dimensionality
# reductions in the next cell.
# NOTE(review): presumably one fixed-length vector per row — confirm against
# whatever produced the dataset.
embeddings = data["embeddings"]

In [3]:
# Reduce the embeddings to three target dimensionalities with the same seed.
# The 10-component result is what gets clustered; the 2-component result is
# what gets plotted; the 3-component result is kept for optional 3-D views.
embeddings_, embeddings_2d, embeddings_3d = (
    pacmap_embeddings(embeddings, n_components=n_dims, random_state=42)
    for n_dims in (10, 2, 3)
)



In [5]:
# Density-based clustering of the 10-component projection.
clusterer = HDBSCAN(
    min_cluster_size=20,
    min_samples=20,
    cluster_selection_epsilon=0.4,
    alpha=1,
)

# fit_predict fits the estimator and returns its labels_ in a single call.
categories = clusterer.fit_predict(embeddings_)
unique_categories = np.unique(categories)
print(unique_categories)

# Non-negative labels are cluster members; HDBSCAN marks noise samples as -1,
# so `mask` selects only the points that were assigned to a cluster.
mask = categories >= 0

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92]


In [6]:
# Interactive 2-D scatter of the clustered points only: `mask` drops the
# samples that HDBSCAN labelled as noise.
fig = px.scatter(
    x=embeddings_2d[mask, 0],
    y=embeddings_2d[mask, 1],
    # Labels are cast to str so plotly colours them as discrete categories
    # instead of mapping them onto a continuous colour scale.
    color=categories[mask].astype(str),
    # Array inputs are auto-named "x", "y" and "color"; give them readable
    # axis and legend titles so the figure stands on its own.
    labels={"x": "component 1", "y": "component 2", "color": "cluster"},
    title="HDBSCAN clusters on the 2-D projection (noise points hidden)",
)

# Very small, outline-free markers keep dense regions readable.
fig.update_traces(
    marker=dict(size=1, line=dict(width=0)),
    selector=dict(mode="markers"),
)

fig.show()