In [None]:
import json
import numpy as np

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import euclidean_distances

In [None]:
def tsne_kmeans(emb_raw, n_clusters=8, n_components=3, perplexity=30):
  """Standardize embeddings, reduce them with t-SNE, then cluster with KMeans.

  Args:
    emb_raw: Raw embeddings handed to ``StandardScaler().fit_transform``
      (assumed to be a 2-D array-like of shape (n_samples, n_features) —
      TODO confirm against callers).
    n_clusters: Number of clusters requested from KMeans.
    n_components: Number of output dimensions requested from t-SNE.
    perplexity: Perplexity value forwarded to t-SNE.

  Returns:
    A tuple ``(emb_reduced, emb_clusters, centers)``: the t-SNE output, the
    cluster label KMeans predicted for each row of that output, and the
    fitted ``cluster_centers_``.

  Both estimators are seeded with fixed ``random_state`` values (10 for
  t-SNE, 1010 for KMeans).
  """
  # Scale the raw features first, then project them with t-SNE.
  emb_scaled = StandardScaler().fit_transform(emb_raw)
  emb_reduced = TSNE(
      n_components=n_components,
      perplexity=perplexity,
      random_state=10,
  ).fit_transform(emb_scaled)

  # Clustering runs on the reduced coordinates, not on the raw embeddings,
  # so the returned centers live in the t-SNE space as well.
  kmeans = KMeans(n_clusters=n_clusters, random_state=1010)
  emb_clusters = kmeans.fit_predict(emb_reduced)

  return emb_reduced, emb_clusters, kmeans.cluster_centers_

In [None]:
# Load the embeddings file: a JSON object whose values each carry a "clip" entry.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open("./20250515_embeddings.json", "r", encoding="utf-8") as ifp:
  data = json.load(ifp)

# Keys and values are read from the same dict, so ids[i] corresponds to embeddings[i].
ids = np.array(list(data.keys()))
# presumably each "clip" entry is a fixed-length list of floats — TODO confirm
embeddings = np.array([v["clip"] for v in data.values()])

In [None]:
# Single clustering run with 8 clusters.
embs, clusters, centers = tsne_kmeans(embeddings, n_clusters=8)
# Distance from every reduced point (rows) to every cluster center (columns).
cluster_distances = euclidean_distances(embs, centers)

# Line up each id with its cluster label and its row of distances.
i_c_d = zip(ids.tolist(), clusters.tolist(), cluster_distances.tolist())

# Distances are rounded to 6 decimals to keep the serialized form compact.
cluster_data = {
    item_id: {"cluster": label, "distances": [round(dist, 6) for dist in dists]}
    for item_id, label, dists in i_c_d
}

json.dumps(cluster_data)

In [None]:
# Sweep over several cluster counts; results are keyed by the cluster count.
cluster_data = {}

# NOTE(review): tsne_kmeans re-runs t-SNE on the same embeddings with the same
# fixed seed on every iteration; the reduction looks loop-invariant and could
# probably be computed once — confirm before changing.
for n in range(2, 17, 2):  # 2, 4, ..., 16
  embs, clusters, centers = tsne_kmeans(embeddings, n_clusters=n)
  # Distance from every reduced point (rows) to every cluster center (columns).
  cluster_distances = euclidean_distances(embs, centers)

  # Line up each id with its cluster label and its row of distances.
  i_c_d = zip(ids.tolist(), clusters.tolist(), cluster_distances.tolist())

  # Distances are rounded to 6 decimals to keep the serialized form compact.
  cluster_data[n] = {
      item_id: {"cluster": label, "distances": [round(dist, 6) for dist in dists]}
      for item_id, label, dists in i_c_d
  }

In [None]:
# Persist the clustering results as compact JSON (no whitespace, sorted keys).
# The encoding is pinned to UTF-8 because ensure_ascii=False writes non-ASCII
# characters verbatim; the platform default encoding may be unable to encode
# them, or would produce a file that is not UTF-8 JSON.
with open("./20250515_clusters.json", "w", encoding="utf-8") as ofp:
  json.dump(cluster_data, ofp, separators=(",",":"), sort_keys=True, ensure_ascii=False)