In [2]:
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

# 1. Load the embedding table; every column named "embedding_*" is one feature.
df = pd.read_csv("embedding_umap.csv")
emb_cols = [c for c in df.columns if c.startswith("embedding_")]
X = df[emb_cols].values

# 2. Core distance: distance to the k-th nearest neighbour (here k = 15).
# NOTE(review): kneighbors(X) is queried with the fitted points themselves, so
# each point comes back as its own first neighbour (distance 0) and the last
# column is the k-th neighbour *counting the point itself* -- confirm that this
# is the intended convention.
k = 15
nn = NearestNeighbors(n_neighbors=k, metric='euclidean').fit(X)
dists, _ = nn.kneighbors(X)
core_dist = dists[:, -1]
df["core_k"] = core_dist

# 3. Enumerate every unordered document pair (0-1, 0-2, ..., 1-2, ...).
pairs = list(combinations(range(len(df)), 2))

# 4. Build the mutual-reachability-distance (MRD) table.
# The table is computed with whole-array operations instead of one Python
# iteration (norm call + dict) per pair. np.triu_indices(n, k=1) and pdist both
# walk the strict upper triangle row by row, i.e. in the same order as `pairs`.
idx_a, idx_b = np.triu_indices(len(df), k=1)
core_a = core_dist[idx_a]
core_b = core_dist[idx_b]
distance = pdist(X, metric="euclidean")

# MRD(a, b) = max(core(a), core(b), d(a, b))
mrd = np.maximum(np.maximum(core_a, core_b), distance)

mrd_table = pd.DataFrame({
    "pair": [f"{i}-{j}" for i, j in pairs],
    "core_a": np.round(core_a, 6),
    "core_b": np.round(core_b, 6),
    "distance": np.round(distance, 6),
    "mrd": np.round(mrd, 6),
})

# 5. Save to CSV (optional)
mrd_table.to_csv("mrd_simulation_format.csv", index=False)

# Show the first rows
mrd_table.head(10)


Unnamed: 0,pair,core_a,core_b,distance,mrd
0,0-1,0.002983,0.002706,0.016928,0.016928
1,0-2,0.002983,0.00431,0.011155,0.011155
2,0-3,0.002983,0.002346,0.061025,0.061025
3,0-4,0.002983,0.003247,0.000332,0.003247
4,0-5,0.002983,0.003398,0.078901,0.078901
5,0-6,0.002983,0.001658,0.088928,0.088928
6,0-7,0.002983,0.002148,0.004678,0.004678
7,0-8,0.002983,0.002221,0.011034,0.011034
8,0-9,0.002983,0.002885,0.009064,0.009064
9,0-10,0.002983,0.002558,0.014474,0.014474


In [3]:
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

# Read the embedding table and separate the feature matrix from the 2-D UMAP
# coordinates that are carried through to the output.
df = pd.read_csv("embedding_umap.csv")
embedding_columns = [name for name in df.columns if name.startswith("embedding_")]
X = df[embedding_columns].values
umap_x, umap_y = df["UMAP_1"], df["UMAP_2"]

# Core distance of every point: the last of the k neighbour distances returned
# for it (the k columns are indexed 0 .. k-1).
k = 15
nn = NearestNeighbors(n_neighbors=k).fit(X)
dists = nn.kneighbors(X, return_distance=True)[0]
core_dists = dists[:, k - 1]

# Cluster the embedding with HDBSCAN, using the same k as min_samples.
clusterer = HDBSCAN(min_samples=k, min_cluster_size=10)
cluster_labels = clusterer.fit_predict(X)

# Assemble the per-document result table.
# NOTE(review): the "MRD" column holds the per-point core distance computed
# above, not a pairwise mutual reachability distance.
result = pd.DataFrame({
    "Doc_id": df.index + 1,  # 1-based id; or use df["Game"] if that column exists
    "X": umap_x,
    "Y": umap_y,
})
result["MRD"] = core_dists
result["Cluster"] = cluster_labels

# Persist the table
result.to_csv("final_mrd_clustered.csv", index=False)


