In [1]:
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.preprocessing import normalize

# Read the table of UMAP-projected embeddings.
embedding_df = pd.read_csv("embedding_umap.csv")

# Pick the coordinate columns to cluster on (other embedding columns, e.g. embedding_0,
# could be listed here instead) and rescale them with scikit-learn's `normalize`.
# NOTE(review): with its default arguments `normalize` rescales every row to unit L2 norm,
# which for two columns places each point on the unit circle — confirm this is intended
# for UMAP coordinates before relying on the resulting clusters.
umap_columns = ['UMAP_1', 'UMAP_2']
X = normalize(embedding_df[umap_columns].to_numpy())

# Fit HDBSCAN on the rescaled coordinates and attach one cluster label per row.
clusterer = HDBSCAN(min_cluster_size=5)
embedding_df = embedding_df.assign(cluster=clusterer.fit_predict(X))

# Persist the embedding table together with its new 'cluster' column.
embedding_df.to_csv("embedding_umap_clustered.csv", index=False)




In [4]:
# Load every input needed for the per-game topic summary.
embedding_df = pd.read_csv("embedding_umap_clustered.csv")  # already carries the 'cluster' column
preprocessed_df = pd.read_csv("preprocessed_reviews.csv")   # provides 'game' and 'cleaned_reviews'
topic_labels_df = pd.read_csv("final_topic_keywords_labeled.csv")   # cluster -> topic_label_en

# Reduce the cluster table to one row per review text before joining. The join key is
# free text, so repeated texts on the embedding side would otherwise turn the merge into
# a many-to-many join, duplicating review rows and inflating the per-game counts below.
# When a text occurs several times, the cluster of its first occurrence is used.
# NOTE(review): if both files are row-aligned, joining on the row index would be more
# precise than joining on text — confirm against the step that produced them.
cluster_lookup = embedding_df[['cleaned_reviews', 'cluster']].drop_duplicates(subset='cleaned_reviews')

# Attach the cluster to each review; `validate` raises if the lookup is not unique per text.
merged_df = pd.merge(
    preprocessed_df,
    cluster_lookup,
    on='cleaned_reviews',
    how='left',
    validate='many_to_one',
)

# Discard reviews that matched no embedding row (NaN cluster) and noise points (cluster = -1).
has_real_cluster = merged_df['cluster'].notna() & (merged_df['cluster'] != -1)
merged_df = merged_df[has_real_cluster]

# A left merge with unmatched rows upcasts 'cluster' to float; restore integer ids so the
# exported cluster lists read 3 instead of 3.0.
merged_df = merged_df.astype({'cluster': 'int64'})

# Count reviews per (game, cluster) and keep the three most frequent clusters of each game.
topic_per_game = merged_df.groupby(['game', 'cluster']).size().reset_index(name='count')
top_topics = topic_per_game.sort_values(['game', 'count'], ascending=[True, False])
top_topics = top_topics.groupby('game').head(3)

# Collapse each game's top clusters into a single list-valued column.
top_topics_agg = top_topics.groupby('game')['cluster'].apply(list).reset_index()
top_topics_agg.columns = ['game', 'Top_Clusters']

# Translate cluster ids into topic labels.
def map_clusters_to_labels(cluster_list, label_map):
    """Return the labels of the clusters in ``cluster_list`` that have an entry in ``label_map``.

    Args:
        cluster_list: Iterable of cluster ids, processed in order.
        label_map: Mapping from cluster id to its label.

    Returns:
        A list of labels in the same order as ``cluster_list``. Ids that are not
        keys of ``label_map`` are skipped, so the result may be shorter than the input.
    """
    labels = []
    for cluster_id in cluster_list:
        if cluster_id not in label_map:
            continue
        labels.append(label_map[cluster_id])
    return labels

# Build the cluster -> label lookup from the labelled-topics table
# (if a cluster id appears more than once, its last row wins).
cluster_label_map = {
    cluster_id: label
    for cluster_id, label in zip(topic_labels_df['cluster'], topic_labels_df['topic_label_en'])
}

# Add a column holding the topic labels of each game's top clusters.
top_topics_agg['Top_Topic_Labels'] = top_topics_agg['Top_Clusters'].apply(
    map_clusters_to_labels, label_map=cluster_label_map
)

# Write the final per-game summary to disk.
top_topics_agg.to_csv("game_top_topics_labeled.csv", index=False)
