In [1]:
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.preprocessing import normalize

# Load the UMAP embedding table.
embedding_df = pd.read_csv("embedding_umap.csv")

# Pick the coordinate columns to cluster on (adjust to the file's naming,
# e.g. embedding_0, UMAP_1, UMAP_2, ...) and normalise them in one step.
# NOTE(review): normalize() with its defaults rescales every row to unit L2
# norm, which puts all 2-D points on the unit circle so that only their
# direction from the origin is left for clustering -- confirm this is intended.
X = normalize(embedding_df[['UMAP_1', 'UMAP_2']].values)

# Density-based clustering; the fitted labels become the 'cluster' column.
clusterer = HDBSCAN(min_cluster_size=5)
clusterer.fit(X)
embedding_df['cluster'] = clusterer.labels_

# Persist the embedding table together with its cluster labels.
embedding_df.to_csv("embedding_umap_clustered.csv", index=False)




In [2]:
import pandas as pd

# -----------------------------
# STEP 1: Load data
# -----------------------------
# embedding_df      -> cleaned_Reviews and cluster
# preprocessed_df   -> Game and cleaned_Reviews
# topic_keywords_df -> cluster and keywords (list or string)
embedding_df, preprocessed_df, topic_keywords_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "embedding_umap_clustered.csv",
        "preprocessed_Reviews.csv",
        "final_topic_keywordsv2.csv",
    )
)

# Convert the keywords column to lists where it is not one already
def parse_keywords(x):
    """Return the keywords stored in one cell of the ``keywords`` column as a list.

    Parameters
    ----------
    x : object
        Raw cell value. A string is parsed; any other value (for example an
        already-parsed list or a missing value) is returned unchanged.

    Returns
    -------
    object
        For a string starting with ``[``: the Python literal it spells out.
        For any other string: the comma-separated tokens, stripped of
        surrounding whitespace, with empty tokens dropped.
        For non-strings: ``x`` itself.

    Raises
    ------
    ValueError, SyntaxError
        If a string starting with ``[`` is not a valid Python literal.
    """
    import ast  # local import so the function does not depend on another cell

    if not isinstance(x, str):
        return x

    text = x.strip()
    if text.startswith("["):
        # literal_eval accepts only Python literals, so a crafted CSV cell
        # cannot execute code the way it could with eval().
        return ast.literal_eval(text)

    # Plain "a, b, c" form: trim each token so "a, b" and "a,b" parse alike.
    return [token.strip() for token in text.split(",") if token.strip()]

# Parse every raw keywords cell, then write the parsed column back.
parsed_keywords = topic_keywords_df['keywords'].apply(parse_keywords)
topic_keywords_df['keywords'] = parsed_keywords

# Build the cluster -> keywords lookup (a later row wins if a cluster repeats).
cluster_keywords_map = {
    cluster_id: keywords
    for cluster_id, keywords in zip(topic_keywords_df['cluster'],
                                    topic_keywords_df['keywords'])
}

# -----------------------------
# STEP 2: Attach the cluster label to every row of preprocessed_df
# -----------------------------
# The join key is the review text, which need not be unique in embedding_df.
# A repeated key on the right side multiplies the matching left rows (pandas
# also matches NaN keys with each other), which would inflate every count
# computed from merged_df. Keeping one label per distinct text guarantees that
# each row of preprocessed_df appears exactly once after the merge.
# NOTE(review): for repeated texts the first label encountered wins -- confirm
# that identical texts are expected to share a cluster.
review_clusters = embedding_df[['cleaned_Reviews', 'cluster']] \
    .drop_duplicates(subset='cleaned_Reviews')
merged_df = pd.merge(preprocessed_df, review_clusters,
                     on='cleaned_Reviews', how='left', validate='many_to_one')

# -----------------------------
# STEP 3: Drop noise (cluster = -1) and reviews without any cluster
# -----------------------------
# The left join leaves NaN for reviews missing from embedding_df. NaN != -1 is
# True, so those rows have to be excluded explicitly; otherwise their games
# still count towards the total number of games and the NaN forces the cluster
# ids to float.
has_topic = merged_df['cluster'].notna() & (merged_df['cluster'] != -1)
merged_df = merged_df[has_topic].astype({'cluster': 'int64'})

# -----------------------------
# STEP 4: Count distinct games per cluster (to find the common clusters)
# -----------------------------
game_per_cluster = merged_df.groupby('cluster')['Game'].nunique()

# A cluster is "common" when it occurs in more than 10% of all games.
total_games = merged_df['Game'].nunique()
threshold = 0.10 * total_games
is_common = game_per_cluster > threshold
common_clusters = game_per_cluster.loc[is_common].index.tolist()

# -----------------------------
# STEP 5: Dominant topics per game (common clusters excluded)
# -----------------------------
is_specific = ~merged_df['cluster'].isin(common_clusters)
topic_per_game = (
    merged_df[is_specific]
    .groupby(['Game', 'cluster'])
    .size()
    .reset_index(name='count')
)

# Order each game's clusters by review count (largest first) and keep the
# first three rows per game.
top_topics = (
    topic_per_game
    .sort_values(by=['Game', 'count'], ascending=[True, False])
    .groupby('Game')
    .head(3)
)

# Collect the remaining cluster ids of each game into one list per game.
top_topics_agg = (
    top_topics.groupby('Game')['cluster']
    .apply(list)
    .reset_index()
    .rename(columns={'cluster': 'Top_Clusters'})
)

# -----------------------------
# STEP 6: Map clusters to keywords
# -----------------------------
def map_clusters_to_keywords(cluster_list, keyword_map):
    """Look up the keywords of each cluster, skipping clusters without an entry.

    Parameters
    ----------
    cluster_list : iterable
        Cluster ids, in the order the results should appear.
    keyword_map : mapping
        Cluster id -> keywords.

    Returns
    -------
    list
        One ``keyword_map`` value per cluster id found in ``keyword_map``,
        in the order of ``cluster_list``.
    """
    matched = []
    for cluster_id in cluster_list:
        if cluster_id in keyword_map:
            matched.append(keyword_map[cluster_id])
    return matched

# Translate each game's list of top clusters into the matching keyword lists;
# the lookup table is handed to the helper as a keyword argument.
top_topics_agg['Top_Topic_Keywords'] = top_topics_agg['Top_Clusters'].apply(
    map_clusters_to_keywords, keyword_map=cluster_keywords_map
)

# -----------------------------
# STEP 7: Save to CSV
# -----------------------------
top_topics_agg.to_csv("game_top_topics_keywords.csv", index=False)
print("✅ File berhasil disimpan sebagai 'game_top_topics_keywords.csv'")


✅ File berhasil disimpan sebagai 'game_top_topics_keywords.csv'
