In [1]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import numpy as np

# 1. Load Data
df = pd.read_csv("embedding_umap.csv")
df = df.dropna(subset=["cleaned_reviews"])
documents = df["cleaned_reviews"].tolist()

# 2. Siapkan embedding
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# 3. Inisialisasi model BERTopic
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom')

topic_model = BERTopic(
    language="english",
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=False,
    verbose=True
)

# 4. Fit dan transform
topics, probs = topic_model.fit_transform(documents, embeddings)

# 5. Manual Top Keyword dengan MMR
def mmr(doc_embedding, word_embeddings, words, top_n=10, diversity=0.7):
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding.reshape(1, -1))
    word_similarity = cosine_similarity(word_embeddings)
    
    selected_words = []
    selected_idxs = []
    remaining_idxs = list(range(len(words)))

    selected_idxs.append(np.argmax(word_doc_similarity))
    selected_words.append(words[selected_idxs[0]])
    remaining_idxs.remove(selected_idxs[0])
    
    for _ in range(top_n - 1):
        mmr_scores = []
        for idx in remaining_idxs:
            sim_to_doc = word_doc_similarity[idx][0]
            sim_to_selected = max([word_similarity[idx][j] for j in selected_idxs])
            mmr_score = diversity * sim_to_doc - (1 - diversity) * sim_to_selected
            mmr_scores.append((idx, mmr_score))
        best_idx = sorted(mmr_scores, key=lambda x: x[1], reverse=True)[0][0]
        selected_idxs.append(best_idx)
        selected_words.append(words[best_idx])
        remaining_idxs.remove(best_idx)
    return selected_words

# 6. Ekstrak topik dan hitung MMR
topics_keywords = {}
for topic in topic_model.get_topics().keys():
    if topic == -1:
        continue
    topic_words_scores = topic_model.get_topic(topic)
    words = [w for w, _ in topic_words_scores]
    embeddings_words = embedding_model.encode(words, show_progress_bar=False)
    topic_embedding = np.mean(embedding_model.encode([" ".join(words)]), axis=0)
    top_keywords = mmr(topic_embedding, embeddings_words, words, top_n=10, diversity=0.7)
    topics_keywords[topic] = top_keywords

# 7. Simpan topik ke CSV
topic_keywords_df = pd.DataFrame([
    {"Cluster": k, "Top_Keywords": v}
    for k, v in topics_keywords.items()
])
topic_keywords_df.to_csv("top_topic_keywords.csv", index=False)

# 8. Hitung dominant topic per game
game_grouped = df.groupby("game")["cleaned_reviews"].apply(lambda x: " ".join(x)).reset_index()

vectorizer = TfidfVectorizer()
results = []

for _, row in game_grouped.iterrows():
    game = row["game"]
    review_text = row["cleaned_reviews"]

    similarities = {}
    for topic_id, keywords in topics_keywords.items():
        topic_text = " ".join(keywords)
        try:
            tfidf = vectorizer.fit([review_text, topic_text])
            vecs = tfidf.transform([review_text, topic_text])
            sim = cosine_similarity(vecs[0], vecs[1])[0][0]
            similarities[topic_id] = sim
        except:
            continue

    if similarities:
        dominant_topic = max(similarities, key=similarities.get)
        similarity_score = similarities[dominant_topic]
    else:
        dominant_topic = -1
        similarity_score = 0.0

    results.append({
        "Game": game,
        "Dominant_Topic": dominant_topic,
        "Similarity_Score": similarity_score
    })

# 9. Simpan ke CSV
dominant_topic_df = pd.DataFrame(results)
dominant_topic_df.to_csv("dominant_topic_per_game.csv", index=False)


Couldn't import dot_parser, loading of dot files will not be possible.


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2025-07-25 10:25:16,675 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-07-25 10:25:23,440 - BERTopic - Dimensionality - Completed ✓
2025-07-25 10:25:23,441 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-25 10:25:23,474 - BERTopic - Cluster - Completed ✓
2025-07-25 10:25:23,477 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-25 10:25:23,516 - BERTopic - Representation - Completed ✓


In [2]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import gensim
import nltk

nltk.download("punkt")

# 1. Ambil semua dokumen yang digunakan (harus sudah dalam bentuk list of tokens)
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in df['cleaned_reviews']]

# 2. Ambil topik dari BERTopic
topics = topic_model.get_topics()
topic_words = []
for topic_id in sorted(topics.keys()):
    if topic_id == -1:
        continue
    # Ambil 10 kata teratas untuk setiap topik
    topic_keywords = [word for word, _ in topics[topic_id][:10]]
    topic_words.append(topic_keywords)

# 3. Buat dictionary dan corpus dari dokumen
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# 4. Hitung coherence
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'  # bisa juga 'u_mass', 'c_uci', 'c_npmi'
)

coherence_score = coherence_model.get_coherence()
print(f"✅ Coherence Score (c_v): {coherence_score:.4f}")


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment varia

✅ Coherence Score (c_v): 0.4137


In [36]:
import ssl
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Sekarang bisa download
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divaoncom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# BERTOPIC with DBI

In [18]:
# Import library
import pandas as pd
import numpy as np
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics import davies_bouldin_score
from sentence_transformers import SentenceTransformer
from langdetect import detect, DetectorFactory
import warnings

warnings.filterwarnings("ignore")
DetectorFactory.seed = 0

# 1. Baca dan filter review bahasa Inggris
df = pd.read_csv("embedding_umap.csv")
df = df[df["cleaned_reviews"].notnull()]
df = df[df["cleaned_reviews"].str.strip() != ""]
df = df[df["cleaned_reviews"].apply(lambda x: isinstance(x, str))]
df["lang"] = df["cleaned_reviews"].apply(lambda x: detect(x) if len(x.strip()) > 5 else "unknown")
df = df[df["lang"] == "en"]
docs = df["cleaned_reviews"].tolist()

# 2. Embedding ulang
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# 3. Representation model MMR
representation_model = MaximalMarginalRelevance(diversity=0.3, top_n_words=30)

# 4. Grid search parameter HDBSCAN
results = []
for min_cluster_size in [5, 10, 20]:
    for min_samples in [3, 5, 10]:
        try:
            hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size,
                                    min_samples=min_samples,
                                    metric="euclidean",
                                    prediction_data=True)
            model = BERTopic(
                hdbscan_model=hdbscan_model,
                embedding_model=embedding_model,
                representation_model=representation_model,
                calculate_probabilities=True,
                verbose=False
            )

            topics, probs = model.fit_transform(docs, embeddings)

            # Reduce outlier dengan strategi probabilities
            reduced_topics = model.reduce_outliers(docs, topics,
                                                   strategy="probabilities",
                                                   probabilities=probs,
                                                   threshold=0.1)

            # Update topic model dengan topik yang telah dikurangi outlier-nya
            model.update_topics(docs, topics=reduced_topics,
                                representation_model=representation_model)

            labels = np.array(reduced_topics)
            valid_mask = labels != -1

            if valid_mask.sum() > 1:
                dbi = davies_bouldin_score(np.array(embeddings)[valid_mask], labels[valid_mask])
            else:
                dbi = np.nan

            noise_ratio = round(1 - valid_mask.mean(), 2)
            results.append({
                "min_cluster_size": min_cluster_size,
                "min_samples": min_samples,
                "dbi": dbi,
                "noise_ratio": noise_ratio,
                "num_topics": len(set(labels)) - (1 if -1 in labels else 0),
                "model": model,
                "topics": reduced_topics
            })
        except Exception as e:
            print(f"❌ Error at mcs={min_cluster_size}, ms={min_samples}: {e}")

# 5. Evaluasi hasil
df_results = pd.DataFrame([{
    "min_cluster_size": r["min_cluster_size"],
    "min_samples": r["min_samples"],
    "dbi": r["dbi"],
    "noise_ratio": r["noise_ratio"],
    "num_topics": r["num_topics"]
} for r in results if r["noise_ratio"] <= 0.5])

print("Top 5 konfigurasi terbaik:")
print(df_results.sort_values(by="dbi").head())

# 6. Gunakan model terbaik
if df_results.empty:
    raise ValueError("Tidak ditemukan konfigurasi dengan noise_ratio ≤ 0.5")

best_result = sorted([r for r in results if r["noise_ratio"] <= 0.5], key=lambda x: x["dbi"])[0]
final_model = best_result["model"]
final_topics = best_result["topics"]

# 7. Simpan hasil topik tiap review
df["topic"] = final_topics
df.to_csv("hasil_topic_assignment.csv", index=False)

# 8. Simpan daftar topik dan keyword-nya (dengan MMR)
topic_keywords = []
for tid in final_model.get_topic_info()["Topic"]:
    if tid == -1:
        continue
    kws = final_model.get_topic(tid)
    topic_keywords.append({
        "topic": tid,
        "keywords": ", ".join([word for word, _ in kws])
    })

pd.DataFrame(topic_keywords).to_csv("daftar_topik_keywords_mmr.csv", index=False)

# 9. Ringkasan
print(f"\nTotal review: {len(docs)}")
print(f"Jumlah review dengan topic -1 (noise): {(np.array(final_topics) == -1).sum()}")
print(f"Persentase noise: {round((np.array(final_topics) == -1).mean() * 100, 2)}%")


Batches:   0%|          | 0/39 [00:00<?, ?it/s]



Top 5 konfigurasi terbaik:
   min_cluster_size  min_samples       dbi  noise_ratio  num_topics
0                 5            3  2.864286         0.35          77
1                 5            5  3.147260         0.43          48
4                10            5  3.489536         0.40          30
3                10            3  3.734552         0.32          36
2                 5           10  3.736815         0.41          22

Total review: 1241
Jumlah review dengan topic -1 (noise): 436
Persentase noise: 35.13%


In [19]:
import pandas as pd

df = pd.read_csv("hasil_topic_assignment.csv")

total_reviews = len(df)
noise_reviews = (df["topic"] == -1).sum()

noise_ratio = noise_reviews/ total_reviews * 100

print(f"Total review: {total_reviews}")
print(f"Jumlah review dengan topic -1 (noise): {noise_reviews}")
print(f"Persentase noise: {noise_ratio:.2f}%")

Total review: 1241
Jumlah review dengan topic -1 (noise): 436
Persentase noise: 35.13%


# BERTopic, DBI, with hierarchical clustering

In [27]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from hdbscan import HDBSCAN
from sklearn.metrics import davies_bouldin_score
from sentence_transformers import SentenceTransformer
from langdetect import detect, DetectorFactory
import warnings

warnings.filterwarnings("ignore")
DetectorFactory.seed = 0

# Baca dan filter data bahasa Inggris
df = pd.read_csv("embedding_umap.csv")
mask = df["cleaned_reviews"].notnull() & df["cleaned_reviews"].str.strip().astype(bool)
df = df[mask & df["cleaned_reviews"].apply(lambda x: isinstance(x, str))]
df["lang"] = df["cleaned_reviews"].apply(lambda x: detect(x) if len(x.strip()) > 5 else "unknown")
df = df[df["lang"] == "en"]
docs = df["cleaned_reviews"].tolist()

emb_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = emb_model.encode(docs, show_progress_bar=True)

rep_model = MaximalMarginalRelevance(diversity=0.3, top_n_words=30)

results = []
for mcs in [5, 10, 20]:
    for ms in [3, 5, 10]:
        try:
            hdb = HDBSCAN(min_cluster_size=mcs, min_samples=ms, metric="euclidean", prediction_data=True)
            model = BERTopic(hdbscan_model=hdb,
                             embedding_model=emb_model,
                             representation_model=rep_model,
                             calculate_probabilities=True,
                             verbose=False)
            topics, probs = model.fit_transform(docs, embeddings)
            reduced = model.reduce_outliers(docs, topics,
                                            strategy="probabilities",
                                            probabilities=probs,
                                            threshold=0.1)
            model.update_topics(docs, topics=reduced, representation_model=rep_model)

            labels = np.array(reduced)
            valid = labels != -1
            dbi = davies_bouldin_score(embeddings[valid], labels[valid]) if valid.sum() > 1 else np.nan
            noise = round(1 - valid.mean(), 2)
            results.append((dbi, noise, model, reduced))
        except Exception:
            continue

cands = [r for r in results if r[1] <= 0.5]
if not cands:
    raise ValueError("Tidak ditemukan model dengan noise ≤ 50%")
best = sorted(cands, key=lambda x: x[0])[0]
final_model, final_topics = best[2], np.array(best[3])

df["topic"] = final_topics
df.to_csv("hasil_topic_assignment.csv", index=False)

topic_keywords = [
    {"topic": tid, "keywords": ", ".join([w for w, _ in final_model.get_topic(tid)])}
    for tid in final_model.get_topic_info()["Topic"] if tid != -1
]
pd.DataFrame(topic_keywords).to_csv("daftar_topik_keywords_mmr.csv", index=False)

noise_count = int((final_topics == -1).sum())
noise_percent = round(noise_count / len(final_topics) * 100, 2)
print(f"Persentase noise akhir: {noise_percent}%")

Batches:   0%|          | 0/39 [00:00<?, ?it/s]



Persentase noise akhir: 34.73%


In [32]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from hdbscan import HDBSCAN
from sklearn.metrics import davies_bouldin_score
from sentence_transformers import SentenceTransformer
from langdetect import detect, DetectorFactory
import warnings

warnings.filterwarnings("ignore")
DetectorFactory.seed = 0

df = pd.read_csv("embedding_umap.csv")
df = df[df["cleaned_reviews"].notnull() & df["cleaned_reviews"].str.strip().astype(bool)]
df = df[df["cleaned_reviews"].apply(lambda x: isinstance(x, str))]
df["lang"] = df["cleaned_reviews"].apply(lambda x: detect(x) if len(x.strip()) > 5 else "unknown")
df = df[df["lang"] == "en"]
docs = df["cleaned_reviews"].tolist()

emb_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = emb_model.encode(docs, show_progress_bar=True)

rep_model = MaximalMarginalRelevance(diversity=0.3, top_n_words=30)

results = []
for mcs in [5, 10, 20]:
    for ms in [3, 5, 10]:
        try:
            hdb = HDBSCAN(min_cluster_size=mcs, min_samples=ms, metric="euclidean", prediction_data=True)
            model = BERTopic(hdbscan_model=hdb, embedding_model=emb_model,
                             representation_model=rep_model, calculate_probabilities=True, verbose=False)
            topics, probs = model.fit_transform(docs, embeddings)
            reduced = model.reduce_outliers(docs, topics,
                                            strategy="probabilities", probabilities=probs, threshold=0.1)
            model.update_topics(docs, topics=reduced, representation_model=rep_model)

            labels = np.array(reduced)
            valid = labels != -1
            dbi = davies_bouldin_score(embeddings[valid], labels[valid]) if valid.sum() > 1 else np.nan
            noise = round(1 - valid.mean(), 2)
            results.append((dbi, noise, model, reduced))
        except Exception:
            continue

cands = [r for r in results if r[1] <= 0.5]
if not cands:
    raise ValueError("Tidak ditemukan model dengan noise ≤ 50%")
best = sorted(cands, key=lambda x: x[0])[0]
final_model, final_topics = best[2], np.array(best[3])

df["topic"] = final_topics
df.to_csv("hasil_topic_assignment.csv", index=False)

topic_keywords = [
    {"topic": tid, "keywords": ", ".join([w for w, _ in final_model.get_topic(tid)])}
    for tid in final_model.get_topic_info()["Topic"] if tid != -1
]
pd.DataFrame(topic_keywords).to_csv("daftar_topik_keywords_mmr.csv", index=False)

noise_count = int((final_topics == -1).sum())
noise_percent = round(noise_count / len(final_topics) * 100, 2)
print(f"Persentase noise akhir: {noise_percent}%")


Batches:   0%|          | 0/39 [00:00<?, ?it/s]



Persentase noise akhir: 31.18%


In [47]:
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

hier = final_model.hierarchical_topics(docs)
fig_hier = final_model.visualize_hierarchy(hierarchical_topics=hier)
fig_hier.show()


  0%|          | 0/76 [00:00<?, ?it/s]

100%|██████████| 76/76 [00:10<00:00,  7.56it/s]


In [49]:
# Contoh kelompok topik dekat secara visual
topics_to_merge = [
    [61, 33],  # puzzle...
    [51, 13],
    [18, 40],
    [2, 62, 6],
    [8, 71],
    [52, 55],
    [20, 26],
    [19, 73],
]

# Lakukan merge
final_model.merge_topics(docs, topics_to_merge=topics_to_merge)

# Visualisasi topik setelah merge
fig_bar = final_model.visualize_barchart(top_n_topics=15)
fig_bar.show()

# Perbarui dataframe assignment
df["topic"] = final_model.topics_
df.to_csv("hasil_topic_assignment.csv", index=False)

# Simpan keywords untuk masing-masing topik (dengan MMR)
topic_keywords = [
    {
      "topic": tid,
      "keywords": ", ".join([w for w, _ in final_model.get_topic(tid)])
    }
    for tid in final_model.get_topic_info()["Topic"] if tid != -1
]
pd.DataFrame(topic_keywords).to_csv("merged_topic.csv", index=False)
