In [3]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import numpy as np

# 1. Load data
# Rows without review text are dropped so every document passed on is non-null.
df = pd.read_csv("embedding_umap.csv")
df = df.dropna(subset=["cleaned_reviews"])
documents = df["cleaned_reviews"].tolist()

# 2. Prepare the sentence embeddings
# The embeddings are computed once here and handed to fit_transform below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# 3. Initialise the BERTopic model
# UMAP is stochastic: without a fixed random_state the reduced space (and so
# the clusters and topic ids) changes on every run, which makes the CSV
# outputs and the coherence score non-reproducible.
RANDOM_SEED = 42
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,
)
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom')

topic_model = BERTopic(
    language="english",
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=False,
    verbose=True
)

# 4. Fit and transform
topics, probs = topic_model.fit_transform(documents, embeddings)

# 5. Manual top-keyword selection with MMR (Maximal Marginal Relevance)
def mmr(doc_embedding, word_embeddings, words, top_n=10, diversity=0.7):
    """Select up to ``top_n`` words by greedy Maximal Marginal Relevance.

    The word most similar to ``doc_embedding`` is picked first. Each further
    pick maximises
    ``diversity * sim(word, doc) - (1 - diversity) * max(sim(word, picked))``
    where ``sim`` is cosine similarity.

    NOTE(review): with this formula ``diversity`` is the weight on *relevance*,
    so a higher value yields less diverse keywords. KeyBERT-style MMR uses the
    opposite convention; confirm which one is intended before changing it.

    Args:
        doc_embedding: embedding vector of the reference text; it is reshaped
            to a single row before comparison.
        word_embeddings: 2-D array with one row per entry of ``words``.
        words: candidate words, in the same order as ``word_embeddings``.
        top_n: maximum number of words to return. Capped at ``len(words)``.
        diversity: trade-off weight used in the score above.

    Returns:
        The selected words in selection order. Empty when there are no
        candidates or ``top_n`` is not positive.
    """
    # Never ask for more words than exist: the previous implementation raised
    # IndexError once the candidate pool ran dry, and ValueError on no words.
    top_n = min(top_n, len(words))
    if top_n <= 0:
        return []

    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding.reshape(1, -1))
    word_similarity = cosine_similarity(word_embeddings)

    # Seed the selection with the single most relevant word.
    first_idx = int(np.argmax(word_doc_similarity))
    selected_idxs = [first_idx]
    remaining_idxs = [idx for idx in range(len(words)) if idx != first_idx]

    def mmr_score(idx):
        # Relevance to the document minus redundancy with what is already chosen.
        sim_to_doc = word_doc_similarity[idx][0]
        sim_to_selected = max(word_similarity[idx][j] for j in selected_idxs)
        return diversity * sim_to_doc - (1 - diversity) * sim_to_selected

    while len(selected_idxs) < top_n:
        # max() keeps the first candidate on ties, like the former stable sort.
        best_idx = max(remaining_idxs, key=mmr_score)
        selected_idxs.append(best_idx)
        remaining_idxs.remove(best_idx)

    return [words[idx] for idx in selected_idxs]

# 6. Extract each topic's words and re-rank them with MMR
MMR_TOP_N = 10
topics_keywords = {}
for topic in topic_model.get_topics().keys():
    if topic == -1:
        # Topic -1 is BERTopic's outlier bucket, not a real topic.
        continue
    topic_words_scores = topic_model.get_topic(topic)
    # BERTopic may pad short topics with empty-string placeholders; they carry
    # no meaning as keywords, so drop them before embedding.
    words = [w for w, _ in topic_words_scores if w]
    if not words:
        continue
    embeddings_words = embedding_model.encode(words, show_progress_bar=False)
    # encode() is given a one-element list; the mean over axis 0 collapses that
    # single row into a plain vector.
    topic_embedding = np.mean(embedding_model.encode([" ".join(words)]), axis=0)
    top_keywords = mmr(
        topic_embedding,
        embeddings_words,
        words,
        # Never request more keywords than the topic actually has.
        top_n=min(MMR_TOP_N, len(words)),
        diversity=0.7,
    )
    topics_keywords[topic] = top_keywords

# 7. Save the topic keywords to CSV
# Top_Keywords holds a Python list, so the CSV cell contains its repr.
topic_keywords_df = pd.DataFrame([
    {"Cluster": k, "Top_Keywords": v}
    for k, v in topics_keywords.items()
])
topic_keywords_df.to_csv("top_topic_keywords.csv", index=False)

# 8. Compute the dominant topic per game
# All reviews of a game are concatenated into one long document.
game_grouped = df.groupby("game")["cleaned_reviews"].apply(" ".join).reset_index()

vectorizer = TfidfVectorizer()
results = []

for game, review_text in zip(game_grouped["game"], game_grouped["cleaned_reviews"]):
    similarities = {}
    for topic_id, keywords in topics_keywords.items():
        topic_text = " ".join(keywords)
        try:
            # The vectorizer is refitted on just this (reviews, keywords) pair,
            # so vocabulary and IDF weights are specific to the pair.
            vecs = vectorizer.fit_transform([review_text, topic_text])
        except ValueError:
            # Raised by scikit-learn when the pair yields an empty vocabulary;
            # skip this topic instead of hiding every possible error.
            continue
        similarities[topic_id] = cosine_similarity(vecs[0], vecs[1])[0][0]

    if similarities:
        dominant_topic = max(similarities, key=similarities.get)
        similarity_score = similarities[dominant_topic]
    else:
        # No topic could be compared: fall back to the outlier id.
        dominant_topic = -1
        similarity_score = 0.0

    results.append({
        "Game": game,
        "Dominant_Topic": dominant_topic,
        "Similarity_Score": similarity_score
    })

# 9. Save to CSV
dominant_topic_df = pd.DataFrame(results)
dominant_topic_df.to_csv("dominant_topic_per_game.csv", index=False)


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2025-07-24 09:01:22,116 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-24 09:01:24,307 - BERTopic - Dimensionality - Completed ✓
2025-07-24 09:01:24,307 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-24 09:01:24,345 - BERTopic - Cluster - Completed ✓
2025-07-24 09:01:24,348 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-24 09:01:24,387 - BERTopic - Representation - Completed ✓


In [6]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import gensim
import nltk

nltk.download("punkt")

# 1. Tokenise every document (CoherenceModel expects a list of token lists)
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in df['cleaned_reviews']]

# 2. Collect the topic words from BERTopic
# NOTE: this rebinds `topics`, which the fitting cell used for the per-document
# topic assignments returned by fit_transform; from here on it is the dict
# returned by get_topics().
topics = topic_model.get_topics()
topic_words = []
for topic_id in sorted(topics.keys()):
    if topic_id == -1:
        # Skip BERTopic's outlier bucket.
        continue
    # Take the top 10 words of each topic, dropping empty-string placeholders
    # that BERTopic may use to pad short topics; they are not dictionary tokens.
    topic_keywords = [word for word, _ in topics[topic_id][:10] if word]
    if topic_keywords:
        # A topic with no usable words cannot be scored, so leave it out.
        topic_words.append(topic_keywords)

# 3. Build the dictionary and bag-of-words corpus from the documents
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# 4. Compute coherence
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'  # 'u_mass', 'c_uci' and 'c_npmi' are also available
)

coherence_score = coherence_model.get_coherence()
print(f"✅ Coherence Score (c_v): {coherence_score:.4f}")


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The c

✅ Coherence Score (c_v): 0.3915


In [4]:
import ssl
import nltk

# Presumably a workaround for certificate errors raised by nltk.download().
# SECURITY: this turns off HTTPS certificate verification, so the override is
# limited to the two downloads below and undone afterwards instead of being
# left active for the rest of the kernel session.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no unverified-context factory; nothing to override.
    _original_https_context = None
else:
    _original_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = _create_unverified_https_context

try:
    # Downloads can run now
    nltk.download('punkt')
    nltk.download('stopwords')
finally:
    # Restore certificate verification for every later HTTPS request.
    if _original_https_context is not None:
        ssl._create_default_https_context = _original_https_context


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divaoncom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True