In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Load data hasil embedding + preprocessing review
embedding_df = pd.read_csv('embedding_umap.csv')  # Sesuaikan nama jika berbeda

# Cek kolom
embedding_df.head()


Unnamed: 0,game,cleaned_reviews,num_words,review_length,embedding_0,UMAP_1,UMAP_2
0,L.A. Noire,I like push npc car die,6,23,-0.097457,13.306396,8.543323
1,L.A. Noire,damn solve case satisfying,4,26,-0.02998,14.574967,6.994128
2,L.A. Noire,play like detective tv series case per day,8,42,-0.024095,11.028924,10.414371
3,L.A. Noire,case really easy fuck case hard beat final bos...,26,157,-0.049497,13.146644,9.903057
4,L.A. Noire,play police side gta coin deal mess maniacs le...,281,1801,-0.035496,10.380385,11.550258


In [2]:
# Ambil kolom untuk clustering (biasanya hasil UMAP, misalnya: ['x', 'y'])
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
embedding_df['cluster'] = clusterer.fit_predict(embedding_df[['UMAP_1', 'UMAP_2']])
embedding_df['cluster'].value_counts()




cluster
-1     606
 6     190
 22     68
 14     52
 27     32
      ... 
 28      5
 3       5
 43      5
 45      5
 62      5
Name: count, Length: 71, dtype: int64

In [3]:
from scipy.sparse import csr_matrix

# Filter hanya cluster valid (selain -1)
valid_df = embedding_df[embedding_df['cluster'] != -1]

# Pastikan kolom cleaned_reviews bertipe string dan tidak mengandung NaN
valid_df['cleaned_reviews'] = valid_df['cleaned_reviews'].fillna('').astype(str)

# Gabungkan review dalam tiap cluster
docs_per_topic = valid_df.groupby('cluster')['cleaned_reviews'].apply(lambda x: ' '.join(x)).reset_index()

# TF Count
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs_per_topic['cleaned_reviews'])
words = vectorizer.get_feature_names_out()

# Fungsi perhitungan c-TF-IDF
def compute_ctfidf(X, m):
    df = np.diff(X.tocsc().indptr)
    idf = np.log(m / (1 + df))
    ctfidf = X.multiply(idf)
    return ctfidf

# Hitung c-TF-IDF
m = len(docs_per_topic)
ctfidf = compute_ctfidf(X, m)
ctfidf_array = ctfidf.toarray()

# Buat DataFrame hasil c-TF-IDF
ctfidf_df = pd.DataFrame(ctfidf_array, columns=words)
ctfidf_df['cluster'] = docs_per_topic['cluster']
ctfidf_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['cleaned_reviews'] = valid_df['cleaned_reviews'].fillna('').astype(str)


Unnamed: 0,aaa,aaaaaaaahhh,aard,abandon,abandonware,abbey,abhorrent,ability,able,abnormality,...,zevkli,zip,zippy,zoltan,zombie,zone,zoom,zu,zzz,cluster
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.555348,0.0,0.0,0.0,0.0,0.0,0.0,7.110696,0.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [4]:
# Ambil 10 kata kunci teratas untuk tiap cluster
top_k = 10
top_keywords = {}

for i, row in ctfidf_df.iterrows():
    cluster = row['cluster']
    row = row.drop('cluster')
    top_words = row.sort_values(ascending=False).head(top_k).index.tolist()
    top_keywords[cluster] = top_words

# Lihat hasil
top_keywords


{0.0: ['batman',
  'arkham',
  'bruce',
  'wayne',
  'bir',
  'villain',
  'majority',
  'case',
  'harvey',
  'dc'],
 1.0: ['hitman',
  'diablo',
  'franchise',
  'diana',
  'unforgive',
  'freelancer',
  'background',
  'dosent',
  'heaven',
  'ultimate'],
 2.0: ['de', 'le', 'bir', 'nicht', 'oyun', 'jeu', 'vous', 'en', 'den', 'un'],
 3.0: ['soul',
  'dark',
  'otherwise',
  'wierdly',
  'dogg',
  'cardinal',
  'snoop',
  'muh',
  'shut',
  'nobody'],
 4.0: ['imersive',
  'play',
  'forgot',
  'accedently',
  'playground',
  'fiddle',
  'call',
  'dad',
  'phone',
  'shortly'],
 5.0: ['paint',
  'assassin',
  'average',
  'cry',
  'hour',
  'story',
  'creed',
  'remember',
  'beautiful',
  'end'],
 6.0: ['pew',
  'us',
  'love',
  'blow',
  'yes',
  'laugh',
  'boom',
  'government',
  'alyxe',
  'jingle'],
 7.0: ['half',
  'life',
  'source',
  'series',
  'yay',
  'insane',
  'teary',
  'ahhh',
  'hl',
  'yes'],
 8.0: ['efootball',
  'pe',
  'sale',
  'buy',
  'buyyyy',
  'leftover

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load model embedding (bisa diganti sesuai model BERT yang kamu pakai)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Fungsi MMR
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding.reshape(1, -1))
    word_similarity = cosine_similarity(word_embeddings)

    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
        mmr_dist = (1 - diversity) * candidate_similarities.reshape(-1) - diversity * target_similarities
        next_idx = candidates_idx[np.argmax(mmr_dist)]
        keywords_idx.append(next_idx)
        candidates_idx.remove(next_idx)

    return [words[i] for i in keywords_idx]


Couldn't import dot_parser, loading of dot files will not be possible.


In [6]:
# Simpan hasil akhir MMR per cluster
final_keywords = []

for i, row in ctfidf_df.iterrows():
    cluster = row['cluster']
    row = row.drop('cluster')
    
    # Ambil 20 kata dengan skor c-TF-IDF tertinggi
    top_words = row.sort_values(ascending=False).head(20)
    words = top_words.index.tolist()
    scores = top_words.values.tolist()

    # Buat embedding kata dan cluster
    word_embeddings = embedder.encode(words, convert_to_tensor=False)
    topic_embedding = np.mean(word_embeddings, axis=0)

    # Jalankan MMR
    selected_keywords = mmr(
        doc_embedding=topic_embedding,
        word_embeddings=word_embeddings,
        words=words,
        top_n=5,
        diversity=0.7
    )

    final_keywords.append({'cluster': cluster, 'keywords': selected_keywords})

# Konversi ke DataFrame
mmr_keywords_df = pd.DataFrame(final_keywords)
mmr_keywords_df.head()


Unnamed: 0,cluster,keywords
0,0.0,"[batman, ak, hardly, case, ama]"
1,1.0,"[ultimate, freelancer, alone, obviously, hitman]"
2,2.0,"[en, fakat, oyun, schnell, vous]"
3,3.0,"[black, wierdly, tower, alot, jack]"
4,4.0,"[piece, con, next, poop, call]"


In [7]:
# Contoh output
print(mmr_keywords_df)

# Simpan jika perlu
mmr_keywords_df.to_csv("final_topic_keywords.csv", index=False)


    cluster                                          keywords
0       0.0                   [batman, ak, hardly, case, ama]
1       1.0  [ultimate, freelancer, alone, obviously, hitman]
2       2.0                  [en, fakat, oyun, schnell, vous]
3       3.0               [black, wierdly, tower, alot, jack]
4       4.0                    [piece, con, next, poop, call]
..      ...                                               ...
65     65.0             [blandest, containment, rb, xy, duel]
66     66.0          [sound, sorely, holmes, chapter, crysis]
67     67.0    [unknown, animation, limit, invasion, mcpixel]
68     68.0             [release, messy, workforce, ru, list]
69     69.0             [read, episode, hz, panzer, elevator]

[70 rows x 2 columns]


In [8]:
import pandas as pd

# Baca data
df = pd.read_csv("final_topic_keywords.csv")

# Mapping label
topic_labels_en = {
    0: "Story-driven action games with iconic characters",
    1: "Stealth or single-player action games",
    2: "Non-English reviews (Multilingual content)",
    3: "Mixed gameplay experience with confusing mechanics",
    4: "Negative reviews and criticism",
    5: "Underrated narrative games with emotional elements",
    6: "Mixed genre games with exaggerated style or content",
    7: "Hype for expansions and intense fan reactions",
    8: "Disappointment with popular titles or sports games",
    9: "Low-value free content and monetization critique",
    10: "Criticism of abandoned or poorly supported games",
    11: "Indie games with artistic or emotional tones",
    12: "Frustration with basic mechanics and control",
    13: "Sci-fi or space-themed character-driven games",
    14: "Harsh critique of difficulty or dark themes",
    15: "Slow-paced narrative games or roguelike indie titles",
    16: "Crime and satire themed games",
    17: "Strongly recommended ARPGs or popular game mechanics",
    18: "Fun casual games with unexpected mechanics",
    19: "Boss fights and quirky or controversial moments",
    20: "Games reflecting real-world issues or modern society",
    21: "Strategy or RPG games with rich lore",
    22: "Relaxing and unique games praised by players",
    23: "Tactical turn-based crime or war games",
    24: "Online multiplayer chaos and social platform integration",
    25: "Buggy or rushed games with broken mechanics",
    26: "Creative shooter games with mixed reception",
    27: "Deck-building and strategy card games",
    28: "Tactical shooter games with base/resource management",
    29: "Modding communities and sandbox experimentation",
    30: "Post-apocalyptic survival city-building games",
    31: "Over-the-top sports or chaotic party games",
    32: "Platformers with customization and streamer influence",
    33: "Poor network performance and developer criticism",
    34: "Technical issues and game optimization complaints",
    35: "Narrative sci-fi or SCP-themed single-player campaigns",
    36: "Multiplayer instability and EA-related frustrations",
    37: "Frequent updates with questionable balance or changes",
    38: "Chaotic or adult-themed parody games",
    39: "Poor content value and repeated criticism of Bethesda",
    40: "Console version issues and management decisions",
    41: "Police/crime/zombie series with repetitive gameplay",
    42: "Military-themed remakes with mixed mechanics",
    43: "Masterpiece games by visionary creators (e.g. Hideo Kojima)",
    44: "Historical war campaigns with strategic flaws",
    45: "Stylized browser/mod games with casual mechanics",
    46: "Campy or frustrating horror games",
    47: "Historically-inspired narrative or adventure games",
    48: "Vampire or assassin-themed games with dramatic tone",
    49: "Primitive or philosophical indie games",
    50: "Stylized horror games by known indie studios (e.g., Klei)",
    51: "Timed updates and DLC management issues (e.g., Assassin’s Creed)",
    52: "Frustrating space adventure games",
    53: "Action-packed arcade or PvP games with chaotic gameplay",
    54: "Puzzle-platformers or runner games",
    55: "Visually rich games with technical art focus",
    56: "Thoughtful single-player puzzle or logic games",
    57: "Games with rich backstories and moral themes (e.g., MGS-style)",
    58: "Emotional story arcs and character transformation",
    59: "Classic-style horror games with strong fan praise",
    60: "Sci-fi puzzle games with philosophical undertones",
    61: "Adventure games with investigative or snowy themes",
    62: "Mythology-inspired story-based games",
    63: "Story-driven mysteries or cinematic adventure games",
    64: "Western-themed or poorly paced narrative games",
    65: "Sci-fi battle or containment-themed PvP games",
    66: "Episodic or detective-action games with lacking polish",
    67: "Surreal or unpredictable indie experiments",
    68: "Sloppy game launches and studio mismanagement",
    69: "Episodic war or alternate-history narratives"
}

# Tambahkan kolom label ke DataFrame
df['topic_label_en'] = df['cluster'].map(topic_labels_en)

# Simpan ke file
df.to_csv("final_topic_keywords_labeled.csv", index=False)


In [9]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
import nltk

# Unduh tokenizer NLTK jika belum tersedia
nltk.download('punkt')

# Tokenisasi dokumen gabungan per topik (hasil dari docs_per_topic)
tokenized_topics = [word_tokenize(doc.lower()) for doc in docs_per_topic['cleaned_reviews']]

# Buat dictionary dan corpus untuk Gensim
dictionary = Dictionary(tokenized_topics)
corpus = [dictionary.doc2bow(text) for text in tokenized_topics]

# Ambil top-N kata dari c-TF-IDF untuk masing-masing topik
top_n = 10  # kamu bisa sesuaikan nilainya
top_words_per_topic = []
for idx, row in ctfidf_df.drop(columns=['cluster']).iterrows():
    sorted_words = row.sort_values(ascending=False)
    top_words = sorted_words.head(top_n).index.tolist()
    top_words_per_topic.append(top_words)

# Hitung coherence score menggunakan metric 'c_v'
coherence_model = CoherenceModel(
    topics=top_words_per_topic,
    texts=tokenized_topics,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model.get_coherence()

# Tampilkan hasil
print(f"Topic Coherence Score (c_v): {coherence_score:.4f}")


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment varia

Topic Coherence Score (c_v): 0.6428
