In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Load data hasil embedding + preprocessing review
embedding_df = pd.read_csv('embedding_umap.csv')  # Sesuaikan nama jika berbeda

# Cek kolom
embedding_df.head()


Unnamed: 0,Game,cleaned_Reviews,num_words,review_length,embedding_0,UMAP_1,UMAP_2
0,ACE COMBAT™ 7: SKIES UNKNOWN,major negative story character lame everything...,19,134,0.013184,17.112667,5.819002
1,ACE COMBAT™ 7: SKIES UNKNOWN,meh big meh bland storyline japanese melodrama...,28,178,-0.003744,18.61222,6.726864
2,ACE COMBAT™ 7: SKIES UNKNOWN,honestly insane I use play ace combat game ps ...,94,601,0.024339,19.02071,5.985852
3,ACE COMBAT™ 7: SKIES UNKNOWN,I never play anything series one fun dogfighti...,25,162,-0.047841,18.525732,6.311223
4,ACE COMBAT™ 7: SKIES UNKNOWN,I highway danger zone,4,21,0.013516,16.13822,5.633336


In [3]:
# Ambil kolom untuk clustering (biasanya hasil UMAP, misalnya: ['x', 'y'])
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
embedding_df['cluster'] = clusterer.fit_predict(embedding_df[['UMAP_1', 'UMAP_2']])
embedding_df['cluster'].value_counts()




cluster
-1     157
 3      19
 0      17
 7      16
 16     16
 5      14
 11     13
 8      12
 14     12
 12     10
 2       9
 10      9
 1       9
 6       9
 17      9
 13      7
 15      7
 4       6
 9       5
Name: count, dtype: int64

In [4]:
from scipy.sparse import csr_matrix

# Filter hanya cluster valid (selain -1)
valid_df = embedding_df[embedding_df['cluster'] != -1]

# Pastikan kolom cleaned_Reviews bertipe string dan tidak mengandung NaN
valid_df['cleaned_Reviews'] = valid_df['cleaned_Reviews'].fillna('').astype(str)

# Gabungkan review dalam tiap cluster
docs_per_topic = valid_df.groupby('cluster')['cleaned_Reviews'].apply(lambda x: ' '.join(x)).reset_index()

# TF Count
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs_per_topic['cleaned_Reviews'])
words = vectorizer.get_feature_names_out()

# Fungsi perhitungan c-TF-IDF
def compute_ctfidf(X, m):
    df = np.diff(X.tocsc().indptr)
    idf = np.log(m / (1 + df))
    ctfidf = X.multiply(idf)
    return ctfidf

# Hitung c-TF-IDF
m = len(docs_per_topic)
ctfidf = compute_ctfidf(X, m)
ctfidf_array = ctfidf.toarray()

# Buat DataFrame hasil c-TF-IDF
ctfidf_df = pd.DataFrame(ctfidf_array, columns=words)
ctfidf_df['cluster'] = docs_per_topic['cluster']
ctfidf_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['cleaned_Reviews'] = valid_df['cleaned_Reviews'].fillna('').astype(str)


Unnamed: 0,aaa,abertura,ability,abillity,able,absolute,absolutely,abysmally,ac,accepted,...,young,youtube,yr,yup,zb,zero,zeroeffort,zombie,zoom,zu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.197225,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Ambil 10 kata kunci teratas untuk tiap cluster
top_k = 10
top_keywords = {}

for i, row in ctfidf_df.iterrows():
    cluster = row['cluster']
    row = row.drop('cluster')
    top_words = row.sort_values(ascending=False).head(top_k).index.tolist()
    top_keywords[cluster] = top_words

# Lihat hasil
top_keywords


{0.0: ['woooooooooooooooo',
  'underrated',
  'goody',
  'yup',
  'among',
  'neat',
  'rip',
  'meh',
  'yet',
  'awesome'],
 1.0: ['horny',
  'depressed',
  'freakin',
  'ludicrous',
  'ass',
  'bore',
  'max',
  'laugh',
  'tedious',
  'fun'],
 2.0: ['shooter',
  'soul',
  'fragile',
  'base',
  'hunting',
  'gun',
  'devil',
  'chad',
  'paranoid',
  'onward'],
 3.0: ['game',
  'break',
  'perfec',
  'starve',
  'cow',
  'gr',
  'beeg',
  'chinese',
  'duck',
  'cheater'],
 4.0: ['jep',
  'outlast',
  'frog',
  'count',
  'breadman',
  'bean',
  'bore',
  'know',
  'well',
  'primarily'],
 5.0: ['crash',
  'choppy',
  'cpu',
  'screen',
  'file',
  'problem',
  'customer',
  'desktop',
  'answer',
  'gb'],
 6.0: ['duty',
  'battlefield',
  'cod',
  'ea',
  'ace',
  'series',
  'game',
  'steam',
  'technical',
  'launch'],
 7.0: ['enemy',
  'marine',
  'army',
  'game',
  'character',
  'feel',
  'space',
  'unit',
  'tech',
  'system'],
 8.0: ['multiplayer',
  'buy',
  'bundle',
 

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load model embedding (bisa diganti sesuai model BERT yang kamu pakai)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Fungsi MMR
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding.reshape(1, -1))
    word_similarity = cosine_similarity(word_embeddings)

    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
        mmr_dist = (1 - diversity) * candidate_similarities.reshape(-1) - diversity * target_similarities
        next_idx = candidates_idx[np.argmax(mmr_dist)]
        keywords_idx.append(next_idx)
        candidates_idx.remove(next_idx)

    return [words[i] for i in keywords_idx]


Couldn't import dot_parser, loading of dot files will not be possible.


In [7]:
# Simpan hasil akhir MMR per cluster
final_keywords = []

for i, row in ctfidf_df.iterrows():
    cluster = row['cluster']
    row = row.drop('cluster')
    
    # Ambil 20 kata dengan skor c-TF-IDF tertinggi
    top_words = row.sort_values(ascending=False).head(20)
    words = top_words.index.tolist()
    scores = top_words.values.tolist()

    # Buat embedding kata dan cluster
    word_embeddings = embedder.encode(words, convert_to_tensor=False)
    topic_embedding = np.mean(word_embeddings, axis=0)

    # Jalankan MMR
    selected_keywords = mmr(
        doc_embedding=topic_embedding,
        word_embeddings=word_embeddings,
        words=words,
        top_n=5,
        diversity=0.7
    )

    final_keywords.append({'cluster': cluster, 'keywords': selected_keywords})

# Konversi ke DataFrame
mmr_keywords_df = pd.DataFrame(final_keywords)
mmr_keywords_df.head()


Unnamed: 0,cluster,keywords
0,0.0,"[good, primaris, price, underrated, yup]"
1,1.0,"[stuff, ludicrous, quite, bore, physics]"
2,2.0,"[gun, ah, onward, base, chad]"
3,3.0,"[game, perfec, wait, cow, chinese]"
4,4.0,"[previous, prestigious, breadman, well, frog]"


In [8]:
# Contoh output
print(mmr_keywords_df)

# Simpan jika perlu
mmr_keywords_df.to_csv("final_topic_keywords.csv", index=False)


    cluster                                           keywords
0       0.0           [good, primaris, price, underrated, yup]
1       1.0           [stuff, ludicrous, quite, bore, physics]
2       2.0                      [gun, ah, onward, base, chad]
3       3.0                 [game, perfec, wait, cow, chinese]
4       4.0      [previous, prestigious, breadman, well, frog]
5       5.0   [problem, disapear, desktop, expedition, choppy]
6       6.0    [game, amazing, consistent, technical, texture]
7       7.0         [character, army, since, problem, another]
8       8.0  [sale, ever, boooooooooooooooooooooooooooooooo...
9       9.0  [trash, modernwarfare, please, publisher, hacker]
10     10.0            [war, noticeably, masculine, addon, fp]
11     11.0            [card, outside, clever, scenario, turn]
12     12.0                  [beat, loot, coop, really, color]
13     13.0                          [das, srie, man, se, uma]
14     14.0             [emotional, wait, blake, hour, 

In [9]:
# Mapping English labels
df = pd.read_csv("final_topic_keywords.csv")
topic_labels_en = {
    0: "Underrated games with good value",
    1: "Weird physics and boring gameplay",
    2: "Military/FPS games with weapons and tactics",
    3: "Anticipation for release/update (China)",
    4: "Internal references or specific developers",
    5: "Technical bugs and poor performance",
    6: "Positive graphics and technical performance",
    7: "Problematic characters",
    8: "Disappointment after discounted purchase",
    9: "Frustration with cheaters and publishers",
    10: "War themes and DLC",
    11: "Strategy games",
    12: "Co-op and loot system",
    13: "Non-English reviews",
    14: "Emotional narrative-driven games",
    15: "Horror, dual-wield, dark atmosphere",
    16: "Sci-fi games with outer space theme",
    17: "Cinematic games like movies"
}

# Tambahkan kolom baru ke DataFrame
df['topic_label_en'] = df['cluster'].map(topic_labels_en)

# Simpan ke file baru
df.to_csv("final_topic_keywords.csv", index=False)


In [10]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
import nltk

# Unduh tokenizer NLTK jika belum tersedia
nltk.download('punkt')

# Tokenisasi dokumen gabungan per topik (hasil dari docs_per_topic)
tokenized_topics = [word_tokenize(doc.lower()) for doc in docs_per_topic['cleaned_Reviews']]

# Buat dictionary dan corpus untuk Gensim
dictionary = Dictionary(tokenized_topics)
corpus = [dictionary.doc2bow(text) for text in tokenized_topics]

# Ambil top-N kata dari c-TF-IDF untuk masing-masing topik
top_n = 10  # kamu bisa sesuaikan nilainya
top_words_per_topic = []
for idx, row in ctfidf_df.drop(columns=['cluster']).iterrows():
    sorted_words = row.sort_values(ascending=False)
    top_words = sorted_words.head(top_n).index.tolist()
    top_words_per_topic.append(top_words)

# Hitung coherence score menggunakan metric 'c_v'
coherence_model = CoherenceModel(
    topics=top_words_per_topic,
    texts=tokenized_topics,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model.get_coherence()

# Tampilkan hasil
print(f"Topic Coherence Score (c_v): {coherence_score:.4f}")


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment varia

Topic Coherence Score (c_v): 0.6503
