In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Load data hasil embedding + preprocessing review
embedding_df = pd.read_csv('embedding_umap.csv')  # Sesuaikan nama jika berbeda

# Cek kolom
embedding_df.head()


Unnamed: 0,Game,cleaned_Reviews,num_words,review_length,embedding_0,UMAP_1,UMAP_2
0,Yakuza 0,well naruto,2,11,-0.017014,8.50981,5.420525
1,Far Cry® 4,I offensive I find game indian,6,30,0.018352,9.725788,2.938913
2,Need for Speed Undercover,game breakin glitche date physics full yr old ...,42,254,-0.1054,10.626174,0.852627
3,Wallpaper Engine,much hentai,2,11,0.032332,7.953412,3.899481
4,Raji: Prologue,dude say whatever wanna say remember indie gam...,15,91,0.046517,7.759619,0.929147


In [3]:
# Ambil kolom untuk clustering (biasanya hasil UMAP, misalnya: ['x', 'y'])
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
embedding_df['cluster'] = clusterer.fit_predict(embedding_df[['UMAP_1', 'UMAP_2']])
embedding_df['cluster'].value_counts()




cluster
-1      3955
 215     253
 19      178
 95      126
 168     105
        ... 
 358       5
 153       5
 96        5
 277       5
 27        5
Name: count, Length: 370, dtype: int64

In [4]:
from scipy.sparse import csr_matrix

# Filter hanya cluster valid (selain -1)
valid_df = embedding_df[embedding_df['cluster'] != -1]

# Pastikan kolom cleaned_Reviews bertipe string dan tidak mengandung NaN
valid_df['cleaned_Reviews'] = valid_df['cleaned_Reviews'].fillna('').astype(str)

# Gabungkan review dalam tiap cluster
docs_per_topic = valid_df.groupby('cluster')['cleaned_Reviews'].apply(lambda x: ' '.join(x)).reset_index()

# TF Count
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs_per_topic['cleaned_Reviews'])
words = vectorizer.get_feature_names_out()

# Fungsi perhitungan c-TF-IDF
def compute_ctfidf(X, m):
    df = np.diff(X.tocsc().indptr)
    idf = np.log(m / (1 + df))
    ctfidf = X.multiply(idf)
    return ctfidf

# Hitung c-TF-IDF
m = len(docs_per_topic)
ctfidf = compute_ctfidf(X, m)
ctfidf_array = ctfidf.toarray()

# Buat DataFrame hasil c-TF-IDF
ctfidf_df = pd.DataFrame(ctfidf_array, columns=words)
ctfidf_df['cluster'] = docs_per_topic['cluster']
ctfidf_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['cleaned_Reviews'] = valid_df['cleaned_Reviews'].fillna('').astype(str)


Unnamed: 0,aa,aaa,aaand,aaas,aar,aard,ab,abaixa,abandon,abandonware,...,zubmarine,zup,zupple,zwok,zwoki,zwyka,zwykego,zy,zzz,zzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Ambil 10 kata kunci teratas untuk tiap cluster
top_k = 20
top_keywords = {}

for i, row in ctfidf_df.iterrows():
    cluster = row['cluster']
    row = row.drop('cluster')
    top_words = row.sort_values(ascending=False).head(top_k).index.tolist()
    top_keywords[cluster] = top_words

# Lihat hasil
top_keywords


{0.0: ['payne',
  'max',
  'cutscene',
  'story',
  'man',
  'soundtrack',
  'bullet',
  'previous',
  'alan',
  'maxs',
  'bullettime',
  'third',
  'wake',
  'shooter',
  'series',
  'game',
  'james',
  'noir',
  'mp',
  'soon'],
 1.0: ['vr',
  'flight',
  'hvr',
  'alyx',
  'headset',
  'experience',
  'narration',
  'plane',
  'dcs',
  'vroom',
  'km',
  'gorn',
  'scenery',
  'certainly',
  'program',
  'set',
  'sim',
  'era',
  'quality',
  'immense'],
 2.0: ['port',
  'xbox',
  'dire',
  'pc',
  'boardgame',
  'digital',
  'mobile',
  'console',
  'bloodborne',
  'ps',
  'excellent',
  'wolf',
  'crappy',
  'lag',
  'file',
  'suggest',
  'version',
  'yeag',
  'worlf',
  'jankyness'],
 3.0: ['creed',
  'assassin',
  'ac',
  'ezio',
  'ezios',
  'unity',
  'odyssey',
  'rome',
  'series',
  'parkour',
  'reclaim',
  'assassins',
  'iii',
  'brotherhood',
  'refine',
  'slog',
  'wich',
  'since',
  'joy',
  'origin'],
 4.0: ['assassin',
  'ezio',
  'creed',
  'ac',
  'brotherh

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load model embedding (bisa diganti sesuai model BERT yang kamu pakai)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Fungsi MMR
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding.reshape(1, -1))
    word_similarity = cosine_similarity(word_embeddings)

    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
        mmr_dist = (1 - diversity) * candidate_similarities.reshape(-1) - diversity * target_similarities
        next_idx = candidates_idx[np.argmax(mmr_dist)]
        keywords_idx.append(next_idx)
        candidates_idx.remove(next_idx)

    return [words[i] for i in keywords_idx]


Couldn't import dot_parser, loading of dot files will not be possible.


In [7]:
# Simpan hasil akhir MMR per cluster
final_keywords = []

for i, row in ctfidf_df.iterrows():
    cluster = row['cluster']
    row = row.drop('cluster')
    
    # Ambil 20 kata dengan skor c-TF-IDF tertinggi
    top_words = row.sort_values(ascending=False).head(20)
    words = top_words.index.tolist()
    scores = top_words.values.tolist()

    # Buat embedding kata dan cluster
    word_embeddings = embedder.encode(words, convert_to_tensor=False)
    topic_embedding = np.mean(word_embeddings, axis=0)

    # Jalankan MMR
    selected_keywords = mmr(
        doc_embedding=topic_embedding,
        word_embeddings=word_embeddings,
        words=words,
        top_n=5,
        diversity=0.7
    )

    final_keywords.append({'cluster': cluster, 'keywords': selected_keywords})

# Konversi ke DataFrame
mmr_keywords_df = pd.DataFrame(final_keywords)
mmr_keywords_df.head()


Unnamed: 0,cluster,keywords
0,0.0,"[game, bullettime, alan, wake, maxs]"
1,1.0,"[vr, immense, set, alyx, gorn]"
2,2.0,"[pc, wolf, excellent, worlf, lag]"
3,3.0,"[ezios, parkour, since, series, slog]"
4,4.0,"[ezios, mature, series, naval, entry]"


In [8]:
# Contoh output
print(mmr_keywords_df)

# Simpan jika perlu
mmr_keywords_df.to_csv("final_topic_keywordsv2.csv", index=False)


     cluster                                           keywords
0        0.0               [game, bullettime, alan, wake, maxs]
1        1.0                     [vr, immense, set, alyx, gorn]
2        2.0                  [pc, wolf, excellent, worlf, lag]
3        3.0              [ezios, parkour, since, series, slog]
4        4.0              [ezios, mature, series, naval, entry]
..       ...                                                ...
364    364.0   [production, interested, storm, survive, method]
365    365.0  [character, borderland, fucking, principle, mr...
366    366.0  [character, nauseum, throughout, polished, bri...
367    367.0   [choose, sitandwait, compendium, building, ucke]
368    368.0     [building, different, xcom, diplomacy, banish]

[369 rows x 2 columns]


In [9]:
import ssl
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Sekarang bisa download
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divaoncom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
import nltk

# Unduh tokenizer NLTK jika belum tersedia
nltk.download('punkt')

# Tokenisasi dokumen gabungan per topik (hasil dari docs_per_topic)
tokenized_topics = [word_tokenize(doc.lower()) for doc in docs_per_topic['cleaned_Reviews']]

# Buat dictionary dan corpus untuk Gensim
dictionary = Dictionary(tokenized_topics)
corpus = [dictionary.doc2bow(text) for text in tokenized_topics]

# Ambil top-N kata dari c-TF-IDF untuk masing-masing topik
top_n = 10  # kamu bisa sesuaikan nilainya
top_words_per_topic = []
for idx, row in ctfidf_df.drop(columns=['cluster']).iterrows():
    sorted_words = row.sort_values(ascending=False)
    top_words = sorted_words.head(top_n).index.tolist()
    top_words_per_topic.append(top_words)

# Hitung coherence score menggunakan metric 'c_v'
coherence_model = CoherenceModel(
    topics=top_words_per_topic,
    texts=tokenized_topics,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model.get_coherence()

# Tampilkan hasil
print(f"Topic Coherence Score (c_v): {coherence_score:.4f}")


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The c

Topic Coherence Score (c_v): 0.5796
