In [1]:
import ssl
import nltk

# Replace the default HTTPS context factory with the unverified one when the
# running Python exposes it (presumably to get past certificate errors while
# NLTK downloads its data).
# NOTE(review): this assignment is process-wide, so certificate verification
# is off for every default HTTPS context created afterwards in this kernel,
# not only for the NLTK downloader.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Downloads can proceed now
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divaoncom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import libraries
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
import pickle

# Fetch the NLTK resources used below (tokenizer data and stop-word lists)
for _resource in ("punkt", "stopwords"):
    nltk.download(_resource)

# Load the data
df = pd.read_csv("Cleaned_Reviews.csv")

# Combined stop words: NLTK's English list plus the extra words listed here
domain_stopwords = {
    "game", "games", "player", "play", "played", "playing",
    "fun", "good", "great", "awesome", "cool",
}
nltk_stopwords = set(stopwords.words("english"))
custom_stopwords = nltk_stopwords | domain_stopwords

# Preprocessing
def preprocess(text):
    """Turn one review into a list of filtered, lower-case tokens.

    The input is converted with ``str()``, lower-cased, and every character
    that is not an ASCII letter or whitespace is deleted (not replaced by a
    space). The remainder is split with NLTK's ``word_tokenize``; tokens that
    are in ``custom_stopwords`` or have two characters or fewer are dropped.

    Args:
        text: The raw review; any object accepted by ``str()``.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)

    kept = []
    for word in word_tokenize(letters_only):
        # Skip very short tokens and anything on the combined stop-word list.
        if len(word) <= 2 or word in custom_stopwords:
            continue
        kept.append(word)
    return kept

# Seed topics: one list of seed terms per theme, handed to BERTopic below
gameplay_seeds = [
    "gameplay", "mechanics", "combat system", "interaction",
    "controls", "gameplay features", "game flow",
]
performance_seeds = [
    "bugs", "lag", "fps", "frame rate", "crashes",
    "performance issues", "optimization", "game stability",
]
scoring_seeds = [
    "score", "leaderboard", "ranking", "points system",
    "levels", "scoreboard", "achievements",
]
multiplayer_seeds = [
    "multiplayer", "co-op", "community", "online play",
    "player interaction", "multiplayer mode", "social features",
]
character_seeds = [
    "character design", "npc", "character customization",
    "avatars", "design features", "customizable characters",
]
audio_seeds = [
    "soundtrack", "bgm", "audio design", "background music",
    "sound effects", "melody", "soundtrack quality",
]
story_seeds = [
    "plot", "narrative", "storyline", "story arc", "game plot",
    "ending", "character development", "game narrative",
]

seed_topic_list = [
    gameplay_seeds,
    performance_seeds,
    scoring_seeds,
    multiplayer_seeds,
    character_seeds,
    audio_seeds,
    story_seeds,
]

# Sentence-embedding model and the n-gram count vectorizer
embedding_model = SentenceTransformer("all-mpnet-base-v2")
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    stop_words=list(custom_stopwords),
)

# BERTopic model, seeded with seed_topic_list and using the
# KeyBERTInspired representation
representation_model = KeyBERTInspired()
topic_model = BERTopic(
    language="english",
    min_topic_size=20,
    seed_topic_list=seed_topic_list,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    verbose=True,
)

# Fit the topic model on the reviews and store each document's topic id
# NOTE(review): astype(str) renders missing values as the literal text "nan";
# confirm the Cleaned_Review column has no NaNs, otherwise those rows are
# modelled as the document "nan".
documents = df["Cleaned_Review"].astype(str).tolist()
topics, _ = topic_model.fit_transform(documents)
df["Topic"] = topics

# Tokenize the same documents for the later coherence evaluation
tokenized_docs = list(map(preprocess, documents))

# ===== Save model & data =====
for _pickle_path, _payload in (
    ("topic_model.pkl", topic_model),
    ("tokenized_docs.pkl", tokenized_docs),
):
    with open(_pickle_path, "wb") as f:
        pickle.dump(_payload, f)

df.to_csv("DataPerGameTopics.csv", index=False)
print("📁 Model dan tokenisasi disimpan. Siap untuk perhitungan coherence di Cell 2.")


AttributeError: module 'openai' has no attribute 'OpenAI'

In [None]:
# Import KeyBERT
from keybert import KeyBERT

# Label every topic of a fitted model with keywords picked by KeyBERT
def label_topics_with_keybert(model, documents, n_keywords=3, kw_model=None):
    """Build one keyword-based label per topic of a fitted BERTopic model.

    Labels are produced for the topic ids actually reported by
    ``model.get_topic_info()``, in ascending id order. Iterating
    ``range(number_of_topics)`` instead would skip the outlier topic ``-1``
    (when present), shift every label by one position, and finally ask for a
    topic id that does not exist.

    Args:
        model: Fitted topic model exposing ``get_topic_info()`` (with a
            ``"Topic"`` column) and ``get_topic(topic_id)``.
        documents: Unused; kept so existing callers keep working.
        n_keywords: Maximum number of keywords joined into each label.
        kw_model: Optional object with a KeyBERT-compatible
            ``extract_keywords`` method. A fresh ``KeyBERT()`` is created
            when omitted.

    Returns:
        list[str]: One label per topic id, ordered by ascending topic id.
        A topic without any words gets an empty string.
    """
    if kw_model is None:
        kw_model = KeyBERT()

    # Use the real topic ids; they may start at -1 rather than 0.
    topic_ids = sorted(int(topic_id) for topic_id in model.get_topic_info()["Topic"])

    topic_labels = []
    for topic_id in topic_ids:
        # get_topic may return a falsy value for an id it does not know.
        topic_words = model.get_topic(topic_id) or []
        # Drop empty words so they do not pad the text handed to KeyBERT.
        top_keywords = [word for word, _ in topic_words if word]
        if not top_keywords:
            topic_labels.append("")
            continue

        # Let KeyBERT choose diverse keywords (MMR) among the topic's own words.
        extracted = kw_model.extract_keywords(
            " ".join(top_keywords),
            top_n=n_keywords,
            use_mmr=True,
            diversity=0.7,
        )
        topic_labels.append(", ".join(word for word, _ in extracted))

    return topic_labels

# Load the model saved by the earlier cell
# NOTE: pickle.load can execute arbitrary code; only open pickle files that
# this notebook produced itself.
with open("topic_model.pkl", "rb") as f:
    topic_model = pickle.load(f)

# Label the topics with KeyBERT and persist the model only when that worked,
# so "topic_model_with_labels.pkl" is never written without labels.
try:
    # Generate one label per topic
    topic_labels = label_topics_with_keybert(topic_model, documents)

    # Attach the labels to the model
    topic_model.set_topic_labels(topic_labels)
except Exception as e:
    print("⚠️ Gagal membuat label LLM dengan KeyBERT: ", e)
else:
    print("✅ LLM-based topic labeling menggunakan KeyBERT berhasil.")

    # Save the model together with its new topic labels
    with open("topic_model_with_labels.pkl", "wb") as f:
        pickle.dump(topic_model, f)


In [None]:
# Compute a coherence score for the fitted topic model
from bertopic import BERTopic
# NOTE(review): bertopic.vectorizers does not appear to export a
# CountVectorizer (cell 2 imports it from sklearn.feature_extraction.text);
# confirm this import resolves. The name is not used in this cell.
from bertopic.vectorizers import CountVectorizer
# NOTE(review): cosine_similarity is not used in this cell.
from sklearn.metrics.pairwise import cosine_similarity

# Compute the coherence score
# NOTE(review): get_coherence() does not appear in BERTopic's documented API,
# so this call presumably raises AttributeError; verify against the installed
# version. Cell 2 stores tokenized_docs.pkl for a coherence calculation, which
# suggests the score was meant to be computed from those tokens with a
# separate coherence implementation. TODO confirm the intended metric.
coherence_score = topic_model.get_coherence()

print(f"Coherence Score: {coherence_score}")
