In [5]:
import ssl
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Sekarang bisa download
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divaoncom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
pip install bertopic sentence-transformers pandas scikit-learn

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel


def load_data_from_file(path: str = "PreprocessedReviews.csv") -> pd.DataFrame:
    """Membaca CSV dan memastikan kolom yang dibutuhkan tersedia."""
    df = pd.read_csv(path)
    expected_cols = {"UserID", "Game", "Genre", "Review", "Cleaned_Review"}
    missing = expected_cols.difference(df.columns)
    if missing:
        raise ValueError(f"Kolom berikut tidak ditemukan di CSV: {missing}")
    return df


def build_model(df: pd.DataFrame) -> tuple[BERTopic, list[int]]:
    """Melatih BERTopic di seluruh korpus ulasan."""
    vectorizer_model = CountVectorizer(
        ngram_range=(1, 2),
        stop_words="english",  # ubah karena review berbahasa Inggris
    )
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        vectorizer_model=vectorizer_model,
        language="english",
        calculate_probabilities=False,
        verbose=True,
        min_topic_size=20,
    )

    docs = df["Cleaned_Review"].astype(str).tolist()
    topics, _ = topic_model.fit_transform(docs)
    df["Topic"] = topics
    return topic_model, topics


def aggregate_topics_by_game(df: pd.DataFrame, topic_model: BERTopic, top_n: int = 3) -> pd.DataFrame:
    """Menghasilkan tabel Game -> daftar topik terpopuler (diberi label kata kunci)."""
    topic_info = topic_model.get_topic_info()
    id2label = {
        row["Topic"]: (
            ", ".join([w for w, _ in topic_model.get_topic(row["Topic"])[:5]])
            if row["Topic"] != -1
            else "Outlier"
        )
        for _, row in topic_info.iterrows()
    }

    def mode_topics(series: pd.Series):
        top_ids = (
            series[series != -1]
            .value_counts()
            .head(top_n)
            .index.tolist()
        )
        return "; ".join(id2label[i] for i in top_ids)

    summary = (
        df.groupby("Game")["Topic"]
        .apply(mode_topics)
        .reset_index(name="Top Topics")
        .sort_values("Game")
    )
    return summary


def compute_coherence_score(topic_model: BERTopic, documents: list[str]) -> float:
    """Menghitung coherence score menggunakan Gensim."""
    topics = topic_model.get_topics()
    topic_words = [[word for word, _ in words] for _, words in topics.items() if _ != -1]

    tokenized_docs = [doc.split() for doc in documents]
    dictionary = Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=tokenized_docs,
        corpus=corpus,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()


# Contoh pemanggilan di Jupyter Notebook
if __name__ == "__main__":
    df = load_data_from_file("PreprocessedReviews.csv")
    print("[1/4] Data dimuat")

    topic_model, _ = build_model(df)
    print("[2/4] Model dilatih")

    coherence = compute_coherence_score(topic_model, df["Cleaned_Review"].astype(str).tolist())
    print(f"[3/4] Coherence Score: {coherence:.4f}")

    result = aggregate_topics_by_game(df, topic_model, top_n=3)
    print("[4/4] Topik per game berhasil dihasilkan")

    result.to_csv("TopTopicsByGame.csv", index=False)
    print("[5/5] Hasil disimpan sebagai TopTopicsByGame.csv")

    display(result.head())

2025-06-02 20:54:48,224 - BERTopic - Embedding - Transforming documents to embeddings.


[1/4] Data dimuat


Batches:   0%|          | 0/299 [00:00<?, ?it/s]

2025-06-02 20:55:17,349 - BERTopic - Embedding - Completed ✓
2025-06-02 20:55:17,352 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-02 20:55:21,687 - BERTopic - Dimensionality - Completed ✓
2025-06-02 20:55:21,691 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-02 20:55:21,992 - BERTopic - Cluster - Completed ✓
2025-06-02 20:55:22,003 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-02 20:55:22,873 - BERTopic - Representation - Completed ✓


[2/4] Model dilatih


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[3/4] Coherence Score: 0.5063
[4/4] Topik per game berhasil dihasilkan
[5/5] Hasil disimpan sebagai TopTopicsByGame.csv


Unnamed: 0,Game,Top Topics
0,#KILLALLZOMBIES,"roguelite, roguelike, fun, twin stick, twin"
1,#monstercakes,
2,(the) Gnorp Apologue,"building, city, game, base, fun"
3,*NEW* SCUFFED EPIC BHOP SIMULATOR 2023 (POG CH...,"nan, good good, nan nan, good, nan good"
4,---Red---Tether-->,


In [1]:
# bertopic_per_game.py
"""
Topic modelling BERTopic **per game** (bukan global) untuk data Steam reviews.

Fitur kunci
-----------
1. **Per-Game Modelling** – Setiap game dianalisis sendiri-sendiri agar topik benar-benar merefleksikan review game tersebut.
2. **Embedding Lebih Hemat Memori** – Default `all-MiniLM-L6-v2` (bisa ganti ke `all-mpnet-base-v2` kalau VRAM cukup).
3. **Stop-words Kustom** – Stop word English + kata umum domain game ("game", "play", "player", "good", "fun", dll.).
4. **Batas Minimal Review** – Hanya memproses game dengan ≥ `MIN_REVIEWS` review bersih.
5. **Output CSV** – `PerGameTopics.csv` berisi kolom `Game`, `Top Topics` (3 teratas) + `Coherence`.

> **Instalasi**  
> ```bash
> pip install bertopic sentence-transformers pandas scikit-learn gensim tqdm umap-learn
> ```

---
"""

from __future__ import annotations

import gc
import warnings
from pathlib import Path
from typing import List, Dict

import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
import umap
from tqdm import tqdm
import torch
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel

##########################
# Konfigurasi
##########################
CSV_PATH = "PreprocessedReviews.csv"  # nama file input
OUTPUT_CSV = "PerGameTopics.csv"      # nama file output
MIN_REVIEWS = 3                       # minimal review bersih per game
TOP_N = 3                             # berapa topik teratas yang disimpan
# Pilih model embedding. MiniLM jauh lebih ringan; ganti jika perlu.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # atau "all-mpnet-base-v2" jika VRAM cukup
DEVICE = "cpu"  # "cpu" | "mps" | "cuda"

# Stop word tambahan spesifik game
DOMAIN_STOPWORDS = {
    # kata umum yang sebelumnya sudah ada
    "game", "games", "play", "playing", "player", "players", "good", "fun", "really",
    "like", "just", "get", "got", "make", "makes", "made", "time", "one", "also",
    # kata tambahan agar topik tidak bias kata generik game
    "steam", "hours", "hour", "minutes", "minute", "update", "updates", "bug", "bugs",
    "graphics", "story", "level", "levels", "mission", "missions", "enemy", "enemies",
    "character", "characters", "weapon", "weapons", "map", "maps", "lot", "many",
    "still", "even", "would", "could", "should", "well"
}
STOPWORDS = DOMAIN_STOPWORDS.union(ENGLISH_STOP_WORDS)

##########################
# Fungsi utilitas
##########################

def load_data(path: str | Path = CSV_PATH) -> pd.DataFrame:
    """Muat CSV dan validasi kolom yang dibutuhkan."""
    df = pd.read_csv(path)
    required = {"Game", "Cleaned_Review"}
    missing = required.difference(df.columns)
    if missing:
        raise ValueError(f"Kolom hilang: {missing}")
    return df


def build_topic_model(num_docs: int) -> BERTopic:
    """Bangun BERTopic dengan parameter dinamis tergantung banyaknya dokumen."""
    # Embedding
    embedder = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)

    # Vectorizer
    vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        stop_words=list(STOPWORDS),
    )

    # UMAP – sesuaikan n_neighbors agar <= num_docs - 1 untuk mencegah error k >= N
    n_neighbors = max(2, min(15, num_docs - 1))
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        metric="cosine",
        random_state=42,
    )

    # min_topic_size tidak boleh melebihi num_docs
    min_topic_size = max(2, min(5, num_docs))

    return BERTopic(
        embedding_model=embedder,
        vectorizer_model=vectorizer,
        umap_model=umap_model,
        language="english",
        calculate_probabilities=False,
        verbose=False,
        min_topic_size=min_topic_size,
    )


def label_top_n(topic_model: BERTopic, top_n: int = TOP_N) -> str:
    """Gabungkan label `top_n` topik terpopuler sebagai satu string."""
    info = topic_model.get_topic_info()
    info = info[info["Topic"] != -1].nlargest(top_n, "Count")
    labels: List[str] = []
    for tid in info.Topic:
        words = ", ".join([w for w, _ in topic_model.get_topic(tid)[:5]])
        labels.append(words)
    return "; ".join(labels)


def coherence_score(topic_model: BERTopic, docs: List[str]) -> float:
    """Hitung coherence score (c_v) untuk model dan dokumen yang diberikan."""
    tokens = [doc.lower().split() for doc in docs]
    topics_dict = topic_model.get_topics()
    top_words = [[word for word, _ in topics_dict[tid][:10]] for tid in topics_dict if tid != -1]
    if not top_words:
        return 0.0
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(text) for text in tokens]
    cm = CoherenceModel(topics=top_words, texts=tokens, dictionary=dictionary, coherence="c_v")
    return cm.get_coherence()


def process_game(game: str, docs: List[str]) -> Dict[str, str | float]:
    """Latih model, kembalikan label topik teratas dan coherence score."""
    model = build_topic_model(len(docs))
    try:
        model.fit(docs)
        label = label_top_n(model)
        coh = coherence_score(model, docs)
    except Exception as e:
        warnings.warn(f"{game}: {e}")
        label = "Insufficient data"
        coh = 0.0
    finally:
        # Bersihkan memori setiap iterasi untuk mencegah OOM
        del model
        gc.collect()
        if DEVICE == "mps" and torch.backends.mps.is_available():
            torch.mps.empty_cache()
    return {"Game": game, "Top Topics": label, "Coherence": coh}

##########################
# Pipeline utama
##########################

def main() -> None:
    print("[1/4] Memuat data …")
    df = load_data()

    print("[2/4] Mengelompokkan review per game …")
    grouped: Dict[str, List[str]] = (
        df.groupby("Game")["Cleaned_Review"].apply(list).to_dict()
    )

    print("[3/4] Melatih BERTopic untuk tiap game …")
    results: List[Dict[str, str | float]] = []
    for game, docs in tqdm(grouped.items(), desc="Games"):
        if len(docs) < MIN_REVIEWS:
            continue
        results.append(process_game(game, docs))

    if not results:
        raise RuntimeError("Tidak ada game yang memenuhi syarat MIN_REVIEWS.")

    print("[4/4] Menyimpan hasil …")
    pd.DataFrame(results).sort_values("Game").to_csv(OUTPUT_CSV, index=False)
    print(f"Selesai! Hasil tersimpan di {OUTPUT_CSV}")


if __name__ == "__main__":
    main()


Couldn't import dot_parser, loading of dot files will not be possible.
[1/4] Memuat data …
[2/4] Mengelompokkan review per game …
[3/4] Melatih BERTopic untuk tiap game …


Games:   0%|          | 0/5064 [00:00<?, ?it/s]OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.

k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.


k >= N for N * N square matrix. Attempting to use scipy.linalg.ei

[4/4] Menyimpan hasil …
Selesai! Hasil tersimpan di PerGameTopics.csv





In [7]:
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
import pickle

# Unduh resource NLTK
nltk.download("punkt")
nltk.download("stopwords")

# Load Data
df = pd.read_csv("Cleaned_Reviews.csv")

# Stopwords gabungan
nltk_stopwords = set(stopwords.words("english"))
custom_stopwords = nltk_stopwords.union({
    "game", "games", "player", "play", "played", "playing", "fun", "good", "great", "awesome", "cool"
})

# Preprocessing
def preprocess(text):
    text = re.sub(r"[^a-zA-Z\s]", "", str(text).lower())
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in custom_stopwords and len(token) > 2]
    return tokens

# Seed Topics
seed_topic_list = [
    ["gameplay", "mechanics", "combat system", "interaction", "controls", "gameplay features", "game flow"],
    ["bugs", "lag", "fps", "frame rate", "crashes", "performance issues", "optimization", "game stability"],
    ["score", "leaderboard", "ranking", "points system", "levels", "scoreboard", "achievements"],
    ["multiplayer", "co-op", "community", "online play", "player interaction", "multiplayer mode", "social features"],
    ["character design", "npc", "character customization", "avatars", "design features", "customizable characters"],
    ["soundtrack", "bgm", "audio design", "background music", "sound effects", "melody", "soundtrack quality"],
    ["plot", "narrative", "storyline", "story arc", "game plot", "ending", "character development", "game narrative"]
]

# Embedding model & vectorizer
embedding_model = SentenceTransformer("all-mpnet-base-v2")
vectorizer_model = CountVectorizer(stop_words=list(custom_stopwords), ngram_range=(1, 2))

# BERTopic Model
representation_model = KeyBERTInspired()
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    seed_topic_list=seed_topic_list,
    language="english",
    verbose=True,
    min_topic_size=20,
    representation_model=representation_model
)

# Proses dokumen
documents = df["Cleaned_Review"].astype(str).tolist()
topics, _ = topic_model.fit_transform(documents)
df["Topic"] = topics

# Tokenisasi untuk evaluasi coherence
tokenized_docs = [preprocess(doc) for doc in documents]

# ===== LLM-Based Topic Labeling =====
try:
    topic_labels = topic_model.generate_topic_labels(documents)
    topic_model.set_topic_labels(topic_labels)
    print("✅ LLM-based topic labeling berhasil.")
except Exception as e:
    print("⚠️ Gagal membuat label LLM: ", e)

# ===== Simpan model & data untuk Cell berikutnya =====
with open("topic_model.pkl", "wb") as f:
    pickle.dump(topic_model, f)

with open("tokenized_docs.pkl", "wb") as f:
    pickle.dump(tokenized_docs, f)

df.to_csv("DataPerGameTopics.csv", index=False)
print("📁 Model dan tokenisasi disimpan. Siap untuk perhitungan coherence di Cell 2.")


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divaoncom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-06-03 12:05:47,648 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/364 [00:00<?, ?it/s]

2025-06-03 12:09:21,471 - BERTopic - Embedding - Completed ✓
2025-06-03 12:09:21,476 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-03 12:09:21,888 - BERTopic - Guided - Completed ✓
2025-06-03 12:09:21,899 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 12:09:26,350 - BERTopic - Dimensionality - Completed ✓
2025-06-03 12:09:26,355 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-03 12:09:26,723 - BERTopic - Cluster - Completed ✓
2025-06-03 12:09:26,748 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-03 12:10:10,251 - BERTopic - Representation - Completed ✓


⚠️ Gagal membuat label LLM:  slice indices must be integers or None or have an __index__ method
📁 Model dan tokenisasi disimpan. Siap untuk perhitungan coherence di Cell 2.


# Fixed BERTopic for Games

In [14]:
pip install bertopic

Defaulting to user installation because normal site-packages is not writeable
Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Downloading bertopic-0.17.0-py3-none-any.whl (150 kB)
Installing collected packages: bertopic
Successfully installed bertopic-0.17.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Import libraries
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
import pickle

# Unduh resource NLTK
nltk.download("punkt")
nltk.download("stopwords")

# Load Data
df = pd.read_csv("Cleaned_Reviews.csv")

# Stopwords gabungan
nltk_stopwords = set(stopwords.words("english"))
custom_stopwords = nltk_stopwords.union({
    "game", "games", "player", "play", "played", "playing", "fun", "good", "great", "awesome", "cool"
})

# Preprocessing
def preprocess(text):
    text = re.sub(r"[^a-zA-Z\s]", "", str(text).lower())
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in custom_stopwords and len(token) > 2]
    return tokens

# Seed Topics
seed_topic_list = [
    ["gameplay", "mechanics", "combat system", "interaction", "controls", "gameplay features", "game flow"],
    ["bugs", "lag", "fps", "frame rate", "crashes", "performance issues", "optimization", "game stability"],
    ["score", "leaderboard", "ranking", "points system", "levels", "scoreboard", "achievements"],
    ["multiplayer", "co-op", "community", "online play", "player interaction", "multiplayer mode", "social features"],
    ["character design", "npc", "character customization", "avatars", "design features", "customizable characters"],
    ["soundtrack", "bgm", "audio design", "background music", "sound effects", "melody", "soundtrack quality"],
    ["plot", "narrative", "storyline", "story arc", "game plot", "ending", "character development", "game narrative"]
]

# Embedding model & vectorizer
embedding_model = SentenceTransformer("all-mpnet-base-v2")
vectorizer_model = CountVectorizer(stop_words=list(custom_stopwords), ngram_range=(1, 2))

# BERTopic Model
representation_model = KeyBERTInspired()
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    seed_topic_list=seed_topic_list,
    language="english",
    verbose=True,
    min_topic_size=20,
    representation_model=KeyBERTInspired()  # Pastikan tidak ada dependensi OpenAI
)

# Proses dokumen
documents = df["Cleaned_Review"].astype(str).tolist()
topics, _ = topic_model.fit_transform(documents)
df["Topic"] = topics

# Tokenisasi untuk evaluasi coherence
tokenized_docs = [preprocess(doc) for doc in documents]

# ===== Simpan model & data =====
with open("topic_model.pkl", "wb") as f:
    pickle.dump(topic_model, f)

with open("tokenized_docs.pkl", "wb") as f:
    pickle.dump(tokenized_docs, f)

df.to_csv("DataPerGameTopics.csv", index=False)
print("📁 Model dan tokenisasi disimpan. Siap untuk perhitungan coherence di Cell 2.")


[nltk_data] Downloading package punkt to /Users/divaoncom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divaoncom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-06-03 16:30:22,193 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/364 [00:00<?, ?it/s]

2025-06-03 16:37:05,847 - BERTopic - Embedding - Completed ✓
2025-06-03 16:37:05,853 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-03 16:37:06,543 - BERTopic - Guided - Completed ✓
2025-06-03 16:37:06,545 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-06-03 16:37:20,986 - BERTopic - Dimensionality - Completed ✓
2025-06-03 16:37:20,988 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-03 16:37:21,315 - BERTopic - Cluster - Completed ✓
2025-06-03 16:37:21,323 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-03 16:38:07,102 - BERTopic - Representation - Completed ✓


📁 Model dan tokenisasi disimpan. Siap untuk perhitungan coherence di Cell 2.


In [22]:
pip install tqdm

Python(52064) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Cell 1: Melabeli topik dengan DistilGPT-2
import pickle
import pandas as pd
from tqdm import tqdm
from transformers import pipeline

# Memuat model BERTopic yang telah dilatih
with open("topic_model.pkl", "rb") as f:
    topic_model = pickle.load(f)

# Fungsi labeling topik dengan DistilGPT-2
def label_topics_with_distilgpt2(model, n_keywords=5):
    topic_labels = {}
    generator = pipeline('text-generation', model='distilgpt2')

    # Ambil daftar topik dari model
    topics_dict = model.get_topics()

    for topic_id, word_weights in tqdm(topics_dict.items(), desc="Labeling topics", unit="topic"):
        if topic_id == -1:
            continue  # Lewati outlier topic

        word_list = [word for word, _ in word_weights[:n_keywords]]
        prompt = " ".join(word_list)

        try:
            response = generator(prompt, max_length=20, num_return_sequences=1)
            label = response[0]['generated_text'].strip()
            topic_labels[topic_id] = label
            print(f"Topik {topic_id}: ✅ Label → {label}")
        except Exception as e:
            print(f"Topik {topic_id}: ❌ Error → {e}")
            topic_labels[topic_id] = f"Topik {topic_id}"

    return topic_labels

# Jalankan fungsi labeling
topic_labels = label_topics_with_distilgpt2(topic_model)

# Tetapkan label ke BERTopic model
topic_model.set_topic_labels(topic_labels)

print("✅ Label topik berhasil disimpan ke dalam model.")


Device set to use mps:0
Labeling topics:   0%|          | 0/92 [00:00<?, ?topic/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:   2%|▏         | 2/92 [00:06<04:46,  3.19s/topic]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 0: ✅ Label → xcom battlefield cod gameplay strategy games.


Labeling topics:   3%|▎         | 3/92 [00:07<03:28,  2.34s/topic]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 1: ✅ Label → gameplay rpg characters review bad decisions.


Labeling topics:   4%|▍         | 4/92 [00:07<02:19,  1.59s/topic]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 2: ✅ Label → downloadable content content downloadable dlcs dlc season pass


Labeling topics:   5%|▌         | 5/92 [00:08<01:41,  1.17s/topic]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 3: ✅ Label → puzzles adventure puzzle portal challenging new challenges


"The biggest question I keep returning is


Labeling topics:   7%|▋         | 6/92 [00:08<01:26,  1.00s/topic]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 4: ✅ Label → keeps crashing glitching constantly crashes crashes lag, but at this point I assume that it will fail


Labeling topics:   8%|▊         | 7/92 [00:09<01:09,  1.22topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:   9%|▊         | 8/92 [00:09<00:51,  1.62topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 5: ✅ Label → nan nan nan nan darn tootin nan tootin nan tootin nan tootin nan tootin nan
Topik 6: ✅ Label → horror experience horror silent hill psychological horror survival horror fiction


Labeling topics:  10%|▉         | 9/92 [00:09<00:40,  2.04topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 7: ✅ Label → hearthstone deckbuilders deck decks deckbuilder deckbuilders build your strategies


Labeling topics:  11%|█         | 10/92 [00:10<00:43,  1.90topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  12%|█▏        | 11/92 [00:10<00:33,  2.44topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 8: ✅ Label → cancer disorders diseases disease personality disease and disability disability
Topik 9: ✅ Label → 1010 1010 amazing 1010 would 1010 best 1010 1010 best 1010 best 101


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 10: ✅ Label → po czasie en en iyi bir oyun bu oyunu.

Oy


Labeling topics:  14%|█▍        | 13/92 [00:10<00:23,  3.35topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  15%|█▌        | 14/92 [00:10<00:19,  3.94topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 11: ✅ Label → city builder base building build recommend early access to property market from local developers and developers alike. Build is
Topik 12: ✅ Label → roguelike roguelikes like roguelike roguelite roguelites like roguelike


Labeling topics:  16%|█▋        | 15/92 [00:11<00:17,  4.32topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 13: ✅ Label → gameplay sequel deus ex original adventures remaster.


Labeling topics:  17%|█▋        | 16/92 [00:11<00:17,  4.44topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 14: ✅ Label → well gam well well enough enough boy gam a bit, but it's something you can't do without
Topik 15: ✅ Label → best coop coop experience coop coop coop mission coop coop experience coop


Labeling topics:  20%|█▉        | 18/92 [00:11<00:13,  5.47topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  21%|██        | 19/92 [00:11<00:13,  5.50topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 16: ✅ Label → best ever ever greatest best ive greatest favorite best 1 month ago
Topik 17: ✅ Label → boardgame extremely addictive addictive highly recommend learning curve, but for a beginner in any type of gaming,


Labeling topics:  22%|██▏       | 20/92 [00:11<00:12,  5.60topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  23%|██▎       | 21/92 [00:12<00:12,  5.67topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 18: ✅ Label → amazing start nice nice nice nice story really nice fun fun story nice nice nice nice nice story good
Topik 19: ✅ Label → kid named named wheely named snail wheely kid grass kid cat named leopard-type


Labeling topics:  24%|██▍       | 22/92 [00:12<00:19,  3.52topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  25%|██▌       | 23/92 [00:12<00:17,  3.93topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 20: ✅ Label → shadowrunning samurai samurai invasion war emperor samurai emperor samurai samurai emperor samurai emperor samurai samurai samurai samurai samurai samurai
Topik 21: ✅ Label → like funnny funnny hours entertainment funnny lov hogwarts come on the board of that game!


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  27%|██▋       | 25/92 [00:13<00:13,  4.90topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 22: ✅ Label → meh meh bad meh meh solid meh meh basically fuck me, oh m
Topik 23: ✅ Label → bullets reload rest bullets innovative shooter person shooter tactical shooter tactical shooter sniper shooter combat RPG action shooter tactical


Labeling topics:  28%|██▊       | 26/92 [00:13<00:13,  4.74topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  29%|██▉       | 27/92 [00:13<00:13,  4.95topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 24: ✅ Label → gameplay novel characters recommend character choices for a particular character.
Topik 25: ✅ Label → story mode pilgrim saga walking dead narrative genre world world world of adventures and events


Labeling topics:  30%|███       | 28/92 [00:13<00:12,  5.09topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  32%|███▏      | 29/92 [00:13<00:11,  5.55topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 26: ✅ Label → combat system move infantry operation northwoods strategy combat







Echoing
Topik 27: ✅ Label → tis yup definitely worth yup bad yes goody buy me a bady, no goody


Labeling topics:  33%|███▎      | 30/92 [00:14<00:11,  5.43topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  34%|███▎      | 31/92 [00:14<00:09,  6.15topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 28: ✅ Label → ok fine ok ok perfect ok wonderful ok ok ok ok ok fine right ok fine ok ok ok ok
Topik 29: ✅ Label → perfect perfect perfect perfect fuuny fuuny perfect fuuny fuuny fuuny


Labeling topics:  35%|███▍      | 32/92 [00:14<00:10,  5.65topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  36%|███▌      | 33/92 [00:14<00:09,  6.20topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 30: ✅ Label → ea valve ea app servers devs in the last 10 years with my 3rd beta release. Also
Topik 31: ✅ Label → motorsport realistic driving much racing racing like mario karten - with a little 'ch


Labeling topics:  37%|███▋      | 34/92 [00:14<00:08,  6.76topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  38%|███▊      | 35/92 [00:14<00:08,  6.36topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 32: ✅ Label → gta gta4 gta5 gta online gta iv kyaa6 kya
Topik 33: ✅ Label → remastered looks remastered remaster remake gameplay: A look at the final look at the final look


Labeling topics:  39%|███▉      | 36/92 [00:14<00:08,  6.38topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  40%|████      | 37/92 [00:15<00:09,  5.93topic/s]

Topik 34: ✅ Label → excellent port port excellent terrible port nice port final fantasy port
Topik 35: ✅ Label → pay win free doesnt free pay free hacks to improve the service. If you have any feedback on the


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  41%|████▏     | 38/92 [00:15<00:09,  5.44topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 36: ✅ Label → balls balls balls hurt balls ball shot balls balls balls balls with balls from balls from balls from balls from


Labeling topics:  42%|████▏     | 39/92 [00:15<00:16,  3.22topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  43%|████▎     | 40/92 [00:16<00:14,  3.69topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 37: ✅ Label → 10 10 10 10 one one !! It‪^^- ^^^- ^^
Topik 38: ✅ Label → price indie price price hike indie cost already expensive price in the UK.


The deal is


Labeling topics:  45%|████▍     | 41/92 [00:16<00:12,  4.17topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 39: ✅ Label → building machines life furniture automate building characters free ai-dots. This allows you to easily create


Labeling topics:  46%|████▌     | 42/92 [00:16<00:11,  4.35topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 40: ✅ Label → yes yep yeah yes yes ye ha ha ha ha that he said yes a lot yep yes


Labeling topics:  47%|████▋     | 43/92 [00:16<00:11,  4.42topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  48%|████▊     | 44/92 [00:16<00:09,  4.85topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 41: ✅ Label → amazing incredible amazing amazing magnificent magnificent amazing amazing amazing amazing incredible awesome amazing amazing amazing amazing awesome amazing amazing
Topik 42: ✅ Label → xbox controller ps4 controller quest controllers controllers controller icons that give people some interesting and interesting features.


Labeling topics:  49%|████▉     | 45/92 [00:17<00:08,  5.53topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  50%|█████     | 46/92 [00:17<00:07,  5.83topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 43: ✅ Label → soundtrack old soundtrack murdering audio music dissapointing love soundtrack
Topik 44: ✅ Label → cops robbers splinter cell robocop swat mafia gang.


Labeling topics:  51%|█████     | 47/92 [00:17<00:07,  5.77topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  52%|█████▏    | 48/92 [00:17<00:06,  6.54topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 45: ✅ Label → 100 hours 1000 hours hours 20 hours 29 hours 23 minutes 30 minutes 30 minutes 35 minutes 34 minutes 36
Topik 46: ✅ Label → max paynes max payne payne max payne series payne max payne max payne


Labeling topics:  53%|█████▎    | 49/92 [00:17<00:06,  7.11topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  54%|█████▍    | 50/92 [00:17<00:05,  7.50topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 47: ✅ Label → brilliant rpg rpg fantastic amazing atmosphere football brilliant amazing amazing football nice graphics bad to work
Topik 48: ✅ Label → leaderboard see leaderboard leaderboard unlikely leaderboard end functional scoreboard


Labeling topics:  55%|█████▌    | 51/92 [00:17<00:05,  7.00topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 49: ✅ Label → positive review negative reviews negative review review positive reviews buy reviews Positive reviews No reviews A negative review I will
Topik 50: ✅ Label → best zombies zombie world zombies best horror zombiemassengemetzel best zombie zombies zombie death zombie


Labeling topics:  58%|█████▊    | 53/92 [00:18<00:05,  7.75topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  59%|█████▊    | 54/92 [00:18<00:04,  8.18topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 51: ✅ Label → gameplay stellar gameplay entertaining gameplay brilliant nice gameplay gameplay art and great gameplay gameplay and a high replay value
Topik 52: ✅ Label → esse jogo jogo tem desse jogo um jogo em termos pueh en


Labeling topics:  60%|█████▉    | 55/92 [00:18<00:05,  6.64topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  61%|██████    | 56/92 [00:18<00:05,  6.37topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 53: ✅ Label → everything ok fine ok alr ok ok nevermind ok fine ok ok ok
Topik 54: ✅ Label → tower defense tower defence rogue tower like tower best tower defense tower shield tower as well. We‏


Labeling topics:  62%|██████▏   | 57/92 [00:18<00:04,  7.05topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  63%|██████▎   | 58/92 [00:18<00:04,  6.94topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 55: ✅ Label → kingdom hearts infinitely replayable replayability onetime ride life enjoy repetitive, thrilling and fun time-
Topik 56: ✅ Label → soundtrack soundtracks failed soundtrackmusic background music fine music background music background music soundtracks failed soundtrack music


Labeling topics:  64%|██████▍   | 59/92 [00:18<00:04,  6.85topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  65%|██████▌   | 60/92 [00:19<00:04,  7.05topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 57: ✅ Label → halflife half life halflife source life series gameplay complex and high quality design and an engaging, entertaining
Topik 58: ✅ Label → vampire survivorslike vampire survivors survivors genre survivorslike themed vampire survivorslike vampires survivors genre survivorslike vampire


Labeling topics:  66%|██████▋   | 61/92 [00:19<00:04,  6.77topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  67%|██████▋   | 62/92 [00:19<00:04,  6.74topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 59: ✅ Label → doom 2016 doom eternal doom doom 12 well doom death doom doom doom 12 doom doom doom doom doom
Topik 60: ✅ Label → gooddddd back gooddddd back come come back to get those wannabe shadack


Labeling topics:  68%|██████▊   | 63/92 [00:19<00:04,  6.39topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  70%|██████▉   | 64/92 [00:19<00:04,  6.51topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 61: ✅ Label → zombie survival kill zombies best zombie zombies zombie-fall zombie-survival zombie-survival zombie
Topik 62: ✅ Label → resident evil evil resident evil remake re2 remake survival horror movie with great ending


Labeling topics:  71%|███████   | 65/92 [00:19<00:04,  6.27topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  72%|███████▏  | 66/92 [00:20<00:04,  6.47topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 63: ✅ Label → dark souls soulslike souls souls though two souls in a single path. Each time another soul or another
Topik 64: ✅ Label → brasil simulator simulator brazil simulator bolivia venezuela simulator serbia simulator simian sim


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  74%|███████▍  | 68/92 [00:20<00:03,  6.95topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 65: ✅ Label → slay spire slay spirelike spire spire like spire clone spire like sp
Topik 66: ✅ Label → rpg gameplay one gameplay pokemon included gameplay









What's


Labeling topics:  75%|███████▌  | 69/92 [00:20<00:03,  6.48topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 67: ✅ Label → really excellent excellent excellent excellent ass little ass-sized. This is the 3rd time you get such
Topik 68: ✅ Label → tables pinball pinball tables pinball fx pinball deluxe tables pinball table table


Labeling topics:  77%|███████▋  | 71/92 [00:20<00:02,  7.14topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  78%|███████▊  | 72/92 [00:20<00:02,  7.53topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 69: ✅ Label → best fallout fallout best love fallout fallout new first fallout fallout new first fallout new first fallout new first fallout
Topik 70: ✅ Label → sherlocks sherlock buy sherlock love detective sherlock holmes have been on sale since December


Labeling topics:  79%|███████▉  | 73/92 [00:20<00:02,  7.29topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  82%|████████▏ | 75/92 [00:21<00:01,  9.12topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 71: ✅ Label → simply yes wait wait wow simply nothing say god yeah look it's a lot... then its my
Topik 72: ✅ Label → simplified diablo diablo diablo diablo better diablo best arpgistuso
Topik 73: ✅ Label → best vr vr premise vr flight vr experience vr environment vr simulation vr
Topik 74: ❌ Error → Input length of input_ids is 20, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Labeling topics:  84%|████████▎ | 77/92 [00:21<00:01, 10.28topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 75: ✅ Label → totally recommend yes recommend recommend enjoy def worth worth totally on to do at 5th level, with
Topik 76: ✅ Label → robes nothing three robes new robes robes cosmetic pack is a great addition with it a more natural option


Labeling topics:  86%|████████▌ | 79/92 [00:21<00:01,  8.93topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 77: ✅ Label → kk gnorp kk gnorp key hunting kk gnorp bp kk


Labeling topics:  87%|████████▋ | 80/92 [00:22<00:02,  5.34topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  88%|████████▊ | 81/92 [00:22<00:02,  5.43topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 78: ✅ Label → nice nice nice    ____________________________ ____________________________
Posted on: Friday 02 2012
Topik 79: ✅ Label → worth buy buy worth totally worth worth money worth nothing for a month? I have only bought three of


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  90%|█████████ | 83/92 [00:22<00:01,  6.23topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 80: ✅ Label → assassins creed ezios story creed odyssey assasins creed ezio story creed
Topik 81: ✅ Label → thumb thumb thumb loads thumb thumb loads funny thumb to thumb in front of eye while keeping your fingers


Labeling topics:  91%|█████████▏| 84/92 [00:22<00:01,  6.35topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  92%|█████████▏| 85/92 [00:22<00:01,  6.50topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 82: ✅ Label → nope nah maybe nope maybe nah maybe nah maybe nah yes.
Topik 83: ✅ Label → thrill cheap thrill rather addicting friends boring thrill price, i will be disappointed.


Labeling topics:  93%|█████████▎| 86/92 [00:22<00:00,  6.16topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  95%|█████████▍| 87/92 [00:23<00:00,  6.50topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 84: ✅ Label → im addicted addicted actually addicted addiction im addiction to heroin addiction is a major issue affecting thousands because they also
Topik 85: ✅ Label → vraiment ce jeu ce que que je un jeu et jeu et voire ne
Topik 86: ❌ Error → Input length of input_ids is 20, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Labeling topics:  97%|█████████▋| 89/92 [00:23<00:00,  8.28topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics:  98%|█████████▊| 90/92 [00:23<00:00,  7.74topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Topik 87: ✅ Label → survival genre ark survival survival world survival survival crafting, survival and survival to explore the universe of
Topik 88: ✅ Label → neat neat neat wow smart neat wow story neat nice neat cute nice cool just wow good stuff super


Labeling topics:  99%|█████████▉| 91/92 [00:23<00:00,  7.49topic/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Labeling topics: 100%|██████████| 92/92 [00:23<00:00,  3.88topic/s]

Topik 89: ✅ Label → quick platformer adventure funloving characters journey dreamfall small adventure RPG exploring a unique world that is sure
Topik 90: ✅ Label → made multiplayer singleplayer multiplayer enjoyable friends coop one-on-one.





✅ Label topik berhasil disimpan ke dalam model.


In [4]:
# Cell 2: Assign topik ke setiap game
import pandas as pd
from tqdm import tqdm
import pickle

# Muat ulang model dan data jika perlu
with open("topic_model.pkl", "rb") as f:
    topic_model = pickle.load(f)

df = pd.read_csv("Cleaned_Reviews.csv")
documents = df["Cleaned_Review"].astype(str).tolist()

# Ambil hasil transformasi dari dokumen
topics = topic_model.transform(documents)[0]  # [0] = assigned topics

# Ambil label yang sudah diset
topic_labels = topic_model.topic_labels_


assigned_labels = []
for doc_id in tqdm(range(len(documents)), desc="Assigning top topics", unit="doc"):
    topic_id = topics[doc_id]  # ini bisa int atau -1

    if isinstance(topic_id, int) and topic_id in topic_labels:
        assigned_labels.append(topic_labels[topic_id])
    else:
        assigned_labels.append("Unknown")


df["Topic_Label"] = assigned_labels
df_top_topics = df[["Game", "Topic_Label"]]  

df_top_topics.to_csv("Top_Topics.csv", index=False)
print("📁 File 'Top_Topics.csv' berhasil disimpan.")


Batches:   0%|          | 0/364 [00:00<?, ?it/s]

Assigning top topics: 100%|██████████| 11647/11647 [00:00<00:00, 3244408.49doc/s]


📁 File 'Top_Topics.csv' berhasil disimpan.


In [7]:
pip install gensim

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pickle
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Load kembali objek dari Cell 1
with open("topic_model.pkl", "rb") as f:
    topic_model = pickle.load(f)

with open("tokenized_docs.pkl", "rb") as f:
    tokenized_docs = pickle.load(f)

# Ambil topik dari BERTopic
topics_words = topic_model.get_topics()

# Buat kamus dan corpus dari tokenized_docs
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Susun daftar topik (hanya kata yang ada di kamus)
topic_list = []
for tid, words in topics_words.items():
    if tid == -1:
        continue
    topic_tokens = [word for word, _ in words if word in dictionary.token2id]
    if topic_tokens:
        topic_list.append(topic_tokens)

# Hitung coherence score jika ada topik yang valid
if topic_list:
    coherence_model = CoherenceModel(
        topics=topic_list,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_score = coherence_model.get_coherence()
    print(f"\n🧠 Coherence Score (c_v): {coherence_score:.4f}")
else:
    print("❌ Tidak ada topik valid untuk evaluasi coherence.")


Python(55786) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(55788) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(55789) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadloc


🧠 Coherence Score (c_v): 0.6394


# Labeling with LLM

In [8]:
import requests
import time
from tqdm import tqdm
import pickle
import pandas as pd

# Muat model BERTopic
with open("topic_model.pkl", "rb") as f:
    topic_model = pickle.load(f)

# URL dan token untuk Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
HF_TOKEN = "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"

headers = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json"
}

# Fungsi untuk mengirim prompt ke LLM
def query_llm(prompt):
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 30,
            "temperature": 0.3,
            "do_sample": True
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    else:
        print("⚠️ Error:", response.status_code, response.text)
        return None

# Labeling semua topik
topics_dict = topic_model.get_topics()
topic_labels = {}
csv_data = []

for topic_id, words in tqdm(topics_dict.items(), desc="Labeling topics"):
    if topic_id == -1:
        continue  # Skip outlier topic
    
    # Ambil 5 kata kunci teratas
    top_words = [word for word, _ in words[:5]]
    keywords = ", ".join(top_words)
    
    prompt = (
        f"Buatlah label pendek (maksimal 5 kata) untuk topik berikut yang terdiri dari kata-kata: "
        f"{keywords}.\nLabel:"
    )
    
    result = query_llm(prompt)
    if result:
        label = result.split("Label:")[-1].strip().split("\n")[0]
        topic_labels[topic_id] = label
        print(f"Topik {topic_id} → {label}")
    else:
        label = f"Topik {topic_id}"
        topic_labels[topic_id] = label
    
    csv_data.append({
        "Topik": topic_id,
        "Keyword": keywords,
        "Label Topik": label
    })

    time.sleep(3)
# Setel label ke model
topic_model.set_topic_labels(topic_labels)

# Simpan ulang model dengan label
with open("topic_model_labeled.pkl", "wb") as f:
    pickle.dump(topic_model, f)

# Simpan hasil labeling ke CSV
df_topic_labels = pd.DataFrame(csv_data)
df_topic_labels.to_csv("Topic_Labels.csv", index=False)

print("✅ Label topik selesai dan disimpan ke topic_model_labeled.pkl dan Topic_Labels.csv")


Labeling topics:   0%|          | 0/92 [00:00<?, ?it/s]

Topik 0 → Fast-paced Action


Labeling topics:   2%|▏         | 2/92 [00:06<04:31,  3.01s/it]

Topik 1 → Brief


Labeling topics:   3%|▎         | 3/92 [00:11<06:11,  4.18s/it]

Topik 2 → DLC


Labeling topics:   4%|▍         | 4/92 [00:17<07:04,  4.83s/it]

Topik 3 → Puzzle Portal Adventure


Labeling topics:   5%|▌         | 5/92 [00:23<07:29,  5.17s/it]

Topik 4 → CrashFix


Labeling topics:   7%|▋         | 6/92 [00:30<08:15,  5.76s/it]

Topik 5 → Quick Fix


Labeling topics:   8%|▊         | 7/92 [00:36<08:13,  5.80s/it]

Topik 6 → Horror Survival Silent Hill Psychological


Labeling topics:   9%|▊         | 8/92 [00:42<08:10,  5.84s/it]

Topik 7 → Hearthstone Deckbuilders Decks Deckbuilder.


Labeling topics:  10%|▉         | 9/92 [00:48<08:07,  5.87s/it]

Topik 8 → Cancer Disorders Diseases Disease Personality


Labeling topics:  11%|█         | 10/92 [00:54<08:03,  5.89s/it]

Topik 9 → 1010 Amazing Would Best 1010


Labeling topics:  12%|█▏        | 11/92 [01:00<07:55,  5.87s/it]

Topik 10 → Quick Play English Easy Fun Single


Labeling topics:  13%|█▎        | 12/92 [01:05<07:47,  5.84s/it]

Topik 11 → Urban Planner


Labeling topics:  14%|█▍        | 13/92 [01:11<07:43,  5.86s/it]

Topik 12 → like roguelike with turn-


Labeling topics:  15%|█▌        | 14/92 [01:17<07:38,  5.88s/it]

Topik 13 → Remastered for


Labeling topics:  16%|█▋        | 15/92 [01:23<07:38,  5.96s/it]

Topik 14 → Short and Sweet


Labeling topics:  17%|█▋        | 16/92 [01:29<07:28,  5.91s/it]

Topik 15 → BEST COOP EXPERIENCE


Labeling topics:  18%|█▊        | 17/92 [01:37<08:10,  6.54s/it]

Topik 16 → _________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________


Labeling topics:  20%|█▉        | 18/92 [01:43<07:44,  6.28s/it]

Topik 17 → Boardgame Bliss


Labeling topics:  21%|██        | 19/92 [01:48<07:25,  6.10s/it]

Topik 18 → Amazing5


Labeling topics:  22%|██▏       | 20/92 [01:54<07:14,  6.03s/it]

Topik 19 → Quick Kids


Labeling topics:  23%|██▎       | 21/92 [02:00<07:04,  5.99s/it]

Topik 20 → Cyberpunk Samurai War


Labeling topics:  24%|██▍       | 22/92 [02:06<06:59,  5.99s/it]

Topik 21 → Quick Laughs


Labeling topics:  25%|██▌       | 23/92 [02:12<06:49,  5.94s/it]

Topik 22 → Meh Basically


Labeling topics:  26%|██▌       | 24/92 [02:18<06:50,  6.03s/it]

Topik 23 → 


Labeling topics:  27%|██▋       | 25/92 [02:24<06:39,  5.96s/it]

Topik 24 → Gameplay Novel Recommend


Labeling topics:  28%|██▊       | 26/92 [02:30<06:32,  5.94s/it]

Topik 25 → Narrative Adventures


Labeling topics:  29%|██▉       | 27/92 [02:36<06:24,  5.92s/it]

Topik 26 → Tactical Warfare


Labeling topics:  30%|███       | 28/92 [02:42<06:24,  6.00s/it]

Topik 27 → Highly Recomm


Labeling topics:  32%|███▏      | 29/92 [02:48<06:15,  5.96s/it]

Topik 28 → Quick & Easy


Labeling topics:  33%|███▎      | 30/92 [02:54<06:07,  5.92s/it]

Topik 29 → Perfectly Funny


Labeling topics:  34%|███▎      | 31/92 [03:00<06:01,  5.93s/it]

Topik 30 → EAVL Dev Servers


Labeling topics:  35%|███▍      | 32/92 [03:05<05:53,  5.88s/it]

Topik 31 → Realistic Racing Motorsport Mario Kart Much Racing


Labeling topics:  36%|███▌      | 33/92 [03:11<05:46,  5.87s/it]

Topik 32 → GTA Action


Labeling topics:  37%|███▋      | 34/92 [03:17<05:46,  5.97s/it]

Topik 33 → Remastered Edition.


Labeling topics:  38%|███▊      | 35/92 [03:23<05:35,  5.89s/it]

Topik 34 → excellent port


Labeling topics:  39%|███▉      | 36/92 [03:29<05:26,  5.84s/it]

Topik 35 → Easy Pay Wins Free


Labeling topics:  40%|████      | 37/92 [03:35<05:25,  5.91s/it]

Topik 36 → Quick Hits


Labeling topics:  41%|████▏     | 38/92 [03:41<05:17,  5.88s/it]

Topik 37 → 10x10x10 ONE


Labeling topics:  42%|████▏     | 39/92 [03:47<05:12,  5.90s/it]

Topik 38 → Indie Price Crisis


Labeling topics:  43%|████▎     | 40/92 [03:53<05:05,  5.87s/it]

Topik 39 → Smart Construction


Labeling topics:  45%|████▍     | 41/92 [03:58<04:59,  5.87s/it]

Topik 40 → shortyes


Labeling topics:  46%|████▌     | 42/92 [04:04<04:50,  5.82s/it]

Topik 41 → Incredible Amazing


Labeling topics:  47%|████▋     | 43/92 [04:10<04:44,  5.81s/it]

Topik 42 → Mini Controllers


Labeling topics:  48%|████▊     | 44/92 [04:16<04:36,  5.77s/it]

Topik 43 → Disappointing Music for Love


Labeling topics:  49%|████▉     | 45/92 [04:21<04:30,  5.75s/it]

Topik 44 → Action Adventure


Labeling topics:  50%|█████     | 46/92 [04:27<04:23,  5.73s/it]

Topik 45 → Quick Hours


Labeling topics:  51%|█████     | 47/92 [04:33<04:19,  5.76s/it]

Topik 46 → max payne 3


Labeling topics:  52%|█████▏    | 48/92 [04:39<04:13,  5.76s/it]

Topik 47 → 


Labeling topics:  53%|█████▎    | 49/92 [04:45<04:12,  5.88s/it]

Topik 48 → Scoreboard Function


Labeling topics:  54%|█████▍    | 50/92 [04:51<04:07,  5.88s/it]

Topik 49 → BPR


Labeling topics:  55%|█████▌    | 51/92 [04:57<04:01,  5.89s/it]

Topik 50 → Zombie World - Zombies Best - Best Zombies - Horror Zombie Massengemetzel - Best Zombie


Labeling topics:  57%|█████▋    | 52/92 [05:02<03:55,  5.89s/it]

Topik 51 → 


Labeling topics:  58%|█████▊    | 53/92 [05:08<03:48,  5.86s/it]

Topik 52 → 


Labeling topics:  59%|█████▊    | 54/92 [05:14<03:43,  5.88s/it]

Topik 53 → Quick Fix


Labeling topics:  60%|█████▉    | 55/92 [05:20<03:37,  5.89s/it]

Topik 54 → BestT


Labeling topics:  61%|██████    | 56/92 [05:27<03:38,  6.07s/it]

Topik 55 → Kingdom Hearts Re:Chain of Memories - Infinitely Replayable Ride Life Enjoy Repetitive


Labeling topics:  62%|██████▏   | 57/92 [05:33<03:36,  6.18s/it]

Topik 56 → Soundtrack Essentials


Labeling topics:  63%|██████▎   | 58/92 [05:39<03:25,  6.03s/it]

Topik 57 → [Insert label here]


Labeling topics:  64%|██████▍   | 59/92 [05:44<03:16,  5.95s/it]

Topik 58 → Vampire Survivors-like Survivors Genre Themed Vampire SurvivorsLike


Labeling topics:  65%|██████▌   | 60/92 [05:50<03:09,  5.92s/it]

Topik 59 → Doom 2016 - Doom Eternal - Doom - Doom 12 - Well Doom


Labeling topics:  66%|██████▋   | 61/92 [05:56<03:02,  5.88s/it]

Topik 60 → Quick Return


Labeling topics:  67%|██████▋   | 62/92 [06:02<02:56,  5.88s/it]

Topik 61 → Zombie Slayer


Labeling topics:  68%|██████▊   | 63/92 [06:08<02:49,  5.86s/it]

Topik 62 → Horror Resident Evil


Labeling topics:  70%|██████▉   | 64/92 [06:13<02:42,  5.82s/it]

Topik 63 → Mini Souls


Labeling topics:  71%|███████   | 65/92 [06:19<02:36,  5.79s/it]

Topik 64 → Brazil Simulator


Labeling topics:  72%|███████▏  | 66/92 [06:25<02:31,  5.81s/it]

Topik 65 → Slay Spire Clone


Labeling topics:  73%|███████▎  | 67/92 [06:31<02:25,  5.81s/it]

Topik 66 → RPG, Gameplay, One Game


Labeling topics:  74%|███████▍  | 68/92 [06:37<02:19,  5.83s/it]

Topik 67 → Superb Ass


Labeling topics:  75%|███████▌  | 69/92 [06:43<02:15,  5.88s/it]

Topik 68 → Pinball Deluxe Tables


Labeling topics:  76%|███████▌  | 70/92 [06:49<02:08,  5.86s/it]

Topik 69 → Fallout New Vegas


Labeling topics:  77%|███████▋  | 71/92 [06:54<02:03,  5.86s/it]

Topik 70 → Sherlock Essentials


Labeling topics:  78%|███████▊  | 72/92 [07:00<01:56,  5.81s/it]

Topik 71 → God Yeah


Labeling topics:  79%|███████▉  | 73/92 [07:06<01:51,  5.85s/it]

Topik 72 → Di


Labeling topics:  80%|████████  | 74/92 [07:12<01:44,  5.83s/it]

Topik 73 → BEST VR FLIGHT EXPERIENCE


Labeling topics:  82%|████████▏ | 75/92 [07:18<01:39,  5.86s/it]

Topik 74 → Batman Arkham Knight Arkham City


Labeling topics:  83%|████████▎ | 76/92 [07:24<01:33,  5.84s/it]

Topik 75 → 1. Totally recommend - TotallyRecommend


Labeling topics:  84%|████████▎ | 77/92 [07:29<01:27,  5.85s/it]

Topik 76 → Quick Fix


Labeling topics:  85%|████████▍ | 78/92 [07:35<01:21,  5.85s/it]

Topik 77 → Quick Key Hunting Gnorp KK


Labeling topics:  86%|████████▌ | 79/92 [07:41<01:15,  5.83s/it]

Topik 78 → _______________________


Labeling topics:  87%|████████▋ | 80/92 [07:47<01:09,  5.81s/it]

Topik 79 → 


Labeling topics:  88%|████████▊ | 81/92 [07:53<01:04,  5.82s/it]

Topik 80 → Assassin's Creed Ezio's Story: Creed Odyssey


Labeling topics:  89%|████████▉ | 82/92 [07:58<00:57,  5.78s/it]

Topik 81 → Quick Thumb


Labeling topics:  90%|█████████ | 83/92 [08:04<00:52,  5.78s/it]

Topik 82 → Quick Poll


Labeling topics:  91%|█████████▏| 84/92 [08:10<00:46,  5.78s/it]

Topik 83 → Thrill Rush


Labeling topics:  92%|█████████▏| 85/92 [08:16<00:40,  5.82s/it]

Topik 84 → psychological


Labeling topics:  93%|█████████▎| 86/92 [08:22<00:35,  5.88s/it]

Topik 85 → 1. Vraiment ce jeu que je vais vous montrer est incroyablement amusant!


Labeling topics:  95%|█████████▍| 87/92 [08:28<00:29,  5.84s/it]

Topik 86 → Metroid


Labeling topics:  96%|█████████▌| 88/92 [08:33<00:23,  5.84s/it]

Topik 87 → Survival Crafting World


Labeling topics:  97%|█████████▋| 89/92 [08:39<00:17,  5.84s/it]

Topik 88 → Neat Neat


Labeling topics:  98%|█████████▊| 90/92 [08:45<00:11,  5.89s/it]

Topik 89 → Speedy Platformer, Thrilling Adventure, Charming Characters, Dreamfall Journey, Compact Adventure.


Labeling topics:  99%|█████████▉| 91/92 [08:51<00:05,  5.95s/it]

Topik 90 → Co-op One


Labeling topics: 100%|██████████| 92/92 [08:57<00:00,  5.85s/it]


✅ Label topik selesai dan disimpan ke topic_model_labeled.pkl dan Topic_Labels.csv


In [None]:
import pandas as pd
from tqdm import tqdm
import pickle

# Muat ulang model dan data jika perlu
with open("topic_model.pkl", "rb") as f:
    topic_model = pickle.load(f)

df = pd.read_csv("Cleaned_Reviews.csv")
documents = df["Cleaned_Review"].astype(str).tolist()

# Transformasi dokumen ke topik
topics = topic_model.transform(documents)[0]
topic_labels = topic_model.topic_labels_

# Assign label topik
assigned_labels = []
for doc_id in tqdm(range(len(documents)), desc="Assigning topik"):
    topic_id = topics[doc_id]
    if isinstance(topic_id, int) and topic_id in topic_labels:
        assigned_labels.append(topic_labels[topic_id])
    else:
        assigned_labels.append("Unknown")

# Tambahkan ke DataFrame dan ekspor
df["Topic_Label"] = assigned_labels
df_top_topics = df[["Game", "Topic_Label"]]  # Pastikan 'Game' adalah nama kolom yang sesuai
df_top_topics.to_csv("Top_Topics.csv", index=False)

print("📁 File 'Top_Topics.csv' berhasil disimpan.")


Batches:   0%|          | 0/364 [00:00<?, ?it/s]