In [1]:
# Import necessary libraries
import pandas as pd
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer

# Load the dataset
df = pd.read_csv("Cleaned_Reviews.csv")

# Validate columns
if 'Cleaned_Review' not in df.columns:
    raise ValueError("Column 'Cleaned_Review' not found.")

# Extract documents for modeling
docs = df['Cleaned_Review'].astype(str).tolist()

# Define seed topics
seed_topic_list = [
    ["mechanics", "combat", "controls", "interaction"],
    ["bugs", "lag", "fps", "crash", "stability"],
    ["score", "leaderboard", "points", "ranks", "levels"],
    ["multiplayer", "co", "op", "community", "online"],
    ["character", "npc", "customization", "design"],
    ["soundtrack", "bgm", "audio", "music", "melody"],
    ["plot", "narrative", "lore", "ending", "storyline"]
]

# Create a vectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Set UMAP model for 3D projection
umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.1, metric='cosine')

# Choose SentenceTransformer model for embedding
embedding_model = SentenceTransformer("paraphrase-MPNet-base-v2")

# Fit the BERTopic model with custom UMAP and embedding model
topic_model = BERTopic(
    umap_model=umap_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    language="english",
    seed_topic_list=seed_topic_list,
    calculate_probabilities=True,
    verbose=True
)

# Fit the model and get topics
topics, _ = topic_model.fit_transform(docs)

# Check the first few topics to ensure it's iterable and correct
print(topics[:10])  # Display the first 10 topics

# Preprocess Documents for Coherence Evaluation
documents_per_topic = pd.DataFrame({"Topic": topics, "Document": docs})
documents_per_topic = documents_per_topic.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic['Document'].values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
            for topic in range(len(set(topics))-1)]

# Evaluate Coherence
coherence_model = CoherenceModel(topics=topic_words, 
                                  texts=tokens, 
                                  corpus=corpus,
                                  dictionary=dictionary, 
                                  coherence='c_v')
coherence = coherence_model.get_coherence()

print("Coherence Score:", coherence)

# Visualize topics with intertopic distance map
topic_model.visualize_topics()


Couldn't import dot_parser, loading of dot files will not be possible.


Python(45312) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
2025-05-29 23:07:51,453 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/364 [00:00<?, ?it/s]

2025-05-29 23:11:25,377 - BERTopic - Embedding - Completed ✓
2025-05-29 23:11:25,385 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-29 23:11:25,905 - BERTopic - Guided - Completed ✓
2025-05-29 23:11:25,906 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-05-29 23:11:38,814 - BERTopic - Dimensionality - Completed ✓
2025-05-29 23:11:38,815 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-29 23:11:47,373 - BERTopic - Cluster - Completed ✓
2025-05-29 23:11:47,383 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-05-29 23:11:48,630 - BERTopic - Representation - Completed ✓


[88, 33, -1, 113, 10, -1, 4, 26, -1, 95]


Python(45476) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(45478) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(45479) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadloc

Coherence Score: 0.5738132178534913


In [None]:
topic_model.visualize_barchart()

In [None]:
# Import necessary libraries
import pandas as pd
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer

# Load the dataset
df = pd.read_csv("Cleaned_Reviews.csv")

# Validate columns
if 'Cleaned_Review' not in df.columns:
    raise ValueError("Column 'Cleaned_Review' not found.")

# Extract documents for modeling
docs = df['Cleaned_Review'].astype(str).tolist()

# Define seed topics
seed_topic_list = [
    ["mechanics", "combat", "controls", "interaction"],
    ["bugs", "lag", "fps", "crash", "stability"],
    ["score", "leaderboard", "points", "ranks", "levels"],
    ["multiplayer", "co", "op", "community", "online"],
    ["character", "npc", "customization", "design"],
    ["soundtrack", "bgm", "audio", "music", "melody"],
    ["plot", "narrative", "lore", "ending", "storyline"]
]

# Create a vectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Set UMAP model for 3D projection
umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.1, metric='cosine')

# Choose SentenceTransformer model for embedding
embedding_model = SentenceTransformer("paraphrase-MPNet-base-v2")

# Fit the BERTopic model with custom UMAP and embedding model
topic_model = BERTopic(
    umap_model=umap_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    language="english",
    seed_topic_list=seed_topic_list,
    calculate_probabilities=True,
    verbose=True
)

# Fit the model and get topics
topics, _ = topic_model.fit_transform(docs)

# Check the first few topics to ensure it's iterable and correct
print(topics[:10])  # Display the first 10 topics

# Preprocess Documents for Coherence Evaluation
documents_per_topic = pd.DataFrame({"Topic": topics, "Document": docs})
documents_per_topic = documents_per_topic.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic['Document'].values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
            for topic in range(len(set(topics))-1)]

# Evaluate Coherence
coherence_model = CoherenceModel(topics=topic_words, 
                                  texts=tokens, 
                                  corpus=corpus,
                                  dictionary=dictionary, 
                                  coherence='c_v')
coherence = coherence_model.get_coherence()

print("Coherence Score:", coherence)

# Reduce topics if there are overlaps
topic_model.reduce_topics(docs, nr_topics=10)  # Reduce to 10 topics to avoid overlap

# Visualize topics with intertopic distance map
topic_model.visualize_topics()


Couldn't import dot_parser, loading of dot files will not be possible.


2025-05-29 22:56:45,709 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/364 [00:00<?, ?it/s]

In [18]:
# Import necessary libraries
import pandas as pd
import re
import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Kamus normalisasi singkat
normalization_dict = {
    "u": "you", "r": "are", "ur": "your", "pls": "please", "plz": "please",
    "thx": "thanks", "ty": "thank you", "btw": "by the way", "lol": "laughing out loud",
    "lmao": "laughing my ass off", "omg": "oh my god", "wtf": "what the fuck",
    "idk": "i do not know", "imo": "in my opinion", "imho": "in my humble opinion",
    "gg": "good game", "ez": "easy", "nerf": "weaken", "buff": "strengthen",
    "noob": "newbie", "grind": "repetitive task", "op": "overpowered",
    "laggy": "slow connection", "gitgud": "get good", "f2p": "free to play",
    "p2w": "pay to win", "dlc": "downloadable content", "npc": "non player character",
    "xp": "experience points", "lvl": "level", "rng": "random number generator",
    "afk": "away from keyboard", "af": "as fuck", "rp": "roleplay",
    "ive": "i have", "tl": "too long", "dr": "did not read"
}

# Deteksi konteks fps
def disambiguate_fps(text):
    if re.search(r"\b(fps)\b", text.lower()):
        if re.search(r"(graphics|frame|lag|performance|per second)", text.lower()):
            text = re.sub(r"\bfps\b", "frames per second", text, flags=re.IGNORECASE)
        elif re.search(r"(shooter|gun|weapon|enemy|combat|first person)", text.lower()):
            text = re.sub(r"\bfps\b", "first person shooter", text, flags=re.IGNORECASE)
    return text

# Fungsi normalisasi + pembersihan ringan
def preprocess_review(text):
    if not isinstance(text, str):
        text = ""
    text = text.lower()  # Lowercase ringan
    words = text.split()
    words = [normalization_dict.get(word, word) for word in words]
    text = " ".join(words)
    text = disambiguate_fps(text)
    text = re.sub(r"http\S+", "", text)               # hapus url
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)        # hapus karakter aneh
    text = re.sub(r"\s+", " ", text).strip()          # hilangkan spasi berlebih
    return text

# Ambil semua file CSV dari dua folder
csv_files_1 = glob.glob("reviews_1/*.csv")
csv_files_2 = glob.glob("reviews_2/*.csv")
csv_files = csv_files_1 + csv_files_2

# Gabungkan semua CSV jadi satu DataFrame
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

# Tampilkan jumlah total baris
print(f"Total data: {len(df)} baris")

# Hitung jumlah UserID unik
jumlah_userid = df['UserID'].nunique()
print(f"Jumlah UserID unik: {jumlah_userid}")

# Memproses review
tqdm.pandas()  # Mengaktifkan tqdm untuk progress bar
df['Cleaned_Review'] = df['Review'].progress_apply(preprocess_review)

# Tampilkan hasil pembersihan review (contoh)
print(df[['UserID', 'Review', 'Cleaned_Review']].head())

# Menyimpan DataFrame yang telah diproses ke dalam file CSV baru
df.to_csv("Cleaned_Reviews.csv", index=False)

print("File CSV dengan review yang telah dibersihkan berhasil disimpan sebagai 'Cleaned_Reviews.csv'.")

# Seed Topics yang lebih mendetail
seed_topic_list = [
    ["gameplay", "mechanics", "combat system", "interaction", "controls", "gameplay features", "game flow"],
    ["bugs", "lag", "fps", "frame rate", "crashes", "performance issues", "optimization", "game stability"],
    ["score", "leaderboard", "ranking", "points system", "levels", "scoreboard", "achievements"],
    ["multiplayer", "co-op", "community", "online play", "player interaction", "multiplayer mode", "social features"],
    ["character design", "npc", "character customization", "avatars", "design features", "customizable characters"],
    ["soundtrack", "bgm", "audio design", "background music", "sound effects", "melody", "soundtrack quality"],
    ["plot", "narrative", "storyline", "story arc", "game plot", "ending", "character development", "game narrative"]
]

# Stop words kustom dalam frozenset
stop_words_frozenset = frozenset({
    'serious', 'via', 'twelve', 'thick', 'within', 'else', 'enough', 'former', 'otherwise', 'wherein', 
    'where', 'about', 'an', 'one', 'how', 'whatever', 'thereupon', 'across', 'down', 'out', 'can', 
    'nevertheless', 'nothing', 'you', 'both', 'are', 'get', 'yourselves', 'it', 'has', 'ten', 'de', 
    'full', 'rather', 'system', 'though', 'the', 'toward', 'around', 'a', 'her', 'because', 'above', 
    'been', 'somehow', 'would', 'latterly', 'three', 'with', 'be', 'detail', 'himself', 'of', 'indeed', 
    'name', 'mostly', 'now', 'me', 'somewhere', 'whenever', 'more', 'please', 'once', 'bottom', 'what', 
    'this', 'or', 'they', 'less', 'hereupon', 'throughout', 'herein', 'on', 'whom', 'some', 'no', 'so', 
    'latter', 'amoungst', 'namely', 'find', 'through', 'hasnt', 'done', 'interest', 'elsewhere', 'together', 
    'his', 'again', 'any', 'even', 'few', 'thereafter', 'first', 'she', 'between', 'wherever', 'yours', 
    'amongst', 'inc', 'much', 'than', 'when', 'here', 'whether', 'besides', 'co', 'forty', 'is', 'something', 
    'hundred', 'fire', 'nobody', 'we', 'six', 'made', 'among', 'such', 'sometimes', 'always', 'beforehand', 
    'thru', 'themselves', 'by', 'becomes', 'our', 'amount', 'will', 'must', 'why', 'mill', 'etc', 'only', 
    'below', 'front', 'seem', 'often', 'could', 'he', 'into', 'everyone', 'alone', 'although', 'ours', 'off', 
    'itself', 'whereupon', 'put', 'whither', 'last', 'someone', 'per', 'five', 'becoming', 'same', 'and', 
    'behind', 'meanwhile', 'if', 'that', 'to', 'fifteen', 'several', 'ever', 'were', 'neither', 'therein', 
    'cannot', 'very', 'at', 'whoever', 'moreover', 'their', 'being', 'ourselves', 'beyond', 'your', 'them', 
    'take', 'either', 'too', 'twenty', 'before', 'while', 'anyway', 'already', 'bill', 'whereby', 'should', 
    'con', 'upon', 'hereby', 'eight', 'whereas', 'myself', 'after', 'move', 'since', 'two', 'seeming', 
    'thereby', 'all', 'sometime', 'herself', 'many', 'almost', 'ltd', 'side', 'except', 'eleven', 'four', 
    'call', 'part', 'anywhere', 're', 'never', 'everywhere', 'afterwards', 'anyone', 'as', 'still', 'not', 
    'back', 'my', 'other', 'along', 'therefore', 'sixty', 'nor', 'have', 'who', 'during', 'hence', 'thin', 
    'least', 'may', 'hereafter', 'third', 'sincere', 'fill', 'mine', 'for', 'every', 'whose', 'nine', 'might', 
    'un', 'top', 'over', 'further', 'empty', 'however', 'nowhere', 'due', 'but', 'none', 'towards', 'then', 
    'yet', 'next', 'couldnt', 'describe', 'cant', 'fifty', 'was', 'those', 'its', 'ie', 'eg', 'give', 'him', 
    'without', 'also', 'seemed', 'found', 'am', 'anyhow', 'onto', 'seems', 'i', 'there', 'until', 'up', 
    'anything', 'under', 'well', 'each', 'had', 'whole', 'whereafter', 'in', 'own', 'see', 'most', 'thus', 
    'everything', 'go', 'these', 'beside', 'against', 'yourself', 'thence', 'others', 'became', 'become', 
    'perhaps', 'show', 'whence', 'another', 'cry', 'noone', 'do', 'from', 'hers', 'keep', 'formerly', 'which', 
    'us'
})

# Mengubah frozenset menjadi list
stop_words_list = list(stop_words_frozenset)

# Gunakan TfidfVectorizer dengan stop words kustom
vectorizer_model = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words_list)

# Set UMAP model dengan pengaturan yang lebih ketat
umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.1, metric='cosine')

# Pilih model embedding yang lebih kuat untuk menangkap semantik
embedding_model = SentenceTransformer("paraphrase-MPNet-base-v2")

# Fit BERTopic model dengan UMAP dan embedding model yang disesuaikan
topic_model = BERTopic(
    umap_model=umap_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    language="english",
    seed_topic_list=seed_topic_list,
    calculate_probabilities=True,
    verbose=True
)

# Fit the model and get topics
topics, _ = topic_model.fit_transform(docs)

# Kurangi jumlah topik jika ditemukan tumpang tindih
topic_model.reduce_topics(docs, nr_topics=30)  # Mengurangi menjadi 10 topik

# Visualize topics with intertopic distance map
topic_model.visualize_topics()


Total data: 11647 baris
Jumlah UserID unik: 227


100%|██████████| 11647/11647 [00:00<00:00, 33111.39it/s]


              UserID                                             Review  \
0  76561199076918197                                 Better than Naruto   
1  76561199076918197        I am offensive and I find this game Indian.   
2  76561199076918197  1) Game breakin glitches. 2) Out dated physics...   
3  76561199076918197                                Too much of hentai.   
4  76561199076918197  Dude, say whatever you wanna say, but remember...   

                                      Cleaned_Review  
0                                 better than naruto  
1         i am offensive and i find this game indian  
2  1 game breakin glitches 2 out dated physics 3 ...  
3                                 too much of hentai  
4  dude say whatever you wanna say but remember i...  
File CSV dengan review yang telah dibersihkan berhasil disimpan sebagai 'Cleaned_Reviews.csv'.


2025-05-30 00:43:31,979 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/364 [00:00<?, ?it/s]

2025-05-30 00:46:37,358 - BERTopic - Embedding - Completed ✓
2025-05-30 00:46:37,364 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-30 00:46:37,641 - BERTopic - Guided - Completed ✓
2025-05-30 00:46:37,642 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-30 00:46:41,684 - BERTopic - Dimensionality - Completed ✓
2025-05-30 00:46:41,688 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-30 00:46:48,331 - BERTopic - Cluster - Completed ✓
2025-05-30 00:46:48,347 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-05-30 00:46:49,492 - BERTopic - Representation - Completed ✓
2025-05-30 00:46:49,856 - BERTopic - Topic reduction - Reducing number of topics
2025-05-30 00:46:51,078 - BERTopic - Topic reduction - Reduced number of topics from 138 to 30


In [19]:
# Evaluate Coherence
coherence_model = CoherenceModel(topics=topic_words, 
                                  texts=tokens, 
                                  corpus=corpus,
                                  dictionary=dictionary, 
                                  coherence='c_v')
coherence = coherence_model.get_coherence()

print("Coherence Score:", coherence)

Python(50061) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(50063) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(50064) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadloc

Coherence Score: 0.5738132178534913


In [20]:
topic_model.visualize_barchart()

In [24]:
# Visualize all topics in a bar chart
topic_model.visualize_barchart(top_n_topics=30)  # Menampilkan hingga 20 topik

In [22]:
# Menampilkan review beserta topik yang dihasilkan
df['Topic'] = topics  # Menambahkan kolom topik pada dataframe

# Tampilkan beberapa contoh review dan topiknya
top_5_examples = df[['Review', 'Topic']].head(5)
print(top_5_examples)


                                              Review  Topic
0                                 Better than Naruto     78
1        I am offensive and I find this game Indian.     -1
2  1) Game breakin glitches. 2) Out dated physics...     -1
3                                Too much of hentai.     -1
4  Dude, say whatever you wanna say, but remember...      5
