In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Unduh stopwords Indo (pertama kali aja)
nltk.download('stopwords')
indo_stopwords = stopwords.words('indonesian')

# === Load dataset ===
df_main = pd.read_csv("labeled_comments_ultimate.csv")
text_col = "text" if "text" in df_main.columns else df_main.columns[0]
df_main = df_main[[text_col]].rename(columns={text_col: "komentar"})

# === Tambahan data baru ===
df_extra = pd.DataFrame({
    "komentar": [
        "main di web biru lagi rame banget",
        "aku baru beli hp di shopee",
        "spin scatter muncul tiga kali",
        "tadi malam hoki banget dapet jackpot",
        "modal receh tapi jadi banyak",
        "videonya lucu banget parah ðŸ˜‚",
        "situs itu emang rame tiap malam",
        "saldo digital tiba-tiba nambah",
        "kualitas suaranya keren banget",
        "berita hari ini tentang slot lagi viral"
    ]
})
df_all = pd.concat([df_main, df_extra], ignore_index=True)

# === TF-IDF Vectorizer ===
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),
    stop_words=indo_stopwords
)
X = vectorizer.fit_transform(df_all["komentar"].astype(str))

# === KMeans Clustering ===
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df_all["cluster"] = kmeans.fit_predict(X)

# === Evaluasi dan Analisis ===
score = silhouette_score(X, df_all["cluster"])
print(f"Silhouette Score: {score:.3f}")

print("\n=== Analisis Tiap Cluster ===")
for i in range(k):
    subset = df_all[df_all["cluster"] == i]
    keywords = Counter(" ".join(subset["komentar"]).lower().split()).most_common(10)
    print(f"\nCluster {i} (jumlah: {len(subset)})")
    print("Top keywords:", keywords)
    print("Contoh:", subset["komentar"].sample(min(5, len(subset))).tolist())

df_all.to_csv("hasil_clustering_judol_full.csv", index=False)
print("\nDisimpan ke hasil_clustering_judol_full.csv")


[nltk_data] Downloading package stopwords to /home/wtf/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Silhouette Score: 0.013

=== Analisis Tiap Cluster ===

Cluster 0 (jumlah: 136)
Top keywords: [('banget', 140), ('se', 31), ('yang', 17), ('kyt4d', 17), ('ini', 17), ('pulauwin', 17), ('lucu', 15), ('suka', 15), ('sama', 14), ('ambil4d', 14)]
Contoh: ['sumpah pak jarwo lucu banget njiirtt', 'aduh ken banget', 'roma se rem banget dah', 'bener bener rejeki nomplok money mouth face tkp62 juara banget', 'mantap seru banget nonton channel probet 855 minat ramee']

Cluster 1 (jumlah: 1676)
Top keywords: [('se', 445), ('yang', 144), ('ini', 135), ('yg', 129), ('ada', 122), ('itu', 120), ('kucing', 118), ('gak', 116), ('ya', 98), ('jok', 88)]
Contoh: ['my mom never do that for me', 'ada jaitan a gtu rawan bocor air saat musim ujan', 'yapp ini jelas paling realistis gabisa debat lagi perfect bro gas google se karang ambil4d', 'itu captionnya gmna dah gwe bingung yg ngedit slah tik atau gmna udah gwe ulang2 baca msih bingung', 'kirain pegang roti td awal']

Cluster 2 (jumlah: 202)
Top keywords: 