In [None]:
from bertopic import BERTopic
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np

In [None]:
# Load the cleaned tweets.
# The path uses a forward slash, which pandas accepts on Windows as well as on
# POSIX systems; the previous backslash form only resolved on Windows.
df_cleaned = pd.read_csv('tweets-data/cleaned.csv')

# Keep rows from position 1500 onward (all columns).
# NOTE(review): the reason for skipping the first 1500 rows is not visible
# here — confirm and document it.
# .copy() detaches the result from the frame it was sliced from, so the column
# assignment in the preprocessing step does not raise SettingWithCopyWarning.
df_cleaned = df_cleaned.iloc[1500:, :].copy()
df_cleaned

In [None]:
# Step 1: Preprocessing
# Replace NaN in `clean_text` with empty strings so no missing value reaches
# the topic model.
# .assign() returns a new frame instead of writing a column into a frame that
# was produced by slicing, which avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run.
df_cleaned = df_cleaned.assign(clean_text=df_cleaned['clean_text'].fillna(""))

In [None]:
# Steps 2-3: build the BERTopic model and fit it on the tweets.

# Corpus: one string per row of the `clean_text` column.
texts = df_cleaned['clean_text'].to_list()

# calculate_probabilities=True asks BERTopic to also return per-document topic
# probabilities (`probs`); verbose=True logs progress while fitting.
topic_model = BERTopic(
    language="indonesian",
    calculate_probabilities=True,
    verbose=True,
)

# `topics` holds the topic assigned to each document, `probs` the probabilities.
topics, probs = topic_model.fit_transform(texts)

In [None]:
# Step 4: Evaluate with topic coherence
# NOTE(review): `get_topic_coherence` does not appear in BERTopic's documented
# API — confirm it exists in the installed version, otherwise this cell raises
# AttributeError. Coherence is typically computed with an external tool
# (presumably something like gensim's CoherenceModel) — TODO confirm and replace.
topic_coherence = topic_model.get_topic_coherence()
# Report the mean of the values returned above.
print(f"\nCoherence Score untuk BERTopic: {np.mean(topic_coherence)}")

In [None]:
# Step 5: Evaluate with the Silhouette Score
# BERTopic.transform() returns a (topics, probabilities) tuple rather than a
# feature matrix, so its raw result cannot be passed to silhouette_score.
# Use the per-document topic-probability matrix already produced by
# fit_transform() as the document representation; it is aligned row-for-row
# with `topics` and avoids re-embedding the whole corpus.
# Assumes `probs` is 2-D (n_docs, n_topics), which relies on the model being
# built with calculate_probabilities=True — confirm if that setting changes.
embeddings = np.asarray(probs)
labels = np.asarray(topics)

# Topic -1 is BERTopic's outlier bucket, not a real cluster; leave those
# documents out so they do not distort the score.
is_clustered = labels != -1

# Mean silhouette over the clustered documents.
silhouette_avg = silhouette_score(embeddings[is_clustered], labels[is_clustered])
print(f"Silhouette Score untuk BERTopic: {silhouette_avg}")

In [None]:
# Step 6: Show the discovered topics
# Fetch the model's topic summary table and print it under a heading.
topic_info = topic_model.get_topic_info()
print("\nInformasi Topik yang Ditemukan:", topic_info, sep="\n")

In [None]:
# Step 7: Visualize the topic hierarchy
# The figure is the cell's last expression, so the notebook renders it.
hierarchy_fig = topic_model.visualize_hierarchy()
hierarchy_fig

In [None]:
# Step 8: Visualize the topics as bar charts
# NOTE(review): visualize_barchart presumably plots the top-scoring terms per
# topic rather than a topic distribution over documents — confirm.
barchart_fig = topic_model.visualize_barchart()
barchart_fig