In [1]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

# Load the dataset from a text file.
# A forward slash is used as the path separator: the previous "dataset\P..."
# literal relied on "\P" being an unrecognised escape sequence (which triggers
# a warning on recent Python versions) and only resolved to a sub-directory on
# Windows. Forward slashes are accepted on Windows and POSIX alike.
file_path = "dataset/Pidato Presiden Prabowo 2024_convert.txt"

# Each line of the file is treated as one document.
with open(file_path, "r", encoding="utf-8") as file:
    documents = [line.strip() for line in file]

# Drop lines that are empty after stripping so blank separators in the file
# are not embedded and clustered as if they were real text.
documents = [doc for doc in documents if doc]

# Load IndoBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-large-p2")
model = AutoModel.from_pretrained("indobenchmark/indobert-large-p2")

# Compute embeddings
def compute_embeddings(documents, batch_size=1):
    """Return one embedding row per document, taken from the first token.

    Each batch of documents is tokenized (truncated to at most 512 tokens),
    passed through the module-level ``model`` with gradients disabled, and the
    hidden state at sequence position 0 (the CLS token) is kept as the
    document embedding.

    Args:
        documents: Iterable of strings to embed.
        batch_size: Number of documents sent through the model per forward
            pass. The default of 1 matches the original one-document-at-a-time
            behaviour; larger values trade memory for fewer forward passes.

    Returns:
        A 2-D ``numpy`` array with one row per document. For empty input the
        result has zero rows and ``model.config.hidden_size`` columns, so
        downstream code that expects a 2-D matrix still receives one.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise once so generators can be sliced into batches.
    documents = list(documents)

    batches = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        # padding=True pads every sequence in the batch to the longest one;
        # with a single-document batch it is a no-op.
        inputs = tokenizer(batch, return_tensors='pt',
                           padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # CLS token embedding: position 0 of every sequence in the batch.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        batches.append(cls_embeddings.numpy())

    if not batches:
        # np.array([]) would be 1-D; keep the result a 2-D matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.concatenate(batches, axis=0)

# Get embeddings for the documents
embeddings = compute_embeddings(documents)

# L2-normalize each row so every embedding has unit length; KMeans then
# compares direction rather than magnitude.
embeddings = normalize(embeddings)

# Clustering using KMeans.
# n_init is set explicitly: its default changed between scikit-learn releases
# (10 -> 'auto'), so leaving it implicit makes the clustering depend on the
# installed version and triggers a FutureWarning on some of them.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Group the documents by their assigned cluster, keeping input order.
clustered_docs = {i: [] for i in range(n_clusters)}
for doc, label in zip(documents, labels):
    # Labels come back as NumPy integers; convert to a plain int key.
    clustered_docs[int(label)].append(doc)

# Display results
for cluster, docs in clustered_docs.items():
    print(f"Cluster {cluster}:")
    for doc in docs:
        print(f"  - {doc}")
    print()


Cluster 0:
  - "Yang saya hormati dan yang saya muliakan",
  - "Ketua dan para Wakil Ketua Dewan Perwakilan Rakyat (DPR)",
  - "Ketua dan para Wakil Ketua Dewan Perwakilan Daerah (DPD)",
  - "Ketua Lembaga-Lembaga Negara",
  - "Perdana Menteri Republik Korea Yang Mulia  Han Duck-soo",
  - "Perdana Menteri Republik Singapura Yang Mulia  Lawrence Wong beserta Ibu Loo Tze Lui",
  - "Perdana Menteri Republik Vanuatu Yang Mulia Charlot Salwai Tabimasmas beserta Ibu Marine Justine Salwai",
  - "Wakil Presiden Republik Rakyat Tiongkok Yang Mulia Han Zeng",
  - "Wakil Presiden Republik Demokratik Rakyat Laos Yang Mulia Pany Yathotou",
  - "Wakil Presiden Republik Sosialis Vietnam Yang Mulia Vo Thi Anh Xuan",
  - "Wakil Pertama Perdana Menteri Federasi Rusia Yang Mulia Denis Manturov",
  - "Wakil Perdana Menteri Selandia Baru Yang Mulia Winston Peters",
  - "Wakil Perdana Menteri sekaligus Menteri Transportasi Kerajaan Thailand Suriya Jungrungreangkit",
  - "Utusan khusus dan Mantan Presiden Re