# **Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import time
import random

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
import torch

import nlp_id
from nlp_id.tokenizer import Tokenizer
from nlp_id.stopword import StopWord

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.chdir("/home/jovyan/serpens_testing/runs")  
os.chdir("/Users/alicia.siahaya/Documents/Alice Tiket 2025/Thesis_Modeling/Thesis/")

In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# **Prepare Dataset**

In [4]:
df_modeling = pd.read_csv('src/data/df_modeling_BERT.csv')
df_modeling['sentiment'] = df_modeling['sentiment'].map({'positive': 1, 'negative': 0})
df_modeling.head()

Unnamed: 0,cleaned_text,sentiment
0,saya suka materi yang sudah disiapkan oleh pihak kampus karena memudahkan mahasiswa saya juga menyukai program enrichment yang disediakan kampus saya sehingga mahasiswa dapat belajar di ruang lingkup yang lebih luas,1
1,bisa bertemu dengan teman teman baru dan mendapatkan koneksi serta mendapatkan pelajaran yang berguna bagi saya kedepan nya,1
2,saya suka dengan makanan yang ada di dalam kampus saya terutama bakmi efata selain itu disekitar kampus juga banyak makanan enak,1
3,fasilitas kampus alam sutera sangat bagus pelajaran lab diajarkan oleh asisten yang sangat mengerti materi,1
4,saya suka dengan pertemanan nya solid mau saling bantu satu sama lain bagi bagi kisi kisi pas ujian terus saling ngajarin,1


In [5]:
from utils.topic_prediction import *

texts_pos, texts_neg = prepare_dataset(df_modeling)

# **Multiple Tuning**

In [11]:
embedding_models = [
    # "indobenchmark/indobert-base-p1",
    "LazarusNLP/all-indo-e5-small-v4",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
]

umap_params = [
    {"n_neighbors": 7, "min_dist": 0.0, "metric": "cosine"},
    {"n_neighbors": 5, "min_dist": 0.1, "metric": "euclidean"},
    {"n_neighbors": 5, "min_dist": 0.0, "metric": "cosine"},
    {"n_neighbors": 7, "min_dist": 0.1, "metric": "euclidean"},
    {"n_neighbors": 10, "min_dist": 0.1, "metric": "cosine"},
    {"n_neighbors": 15, "min_dist": 0.3, "metric": "euclidean"}
]

hdbscan_params = [
    {"min_cluster_size": 10, "min_samples": 3, "cluster_selection_epsilon": 0.0},
    {"min_cluster_size": 5, "min_samples": 3, "cluster_selection_epsilon": 0.5},
    {"min_cluster_size": 5, "min_samples": 3, "cluster_selection_epsilon": 0.3},
    {"min_cluster_size": 10, "min_samples": 3, "cluster_selection_epsilon": 0.3},
    {"min_cluster_size": 15, "min_samples": 5, "cluster_selection_epsilon": 0.0},
    {"min_cluster_size": 10, "min_samples": 5, "cluster_selection_epsilon": 0.5},
]

bertopic_params = [
    {"top_n_words": 5, "nr_topics": 3},
    {"top_n_words": 5, "nr_topics": 4},
    {"top_n_words": 5, "nr_topics": 5},
    {"top_n_words": 5, "nr_topics": 6},
    {"top_n_words": 10, "nr_topics": 3},
    {"top_n_words": 10, "nr_topics": 4},
    {"top_n_words": 10, "nr_topics": 5},
    {"top_n_words": 10, "nr_topics": 6},
]

In [12]:
from utils.topic_evaluation import *

def run_topic_modeling(label):
    all_start_time = time.time()

    texts = texts_pos if label == "positive" else texts_neg
    results = []

    for emb in embedding_models:
        embedding_model = SentenceTransformer(emb)

        for u in umap_params:
            umap_model = None if u is None else UMAP(**{**u, "random_state": 42})

            for h in hdbscan_params:
                hdbscan_model = None if h is None else hdbscan.HDBSCAN(**h)

                for b in bertopic_params:
                    model_start_time = time.time()

                    topic_model = BERTopic(
                        embedding_model=embedding_model,
                        umap_model=umap_model,
                        hdbscan_model=hdbscan_model,
                        top_n_words=b["top_n_words"],
                        nr_topics=b["nr_topics"],
                        verbose=False
                    )

                    topics, probs = topic_model.fit_transform(texts)

                    # Evaluate topics
                    coherence_scores, irbo_score = evaluate_topics(texts, topic_model)

                    # Filter topics
                    topics_dict = topic_model.get_topics()
                    topics_filtered = {k: v for k, v in topics_dict.items() if k != -1 and v is not None}

                    model_time = round(time.time() - model_start_time, 2)
                    
                    # Create entry
                    results.append({
                        "Label": label,
                        "Embedding Model": emb,
                        "UMAP": u,
                        "HDBSCAN": h,
                        "BERTopic Params": b,
                        "Num Topics": len(topics_filtered),
                        "Topics List": [[word for word, _ in words] for words in topics_filtered.values()],
                        **coherence_scores,
                        "IRBO": irbo_score,
                        "Model Time (s)": model_time
                    })

                    print(f"[✓] Finished model - Embedding: {emb}, UMAP: {u}, HDBSCAN: {h}, BERTopic Params: {b} | Time: {model_time}s")

    total_time = round(time.time() - all_start_time, 2)
    print(f"\nAll models completed in {total_time} seconds.")

    results_top5 = pd.DataFrame(results).sort_values(by='c_v', ascending=False).head()

    # Saving the top5 models
    for i in range(len(results_top5)):
        emb = results_top5['Embedding Model'].iloc[i]
        u = results_top5['UMAP'].iloc[i]
        h = results_top5['HDBSCAN'].iloc[i]
        b = results_top5['BERTopic Params'].iloc[i]

        embedding_model = SentenceTransformer(emb)
        umap_model = None if u is None else UMAP(**{**u, "random_state": 42})
        hdbscan_model = None if h is None else hdbscan.HDBSCAN(**h)

        # Fit the BERTopic model with the top params
        bertopic_model = BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            top_n_words=b["top_n_words"],
            nr_topics=b["nr_topics"],
            verbose=False
        )

        bertopic_model.fit(texts)

        model_save_path = f'src/models/bertopic_top3/{label}/best_{label}_model_top{i+1}'
        embedding_save_path = f'src/models/bertopic_top3/{label}/embeddings_top{i+1}'

        embedding_model.save(embedding_save_path)
        bertopic_model.save(
            model_save_path,
            serialization="safetensors",
        )
        print(f"Model saved to {model_save_path}, embedding saved to {embedding_save_path}")

    return results

### **Positive**

In [13]:
positive_results = run_topic_modeling("positive")

Coherence (c_v): 0.577
Coherence (u_mass): -2.922
Coherence (c_uci): 0.053
Coherence (c_npmi): 0.129
IRBO Topic Diversity: 0.843
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 3} | Time: 31.73s
Coherence (c_v): 0.564
Coherence (u_mass): -2.453
Coherence (c_uci): 0.087
Coherence (c_npmi): 0.1
IRBO Topic Diversity: 0.874
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 4} | Time: 11.88s
Coherence (c_v): 0.574
Coherence (u_mass): -2.815
Coherence (c_uci): -0.254
Coherence (c_npmi): 0.094
IRBO Topic Diversity: 0.954
[✓] Finished model - Embedding: LazarusNLP/all-ind

In [None]:
# Convert to DataFrame
df_pos = pd.DataFrame(positive_results)

# If topics are lists, convert to string to make them readable in Excel
df_pos["Topics List"] = df_pos["Topics List"].apply(lambda x: str(x))

# Export to Excel
save_path = 'results/bertopic_top3/'
os.makedirs(save_path, exist_ok=True)
df_pos.sort_values(by="c_v", ascending=False).to_excel(os.path.join(save_path, 'positive_topic_modeling_results.xlsx'), index=False)
df_pos.sort_values(by="c_v", ascending=False).to_csv(os.path.join(save_path, 'positive_topic_modeling_results.csv'), index=False)

### **Negative**

In [17]:
negative_results = run_topic_modeling("negative")

Coherence (c_v): 0.629
Coherence (u_mass): -1.69
Coherence (c_uci): 0.412
Coherence (c_npmi): 0.132
IRBO Topic Diversity: 1.0
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 3} | Time: 15.38s
Coherence (c_v): 0.624
Coherence (u_mass): -1.92
Coherence (c_uci): 0.476
Coherence (c_npmi): 0.132
IRBO Topic Diversity: 0.914
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 4} | Time: 13.1s
Coherence (c_v): 0.674
Coherence (u_mass): -1.802
Coherence (c_uci): 0.547
Coherence (c_npmi): 0.154
IRBO Topic Diversity: 0.922
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5

In [None]:
# Convert to DataFrame
df_neg = pd.DataFrame(negative_results)

# If topics are lists, convert to string to make them readable in Excel
df_neg["Topics List"] = df_neg["Topics List"].apply(lambda x: str(x))

# Export to Excel
save_path = 'results/bertopic_top3/'
os.makedirs(save_path, exist_ok=True)
df_neg.sort_values(by="c_v", ascending=False).to_excel(os.path.join(save_path, 'negative_topic_modeling_results.xlsx'), index=False)
df_neg.sort_values(by="c_v", ascending=False).to_csv(os.path.join(save_path, 'negative_topic_modeling_results.csv'), index=False)

# **IGNORE CODES BELOW**

## **BERTopic**

### Create Model

**SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")**

In [None]:
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

topic_model_pos = BERTopic(embedding_model=embedding_model, verbose=True)
topic_model_neg = BERTopic(embedding_model=embedding_model, verbose=True)

### Fit Model

In [None]:
topics_pos, probs_pos = topic_model_pos.fit_transform(texts_pos)

2025-08-06 15:53:11,588 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2025-08-06 15:53:16,408 - BERTopic - Embedding - Completed ✓
2025-08-06 15:53:16,417 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-06 15:53:23,884 - BERTopic - Dimensionality - Completed ✓
2025-08-06 15:53:23,886 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-06 15:53:23,899 - BERTopic - Cluster - Completed ✓
2025-08-06 15:53:23,903 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-06 15:53:23,916 - BERTopic - Representation - Completed ✓


In [None]:
topics_neg, probs_neg = topic_model_neg.fit_transform(texts_neg)

2025-08-06 15:53:23,933 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2025-08-06 15:53:38,898 - BERTopic - Embedding - Completed ✓
2025-08-06 15:53:38,984 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-06 15:53:40,198 - BERTopic - Dimensionality - Completed ✓
2025-08-06 15:53:40,284 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-06 15:53:40,296 - BERTopic - Cluster - Completed ✓
2025-08-06 15:53:40,300 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-06 15:53:40,317 - BERTopic - Representation - Completed ✓


### Inspect Topics

In [None]:
topic_model_pos.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,289,0_teman_kampus_dosen_materi,"[teman, kampus, dosen, materi, belajar, perkuliahan, lingkungan, kuliah, mahasiswa, hal]","[dosen dosen kampus friendly materi mudah dipelajari teman teman pelajaran perkuliahan, perkuliahan lingkungan kampus nyaman kondusif belajar teman teman suportif aktif kegiatan pengalaman kuliah menyenangkan, teman teman mendukung membantu perkuliahan teman teman belajar bareng memahami materi]"
1,1,12,1_makanan_kampus_kantin_fasilitas,"[makanan, kampus, kantin, fasilitas, teman, area, makan, peluang, pilihan, enak]","[makanan fasilitas wilayah kampus kampus terpenuhi, daerah kampus makanan explore makanan jajan teman teman terdekat seru gak nyangka circle real sma, makanan kampus bakmi efata disekitar kampus makanan enak]"


In [None]:
topic_model_neg.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4,-1_praktikum_makan_kantin_nyaman,"[praktikum, makan, kantin, nyaman, sepi, krg, ribet, menjalankan, tua, alat]","[kursi kampus nyaman gedung tua wifi tugas ribet, kondisi kantin panas hal krg nyaman makan kantin memilih makan, alat ruang praktikum lengkap menjalankan praktikum]"
1,0,187,0_dosen_kampus_mahasiswa_kelas,"[dosen, kampus, mahasiswa, kelas, kuliah, materi, hal, fasilitas, mata, tugas]","[fasilitas mahasiswa tugas kelas berukuran besar mengurangi interaksi dosen mahasiswa, perkuliahan hal i jadwal perkuliahan padat tugas tugas menumpuk bersamaan hal sulit membagi optimal mata kuliah penyampaian materi menarik aplikatif proses belajar menyenangkan berusaha mengambil pelajaran pengalaman, dosen dosen memahami materi mengajar sulit mengajarkan mahasiswa]"
2,1,49,1_wifi_kampus_koneksi_susah,"[wifi, kampus, koneksi, susah, absensi, jaringan, internet, fasilitas, terkoneksi, lambat]","[fasilitas kampus wifi terputus koneksi jaringan lancar, koneksi jaringan wifi kampus wifi terkonek aktivitas kampus kesulitan termaksa memakai data hotspot teman aktivitas krusial absensi kelas wifi, wifi disediakan kampus koneksi jaringan kesulitan tugas absen]"
3,2,31,2_toilet_fasilitas_tisu_tissue,"[toilet, fasilitas, tisu, tissue, bersih, kampus, sabun, tangan, nyaman, kotor]","[dosen ramah toilet lantai kotor toilet gedung bersih toilet gedung penuh, fasilitas kuliah toilet parkir toilet bersih kekurangan tisu parkir teratur penuh, fasilitas toilet disediakan tisu toilet bersih kantin kampus antrian beli makan]"
4,3,30,3_toilet_wifi_tisu_fasilitas,"[toilet, wifi, tisu, fasilitas, kampus, bersih, lambat, terkoneksi, gedung, mati]","[fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus]"


### Visualize Topic

topic_model_pos.visualize_barchart()

topic_model_neg.visualize_barchart()

topic_model_neg.visualize_topics()

topic_model_pos.visualize_documents(texts_pos)

topic_model_neg.visualize_documents(texts_neg)

## **TUNING**

**SentenceTransformer("indobenchmark/indobert-base-p1")**

We can have another choice of this fine tuned model: 

```SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")```

In [None]:
umap_model = UMAP(n_neighbors=7, min_dist=0.0, metric="cosine")
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=None, cluster_selection_epsilon=0.0)
# embedding_model = SentenceTransformer("indobenchmark/indobert-base-p1")
embedding_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

topic_model_pos_tuned = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True
)

topic_model_neg_tuned = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True
)

In [None]:
topics_pos_tuned, probs_pos_tuned = topic_model_pos_tuned.fit_transform(texts_pos)

2025-08-06 15:53:43,428 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2025-08-06 15:53:55,715 - BERTopic - Embedding - Completed ✓
2025-08-06 15:53:55,717 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-06 15:53:56,391 - BERTopic - Dimensionality - Completed ✓
2025-08-06 15:53:56,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-06 15:53:56,401 - BERTopic - Cluster - Completed ✓
2025-08-06 15:53:56,483 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-06 15:53:56,508 - BERTopic - Representation - Completed ✓


In [None]:
topics_neg_tuned, probs_neg_tuned = topic_model_neg_tuned.fit_transform(texts_neg)

2025-08-06 15:53:56,554 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2025-08-06 15:54:07,886 - BERTopic - Embedding - Completed ✓
2025-08-06 15:54:07,887 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-06 15:54:08,598 - BERTopic - Dimensionality - Completed ✓
2025-08-06 15:54:08,684 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-06 15:54:08,694 - BERTopic - Cluster - Completed ✓
2025-08-06 15:54:08,698 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-06 15:54:08,715 - BERTopic - Representation - Completed ✓


### View Topics

In [None]:
topic_model_pos_tuned.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,58,-1_teman_perkuliahan_kampus_materi,"[teman, perkuliahan, kampus, materi, event, pelajaran, mudah, belajar, organisasi, dosen]","[pengalaman organisasi ikuti perkuliahan mengikuti organisasi event event pengalaman teman teman, dosen dosen kampus friendly materi mudah dipelajari teman teman pelajaran perkuliahan, lingkungan pertemanan temui perkuliahan supportive so far menjatuhkan merugikan opportunity disediakan kampus menambah penghasilan membuka lowongan pekerjaan part time full time mahasiswa matang dunia kerja lulus proses pembelajaran sebenar binus nyaman fasilitas infrastruktur materi kurikulum dipahami sesuai standar terbaru tren terbaru program program unit unit layanan konseling layanan riset membantu mahasiwa kebutuhan individu berbeda beda inti pengalaman pengalaman dapatkan kuliah bertemu circle pertemanan membangun solid fl fp melatih public speaking join ukm himpunan belajar project event management teamwork part time magang it division bina nusantara berperan membentuk binus menghasilkan lulusan berkualitas salah program event partnership penelitian ditawarkan murid aktif explore gtu ambis setuju setuju binus dibilang bagus]"
1,0,58,0_kelas_kuliah_kegiatan_kesempatan,"[kelas, kuliah, kegiatan, kesempatan, kerja, mata, hal, organisasi, mahasiswa, diskusi]","[berdiskusi kelas dosen menambah wawasan melatih berpikir kritis menikmati kerja kelompok belajar kerja mengenal karakter teman, perkuliahan menikmati suasana diskusi kelas mendorong berpikir kritis terbuka sudut pandang kesempatan terlibat organisasi kemahasiswaan kegiatan mengembangkan keterampilan kepemimpinan tim memperluas jaringan pertemanan lintas jurusan angkatan pengalaman pengalaman memperkaya wawasan akademik membentuk karakter kesiapan menghadapi dunia profesional, perkuliahan kesempatan belajar hal hal relevan minat menikmati diskusi diskusi kelas membuka wawasan suasana akademik mendorong berpikir kritis berkembang pribadi senang terlibat kegiatan organisasi proyek kolaboratif memperluas jaringan keterampilan kelas]"
2,1,42,1_lingkungan_pertemanan_perkuliahan_nyaman,"[lingkungan, pertemanan, perkuliahan, nyaman, kuliah, mendukung, kampus, akademik, menyenangkan, belajar]","[suasana lingkungan pertemanan kuliah belajar memiliki teman keluarga lingkungan, perkuliahan lingkungan kampus nyaman kondusif belajar teman teman suportif aktif kegiatan pengalaman kuliah menyenangkan, lingkungan kampus mendukung mahasiswa mendalami ilmu sesi perkuliahan kegiatan lingkungan positif teman teman dosen suportif perkuliahan salah fasilitas kampus mendukung kesehatan mental curhat psikolog kampus gratis]"
3,2,25,2_fasilitas_kampus_makanan_mahasiswa,"[fasilitas, kampus, makanan, mahasiswa, disediakan, peluang, program, memadai, magang, tugas]","[fasilitas kampus bagus program kemahasiswaan mendukung mahasiswa belajar, fasilitas disediakan kampus lengkap memadai dibandingkan kampus indonesia bermimpi memiliki fasilitas dosen mengajar memiliki kualitas kampus mensyaratkan gelar minimal s mengajar materi relevan mendalam menyadari dosen memiliki kemampuan mengajar memadai teman teman suportif menemukan kelompok cocok suasana belajar nyaman, fasilitas disediakan kampus lengkap mendukung proses belajar perpustakaan nyaman ruang diskusi bersantai fasilitas belajar efektif tugas tugas perkuliahan maksimal]"
4,3,19,3_materi_langsung_pembelajaran_memahami,"[materi, langsung, pembelajaran, memahami, hal, pelajaran, mata, science, mengakses, mengerti]","[sistem pembelajaran binus terstruktur teregulasi mencari materi pertemuan mengakses langsung binusmaya, materi berkaitan machine learning artificial intelligence pengembangan web langsung mengimplementasikan teori proyek nyata hal terlibat termotivasi belajar hasil langsung diuji praktis, materi materi dibilang terstruktur mudah memahami mata pelajaran jurusan senang sistem binus merekam kelas mengakses informasi tangkap kelas]"
5,4,17,4_teman_pengalaman_lingkungan_kakak,"[teman, pengalaman, lingkungan, kakak, sehat, asik, tingkat, temen, dapatkan, kuliah]","[pengalaman dapatkan menjalani perkuliahan memiliki teman teman suportif pengalaman mengikuti perlombaan paduan, pengalaman pengalaman bertemu orang sejurusan dosen pengalaman pengalaman ' kuliah ' i pengalaman berkuliah sun shine and rainbow proses, teman teman jajanan lingkungan teman asik lingkungan bagus kakak tingkat supportive]"
6,5,13,5_bertemu_orang_koneksi_teman,"[bertemu, orang, koneksi, teman, seru, latar, memperbanyak, berteman, pikiran, bertukar]","[suasana bertemu teman nongkrong seru bebas asik seru mengenal orang, bertemu teman teman menambah relasi koneksi orang temui, bertemu orang orang latar hal bertemu orang orang hebat support]"
7,6,8,6_materi_dosen_terstruktur_mudah,"[materi, dosen, terstruktur, mudah, kampus, mengerti, mencerna, membimbing, sekeliling, jelasin]","[dosen dosen kampus materi terstruktur, dosen dosen kampus materi terstruktur mengerti materi mudah, dosen dosen kampus materi terstruktur teman teman mengerti materi mudah]"
8,7,8,7_teman_perkuliahan_fun_matkul,"[teman, perkuliahan, fun, matkul, temui, jenjang, san, design, canggung, kaku]","[kelas fun dosen membawakan materi seru main teman teman, matkul jurusan design didukung dosen pemaparan materi kaku teman teman enjoy kelas matkul, teman teman orang orang temui jenjang perkuliahan membantu perkuliahan fun]"
9,8,8,8_kampus_teman_berinteraksi_easy,"[kampus, teman, berinteraksi, easy, seangkatan, ikatan, going, sosialisasi, mempelajari, strategis]","[lokasi kampus strategis teman teman seangkatan seru kompak, pertemanan kampus teman teman kampus easy going perkuliahan bosan, berinteraksi teman teman kampus kelas kelas berinteraksi semangat menjalani kampus]"


In [None]:
topic_model_neg_tuned.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,244,0_kampus_dosen_mahasiswa_fasilitas,"[kampus, dosen, mahasiswa, fasilitas, wifi, kuliah, hal, kelas, materi, tugas]","[fasilitas mahasiswa tugas kelas berukuran besar mengurangi interaksi dosen mahasiswa, dosen dosen memahami materi mengajar sulit mengajarkan mahasiswa, aksesibilitas kampus rumit nyaman menunggu mata kuliah nyaman koneksi wifi kampus sulit absen kuliah]"
1,1,50,1_toilet_tisu_fasilitas_kampus,"[toilet, tisu, fasilitas, kampus, wifi, bersih, gedung, lambat, pressure, memadai]","[fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus]"
2,2,7,2_tissue_toilet_fasilitas_jorok,"[tissue, toilet, fasilitas, jorok, nyaman, ketidaklengkapan, diperhatikan, disaat, kaca, diserap]","[fasilitas toilet tissue ruangan kelas panas kaca, toilet tissue jorok dikit malas toilet, fasilitas toilet kampus jorok nyaman tissue nyaman]"


topic_model_pos_tuned.visualize_barchart().show()

topic_model_neg_tuned.visualize_barchart().show()

topic_model_pos_tuned.visualize_documents(texts_pos)

topic_model_neg_tuned.visualize_documents(texts_neg)

## **Evaluation**

**Dokumentasi tuning**: https://docs.google.com/spreadsheets/d/1PG2Y5-kA4ENGeOc-iRbEOYQKpEy0A7wBGyYU8PS_dOs/edit?gid=529725999#gid=529725999

run this but don't restart the session

In [None]:
from utils.topic_evaluation import calculate_coherence_score, calculate_irbo, evaluate_topics

In [None]:
def get_all_topics(topic_model_pos, topic_model_neg):
    topics_pos = topic_model_pos.get_topics()
    topics_neg = topic_model_neg.get_topics()

    all_topics_pos = [[word for word, _ in word_list] for word_list in topics_pos.values()]
    all_topics_neg = [[word for word, _ in word_list] for word_list in topics_neg.values()]

    print("="*5,"POSITIVE", "="*5)
    print(len(all_topics_pos))
    print(all_topics_pos)
    print("\n", "="*5,"NEGATIVE", "="*5)
    print(len(all_topics_neg))
    print(all_topics_neg)

    return all_topics_pos, all_topics_neg

all_topics_pos, all_topics_neg = get_all_topics(topic_model_pos, topic_model_neg)

===== POSITIVE =====
2
[['teman', 'kampus', 'dosen', 'materi', 'belajar', 'perkuliahan', 'lingkungan', 'kuliah', 'mahasiswa', 'hal'], ['makanan', 'kampus', 'kantin', 'fasilitas', 'teman', 'area', 'makan', 'peluang', 'pilihan', 'enak']]

 ===== NEGATIVE =====
5
[['praktikum', 'makan', 'kantin', 'nyaman', 'sepi', 'krg', 'ribet', 'menjalankan', 'tua', 'alat'], ['dosen', 'kampus', 'mahasiswa', 'kelas', 'kuliah', 'materi', 'hal', 'fasilitas', 'mata', 'tugas'], ['wifi', 'kampus', 'koneksi', 'susah', 'absensi', 'jaringan', 'internet', 'fasilitas', 'terkoneksi', 'lambat'], ['toilet', 'fasilitas', 'tisu', 'tissue', 'bersih', 'kampus', 'sabun', 'tangan', 'nyaman', 'kotor'], ['toilet', 'wifi', 'tisu', 'fasilitas', 'kampus', 'bersih', 'lambat', 'terkoneksi', 'gedung', 'mati']]


### 1. Topic Coherence

https://radimrehurek.com/gensim/models/coherencemodel.html

In [None]:
print("Topic Coherence Score for Positive Sentiment")
positive_coherence_scores = calculate_coherence_score(
    texts_pos, all_topics_pos, print_results=True
)
print("Topic Coherence Score for Negative Sentiment")
negative_coherence_scores = calculate_coherence_score(
    texts_neg, all_topics_neg, print_results=True
)

Topic Coherence Score for Positive Sentiment
c_v: 0.314
u_mass: -6.161
c_uci: -2.8
c_npmi: -0.04
Topic Coherence Score for Negative Sentiment
c_v: 0.43
u_mass: -6.353
c_uci: -3.342
c_npmi: -0.018


### 2. Topic Diversity

https://github.com/silviatti/topic-model-diversity/blob/master/diversity_metrics.py

In [None]:
positive_irbo_scores = calculate_irbo(all_topics_pos, print_results=True)
print(f"Topic Diversity Score for Positive Sentiment: {positive_irbo_scores}")
negative_irbo_scores = calculate_irbo(all_topics_neg, print_results=True)
print(f"Topic Diversity Score for Negative Sentiment {negative_irbo_scores}")

IRBO: 0.723
Topic Diversity Score for Positive Sentiment: 0.723
IRBO: 0.842
Topic Diversity Score for Negative Sentiment 0.842


## Visualization (Ignore below)

In [44]:
def visualize_top_models(top_models_df, label, texts):
    """
    Evaluasi dan visualisasi top 5 model BERTopic dengan export Excel

    Parameters:
    - top_models_df: DataFrame berisi top models yang sudah diurutkan berdasarkan c_v score
    - label: Label untuk data ('pos' atau 'neg')
    - texts: List dokumen teks untuk training
    """

    # Create Excel workbook untuk semua model
    wb = Workbook()
    wb.remove(wb.active)  # Remove default sheet

    for idx, row in top_models_df.iterrows():
        print(f"\n==========================")
        print(f"Top Model {idx+1} ({label.upper()} Data)")
        print(f"==========================")

        # Load components
        emb_model_name = row["Embedding Model"]
        emb_model = SentenceTransformer(emb_model_name)
        umap_params = row["UMAP"]
        hdbscan_params = row["HDBSCAN"]
        bertopic_params = row["BERTopic Params"]
        cv_score = row["c_v"]

        # Print full parameter config
        print("Parameters:")
        print(f"Embedding Model  : {emb_model_name}")
        print(f"UMAP Params      : {umap_params}")
        print(f"HDBSCAN Params   : {hdbscan_params}")
        print(f"BERTopic Params  : {bertopic_params}")
        print(f"C_V Coherence Score: {cv_score:.4f}")

        # Build models
        umap_model = None if pd.isna(umap_params) else UMAP(**umap_params)
        hdbscan_model = None if pd.isna(hdbscan_params) else hdbscan.HDBSCAN(**hdbscan_params)

        # Rebuild and fit model
        topic_model = BERTopic(
            embedding_model=emb_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            top_n_words=bertopic_params["top_n_words"],
            nr_topics=bertopic_params["nr_topics"]
        )

        topics, probs = topic_model.fit_transform(texts)

        # Topic Info
        topic_info = topic_model.get_topic_info()
        print("\nTopic Info:")
        display(topic_info)

        # Visualizations
        print("\nBarchart:")
        topic_model.visualize_barchart(top_n_topics=6).show()

        print("\nDocument Visualization:")
        topic_model.visualize_documents(texts).show()

        # ========== NEW: Excel Export Functionality ==========

        # Create worksheet untuk model ini
        ws_name = f"Model_{idx+1}_{label}"
        ws = wb.create_sheet(title=ws_name)

        # Header untuk informasi model
        ws['A1'] = f"Model {idx+1} - {label.upper()} Data"
        ws['A1'].font = Font(bold=True, size=14)

        ws['A2'] = f"C_V Score: {cv_score:.4f}"
        ws['A2'].font = Font(bold=True)

        ws['A3'] = f"Embedding Model: {emb_model_name}"
        ws['A4'] = f"UMAP Params: {str(umap_params)}"
        ws['A5'] = f"HDBSCAN Params: {str(hdbscan_params)}"
        ws['A6'] = f"BERTopic Params: {str(bertopic_params)}"

        # Kosongkan beberapa baris
        current_row = 8

        # Header untuk topic analysis
        ws[f'A{current_row}'] = "Topic Analysis with Sample Documents"
        ws[f'A{current_row}'].font = Font(bold=True, size=12)
        current_row += 2

        # Process setiap topic
        for topic_id in topic_info['Topic'].values:
            if topic_id == -1:  # Skip outlier topic
                continue

            # Get topic information
            topic_words = topic_model.get_topic(topic_id)
            topic_name = f"Topic {topic_id}"
            topic_keywords = ", ".join([word for word, _ in topic_words[:10]])

            # Get documents untuk topic ini
            topic_docs_indices = [i for i, topic in enumerate(topics) if topic == topic_id]
            topic_documents = [texts[i] for i in topic_docs_indices]

            # Random sampling 20 dokumen atau semua jika kurang dari 20
            if len(topic_documents) > 20:
                sample_docs = random.sample(topic_documents, 20)
            else:
                sample_docs = topic_documents

            # Write topic header
            ws[f'A{current_row}'] = topic_name
            ws[f'A{current_row}'].font = Font(bold=True)
            ws[f'A{current_row}'].fill = PatternFill(start_color="E6E6FA", end_color="E6E6FA", fill_type="solid")

            ws[f'B{current_row}'] = f"Keywords: {topic_keywords}"
            ws[f'B{current_row}'].font = Font(italic=True)

            ws[f'C{current_row}'] = f"Total Docs: {len(topic_documents)}"
            ws[f'D{current_row}'] = f"Sample Size: {len(sample_docs)}"

            current_row += 1

            # Column headers untuk documents
            ws[f'A{current_row}'] = "No."
            ws[f'B{current_row}'] = "Document Text"
            ws[f'C{current_row}'] = "Relevant? (Y/N)"
            ws[f'D{current_row}'] = "Notes"

            # Style headers
            for col in ['A', 'B', 'C', 'D']:
                ws[f'{col}{current_row}'].font = Font(bold=True)
                ws[f'{col}{current_row}'].fill = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid")

            current_row += 1

            # Write sample documents
            for doc_idx, doc in enumerate(sample_docs, 1):
                ws[f'A{current_row}'] = doc_idx
                ws[f'B{current_row}'] = doc[:500] + "..." if len(doc) > 500 else doc  # Limit panjang teks
                ws[f'C{current_row}'] = ""  # Untuk manual evaluation
                ws[f'D{current_row}'] = ""  # Untuk catatan

                # Wrap text untuk column B
                ws[f'B{current_row}'].alignment = Alignment(wrap_text=True, vertical='top')

                current_row += 1

            current_row += 2  # Space antar topics

        # Adjust column widths
        ws.column_dimensions['A'].width = 5
        ws.column_dimensions['B'].width = 60
        ws.column_dimensions['C'].width = 15
        ws.column_dimensions['D'].width = 30

        print(f"✓ Data untuk Model {idx+1} telah ditambahkan ke Excel")

    # Save Excel file
    filename = f"bertopic_evaluation_{label}_top5_models.xlsx"
    wb.save(filename)
    print(f"\n✓ File Excel berhasil disimpan: {filename}")
    print(f"  File berisi {len(top_models_df)} worksheet untuk evaluasi manual")
    print(f"  Setiap worksheet berisi informasi parameter model dan sample dokumen per topik")

    return filename

In [47]:
visualize_top_models(top5_pos, "positive", texts_pos)


Top Model 394 (POSITIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 5, 'min_dist': 0.0, 'metric': 'cosine'}
HDBSCAN Params   : {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.5}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 4}
C_V Coherence Score: 0.7510

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11,-1_kuliah_teman_lingkungan_sekolah,"[kuliah, teman, lingkungan, sekolah, pertemanan]","[jadwal kuliah sepenuh sekolah ekspansi komunitas pilihan orang kenalan sekolah mandiri transportasi ojol berpergian, lingkungan kuliah dorongan berkembang lingkungan membantu melaksanakan kegiatan perkuliahan kemahasiswaan menciptakan kondisi ideal mahasiswa, jam kuliah fleksibel kesempatan mahasiswa kuliah hal membantu mahasiswa kota memenuhi kebutuhan mengikuti perkuliahan hambatan]"
1,0,277,0_teman_kampus_dosen_materi,"[teman, kampus, dosen, materi, belajar]","[perkuliahan lingkungan kampus nyaman kondusif belajar teman teman suportif aktif kegiatan pengalaman kuliah menyenangkan, dosen dosen kampus materi terstruktur teman teman mengerti materi mudah, teman teman mendukung membantu perkuliahan teman teman belajar bareng memahami materi]"
2,1,7,1_makanan_kampus_area_hidup,"[makanan, kampus, area, hidup, nyaman]","[fasilitas kampus area kampus area hijau makanan, makanan kampus bakmi efata disekitar kampus makanan enak, menikmati makanan beragam lezat cita khas mudah ditemukan biaya hidup tergolong murah hidup nyaman mengeluarkan uang]"
3,2,6,2_peluang_binus_mahasiswa_indonesia,"[peluang, binus, mahasiswa, indonesia, keuntungan]","[binus salah kampus ternama indonesia alasan kuliah pengajaran fasilitas magang, peluang peluang ditawarkan kampus pilihan makanan, peluang universitas mahasiswa mahasiswa peluang lapak berkembang]"



Barchart:



Document Visualization:


✓ Data untuk Model 394 telah ditambahkan ke Excel

Top Model 540 (POSITIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 15, 'min_dist': 0.3, 'metric': 'euclidean'}
HDBSCAN Params   : {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.5}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 6}
C_V Coherence Score: 0.7130

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,16,-1_kampus_fasilitas_lengkap_teman,"[kampus, fasilitas, lengkap, teman, makanan]","[fasilitas kampus lengkap bazar makanan ga susah cari makan, hal perkuliahan dosen seru fasilitas lengkap acara, fasilitas disediakan kampus lengkap memadai dibandingkan kampus indonesia bermimpi memiliki fasilitas dosen mengajar memiliki kualitas kampus mensyaratkan gelar minimal s mengajar materi relevan mendalam menyadari dosen memiliki kemampuan mengajar memadai teman teman suportif menemukan kelompok cocok suasana belajar nyaman]"
1,0,8,0_fasilitas_belajar_kelas_akses,"[fasilitas, belajar, kelas, akses, ruangan]","[fasilitas perpustakaan kampus lengkap nyaman belajar mandiri kelompok koleksi buku jurnal mendukung pengerjaan tugas skripsi ditambah suasana tenang kondusif fokus, fasilitas disediakan kampus lengkap mendukung proses belajar perpustakaan nyaman ruang diskusi bersantai fasilitas belajar efektif tugas tugas perkuliahan maksimal, lingkungan kampus mendukung gaya hidup sehat tersedia air minum ulang gratis fasilitas pejalan kaki memadai akses transportasi mudah pulang pergi menghawatirkan akses transportasi internal dosen departemen terbuka aspirasi kritik saran dikabulkan]"
2,1,277,1_teman_kampus_dosen_materi,"[teman, kampus, dosen, materi, belajar]","[dosen dosen kampus friendly materi mudah dipelajari teman teman pelajaran perkuliahan, lingkungan teman teman dosen membantu pembelajaran, perkuliahan lingkungan kampus nyaman kondusif belajar teman teman suportif aktif kegiatan pengalaman kuliah menyenangkan]"



Barchart:



Document Visualization:


✓ Data untuk Model 540 telah ditambahkan ke Excel

Top Model 291 (POSITIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}
HDBSCAN Params   : {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 5}
C_V Coherence Score: 0.6790

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9,-1_peluang_alat_mahasiswa_binus,"[peluang, alat, mahasiswa, binus, indonesia]","[ruangan praktikum alat alat ruangan belajar praktik materi pembelajaran, peluang peluang ditawarkan kampus pilihan makanan, peluang universitas mahasiswa mahasiswa peluang lapak berkembang]"
1,0,245,0_teman_dosen_kampus_materi,"[teman, dosen, kampus, materi, lingkungan]","[lingkungan teman teman dosen membantu pembelajaran, teman teman mendukung membantu perkuliahan teman teman belajar bareng memahami materi, dosen dosen kampus materi terstruktur teman teman mengerti materi mudah]"
2,1,31,1_fasilitas_kampus_makanan_lengkap,"[fasilitas, kampus, makanan, lengkap, belajar]","[fasilitas kampus bagus program kemahasiswaan mendukung mahasiswa belajar, fasilitas fasilitas kampus fasilitas memudahkan darurat menge print tugas proposal tersedia printing seberang kampus, fasilitas disediakan kampus lengkap mendukung proses belajar perpustakaan nyaman ruang diskusi bersantai fasilitas belajar efektif tugas tugas perkuliahan maksimal]"
3,2,16,2_diskusi_belajar_memiliki_pengalaman,"[diskusi, belajar, memiliki, pengalaman, dosen]","[perkuliahan kesempatan belajar hal hal relevan minat menikmati diskusi diskusi kelas membuka wawasan suasana akademik mendorong berpikir kritis berkembang pribadi senang terlibat kegiatan organisasi proyek kolaboratif memperluas jaringan keterampilan kelas, perkuliahan menikmati suasana diskusi kelas mendorong berpikir kritis terbuka sudut pandang kesempatan terlibat organisasi kemahasiswaan kegiatan mengembangkan keterampilan kepemimpinan tim memperluas jaringan pertemanan lintas jurusan angkatan pengalaman pengalaman memperkaya wawasan akademik membentuk karakter kesiapan menghadapi dunia profesional, lingkungan pertemanan temui perkuliahan supportive so far menjatuhkan merugikan opportunity disediakan kampus menambah penghasilan membuka lowongan pekerjaan part time full time mahasiswa matang dunia kerja lulus proses pembelajaran sebenar binus nyaman fasilitas infrastruktur materi kurikulum dipahami sesuai standar terbaru tren terbaru program program unit unit layanan konseling layanan riset membantu mahasiwa kebutuhan individu berbeda beda inti pengalaman pengalaman dapatkan kuliah bertemu circle pertemanan membangun solid fl fp melatih public speaking join ukm himpunan belajar project event management teamwork part time magang it division bina nusantara berperan membentuk binus menghasilkan lulusan berkualitas salah program event partnership penelitian ditawarkan murid aktif explore gtu ambis setuju setuju binus dibilang bagus]"



Barchart:



Document Visualization:


✓ Data untuk Model 291 telah ditambahkan ke Excel

Top Model 545 (POSITIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 15, 'min_dist': 0.3, 'metric': 'euclidean'}
HDBSCAN Params   : {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.3}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 3}
C_V Coherence Score: 0.6790

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8,-1_fasilitas_bareng_memadai_memiliki,"[fasilitas, bareng, memadai, memiliki, lengkap]","[kualitas pendidikan dipegang kampus fasilitas kelas aplikasi memadai nyaman belajar kelas, fasilitas disediakan kampus lengkap memadai dibandingkan kampus indonesia bermimpi memiliki fasilitas dosen mengajar memiliki kualitas kampus mensyaratkan gelar minimal s mengajar materi relevan mendalam menyadari dosen memiliki kemampuan mengajar memadai teman teman suportif menemukan kelompok cocok suasana belajar nyaman, fleksibilitas ditawarkan berkuliah bebas mengatur jam belajar pribadi lokasi perkuliahan hal fasilitas zoom recording disediakan universitas]"
1,0,264,0_teman_dosen_materi_kampus,"[teman, dosen, materi, kampus, belajar]","[lingkungan teman teman dosen membantu pembelajaran, dosen dosen kampus materi terstruktur teman teman mengerti materi mudah, teman teman mendukung membantu perkuliahan teman teman belajar bareng memahami materi]"
2,1,29,1_fasilitas_kampus_makanan_lengkap,"[fasilitas, kampus, makanan, lengkap, memadai]","[makanan fasilitas wilayah kampus kampus terpenuhi, fasilitas disediakan kampus lengkap mendukung proses belajar perpustakaan nyaman ruang diskusi bersantai fasilitas belajar efektif tugas tugas perkuliahan maksimal, fasilitas fasilitas kampus fasilitas memudahkan darurat menge print tugas proposal tersedia printing seberang kampus]"



Barchart:



Document Visualization:


✓ Data untuk Model 545 telah ditambahkan ke Excel

Top Model 305 (POSITIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}
HDBSCAN Params   : {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.3}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 3}
C_V Coherence Score: 0.6750

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,48,-1_teman_kelas_orang_lingkungan,"[teman, kelas, orang, lingkungan, hal]","[materi materi enjoy belajar teman teman asik, teman teman lingkungan bantu pertemanan menyenangkan stres, menemukan teman teman asik lingkungan sehat materi dosen bermanfaat]"
1,0,247,0_teman_kampus_dosen_materi,"[teman, kampus, dosen, materi, belajar]","[perkuliahan lingkungan kampus nyaman kondusif belajar teman teman suportif aktif kegiatan pengalaman kuliah menyenangkan, dosen dosen kampus materi terstruktur teman teman mengerti materi mudah, teman teman mendukung membantu perkuliahan teman teman belajar bareng memahami materi]"
2,1,6,1_peluang_binus_mahasiswa_indonesia,"[peluang, binus, mahasiswa, indonesia, keuntungan]","[binus salah kampus ternama indonesia alasan kuliah pengajaran fasilitas magang, peluang peluang ditawarkan kampus pilihan makanan, peluang universitas mahasiswa mahasiswa peluang lapak berkembang]"



Barchart:



Document Visualization:


✓ Data untuk Model 305 telah ditambahkan ke Excel

✓ File Excel berhasil disimpan: bertopic_evaluation_positive_top5_models.xlsx
  File berisi 5 worksheet untuk evaluasi manual
  Setiap worksheet berisi informasi parameter model dan sample dokumen per topik


'bertopic_evaluation_positive_top5_models.xlsx'

In [48]:
visualize_top_models(top5_neg, "negative", texts_neg)


Top Model 787 (NEGATIVE Data)
Parameters:
Embedding Model  : sentence-transformers/multi-qa-MiniLM-L6-cos-v1
UMAP Params      : {'n_neighbors': 10, 'min_dist': 0.1, 'metric': 'cosine'}
HDBSCAN Params   : {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.3}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 5}
C_V Coherence Score: 0.7270

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,7,0_tissue_toilet_fasilitas_jorok,"[tissue, toilet, fasilitas, jorok, nyaman]","[fasilitas toilet tissue ruangan kelas panas kaca, fasilitas toilet kampus jorok nyaman tissue nyaman, toilet tissue jorok dikit malas toilet]"
1,1,51,1_toilet_tisu_fasilitas_kampus,"[toilet, tisu, fasilitas, kampus, wifi]","[fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus]"
2,2,243,2_kampus_dosen_mahasiswa_fasilitas,"[kampus, dosen, mahasiswa, fasilitas, wifi]","[perkuliahan fasilitas kampus device mumpuni mata kuliah spesifik dosen memiliki kompetensi mengajar mahasiswa termotivasi belajar berkembang, aksesibilitas kampus rumit nyaman menunggu mata kuliah nyaman koneksi wifi kampus sulit absen kuliah, fasilitas mahasiswa tugas kelas berukuran besar mengurangi interaksi dosen mahasiswa]"



Barchart:



Document Visualization:


✓ Data untuk Model 787 telah ditambahkan ke Excel

Top Model 347 (NEGATIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 5, 'min_dist': 0.1, 'metric': 'euclidean'}
HDBSCAN Params   : {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.5}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 5}
C_V Coherence Score: 0.7190

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7,-1_dosen_layak_seenak_kp,"[dosen, layak, seenak, kp, jadwal]","[dosen layak dosen kapasitas kantin besar, perkuliahan dosen ribet mempersulit mahasiswa dosen layak dosen, fasilitas toilet bersih tisu mencukupi dosen seenak kp jadwal kp menumpuk dosen]"
1,0,260,0_kampus_dosen_fasilitas_mahasiswa,"[kampus, dosen, fasilitas, mahasiswa, kelas]","[wifi kampus bermasalah fasilitas toilet bersih, berbicara hal dii sebenar dibenci fasilitas kampus ruang kelas toilet internet library pojok bersantai mendukung kehidupan perkuliahan perfect bayangkan kayak diluar negeri kampus kurikulum materi belajar gak relevan ppt singkat informatif dipakai belajar persiapan ujian kunci inti ilmu dosen ketemu dosen baca ppt menyuruh murid materi ppt pertemuan gak efektif untung sesi kayak responsi pengalaman terburuk skripsi dospem undi otomatis by system jabatan susah dihubungi membimbing oh tambahan binus susah ijin susulan kebanyakan temen memilih pakai jatah absen terpakai semester pendek paham disiplin oiya kelas pengganti kampus dosen mengabari hadir ssc detik detik parkiran akses jalan binus kemanggisan jujur sempit pelajaran pindah kampus kanapa ga bangun gedung fakultas gtu benak delain ruang terbuka outdoor bersantai ga taman danau kayak univ gtu wkwkw, dosen dosen memahami materi mengajar sulit mengajarkan mahasiswa]"
2,1,20,1_wifi_internet_susah_terkoneksi,"[wifi, internet, susah, terkoneksi, kampus]","[wifi kampus terhubung hal internet susah, wifi kampus susah diakses kuota mengakses internet, wifi sulit terkoneksi susah internet stabil]"
3,2,8,2_jam_kuliah_mata_jadwal,"[jam, kuliah, mata, jadwal, pagi]","[jam mata kuliah gap mata kuliah mata kuliah bolak kampus, mata kuliah bersifat general kampus mempelajari mendaftar kampus belajar jam jam mata kuliah, jadwal mata kuliah pagi jam kuliah pagi jam pagi masuk kelas jam]"
4,3,6,3_lancar_bersih_mencukupi_fasilitas,"[lancar, bersih, mencukupi, fasilitas, tisu]","[fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus]"



Barchart:



Document Visualization:


✓ Data untuk Model 347 telah ditambahkan ke Excel

Top Model 650 (NEGATIVE Data)
Parameters:
Embedding Model  : sentence-transformers/multi-qa-MiniLM-L6-cos-v1
UMAP Params      : {'n_neighbors': 5, 'min_dist': 0.1, 'metric': 'euclidean'}
HDBSCAN Params   : {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.3}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 4}
C_V Coherence Score: 0.7130

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,14,-1_tissue_toilet_deadline_stress,"[tissue, toilet, deadline, stress, fasilitas]","[lift rusak menyebabkan antrian disaat jam kelas fasilitas toilet menyediakan tissue, fasilitas toilet kampus jorok nyaman tissue nyaman, toilet tissue jorok dikit malas toilet]"
1,0,236,0_kampus_dosen_mahasiswa_fasilitas,"[kampus, dosen, mahasiswa, fasilitas, wifi]","[fasilitas mahasiswa tugas kelas berukuran besar mengurangi interaksi dosen mahasiswa, aksesibilitas kampus rumit nyaman menunggu mata kuliah nyaman koneksi wifi kampus sulit absen kuliah, dosen dosen memahami materi mengajar sulit mengajarkan mahasiswa]"
2,1,23,1_toilet_tisu_bersih_fasilitas,"[toilet, tisu, bersih, fasilitas, pressure]","[dosen ramah toilet lantai kotor toilet gedung bersih toilet gedung penuh, fasilitas kuliah toilet parkir toilet bersih kekurangan tisu parkir teratur penuh, fasilitas toilet disediakan tisu toilet bersih kantin kampus antrian beli makan]"
3,2,28,2_toilet_wifi_tisu_kampus,"[toilet, wifi, tisu, kampus, fasilitas]","[fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet tisu absen wifi terkoneksi, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus]"



Barchart:



Document Visualization:


✓ Data untuk Model 650 telah ditambahkan ke Excel

Top Model 313 (NEGATIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}
HDBSCAN Params   : {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.3}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 3}
C_V Coherence Score: 0.6900

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,25,-1_kelas_tugas_hal_dosen,"[kelas, tugas, hal, dosen, perkuliahan]","[hal i perkuliahan tugas bersamaan mata kuliah kondisi kewalahan mengatur energi diselesaikan sisi melatih manajemen ketahanan mental, jadwal kuliah berubah mendadak akibat dosen hadir kesulitan mengatur kegiatan perkuliahan kelas padat melelahkan mengurangi fokus mengikuti mata kuliah, perkuliahan hal i jadwal perkuliahan padat tugas tugas menumpuk bersamaan hal sulit membagi optimal mata kuliah penyampaian materi menarik aplikatif proses belajar menyenangkan berusaha mengambil pelajaran pengalaman]"
1,0,168,0_kampus_wifi_fasilitas_toilet,"[kampus, wifi, fasilitas, toilet, tisu]","[fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus]"
2,1,108,1_dosen_materi_kuliah_mahasiswa,"[dosen, materi, kuliah, mahasiswa, mata]","[dosen materi menarik dosen penjelasan tugas, dosen materi materi materi dosen materi hal menghambat belajar berpengaruh hasil ujian, dosen dosen memahami materi mengajar sulit mengajarkan mahasiswa]"



Barchart:



Document Visualization:


✓ Data untuk Model 313 telah ditambahkan ke Excel

Top Model 346 (NEGATIVE Data)


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


Parameters:
Embedding Model  : indobenchmark/indobert-base-p1
UMAP Params      : {'n_neighbors': 5, 'min_dist': 0.1, 'metric': 'euclidean'}
HDBSCAN Params   : {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.5}
BERTopic Params  : {'top_n_words': 5, 'nr_topics': 4}
C_V Coherence Score: 0.6810

Topic Info:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10,-1_tugas_menyebabkan_koneksi_berdekatan,"[tugas, menyebabkan, koneksi, berdekatan, deadline]","[kemacetan area kampus menyebabkan keterlambatan koneksi wifi absen mengganggu proses belajar, wifi disediakan kampus koneksi jaringan kesulitan tugas absen, deadline tugas berdekatan hal menyebabkan stress begadang menyelesaikan tugas]"
1,0,267,0_kampus_dosen_fasilitas_mahasiswa,"[kampus, dosen, fasilitas, mahasiswa, toilet]","[berbicara hal dii sebenar dibenci fasilitas kampus ruang kelas toilet internet library pojok bersantai mendukung kehidupan perkuliahan perfect bayangkan kayak diluar negeri kampus kurikulum materi belajar gak relevan ppt singkat informatif dipakai belajar persiapan ujian kunci inti ilmu dosen ketemu dosen baca ppt menyuruh murid materi ppt pertemuan gak efektif untung sesi kayak responsi pengalaman terburuk skripsi dospem undi otomatis by system jabatan susah dihubungi membimbing oh tambahan binus susah ijin susulan kebanyakan temen memilih pakai jatah absen terpakai semester pendek paham disiplin oiya kelas pengganti kampus dosen mengabari hadir ssc detik detik parkiran akses jalan binus kemanggisan jujur sempit pelajaran pindah kampus kanapa ga bangun gedung fakultas gtu benak delain ruang terbuka outdoor bersantai ga taman danau kayak univ gtu wkwkw, wifi kampus bermasalah fasilitas toilet bersih, nyaman fasilitas toilet minim air nyaman sistem pembelajaran mata kuliah menurunkan efektivitas pembelajaran jadwal perkuliahan berubah sistem penjadwalan kuliah disesuaikan dosen]"
2,1,18,1_wifi_internet_kampus_terkoneksi,"[wifi, internet, kampus, terkoneksi, susah]","[wifi kampus terhubung hal internet susah, wifi kampus susah diakses kuota mengakses internet, wifi sulit terkoneksi susah internet stabil]"
3,2,6,2_lancar_bersih_fasilitas_mencukupi,"[lancar, bersih, fasilitas, mencukupi, tisu]","[fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus, fasilitas toilet bersih tisu mencukupi wifi terkoneksi tugas lancar kampus]"



Barchart:



Document Visualization:


✓ Data untuk Model 346 telah ditambahkan ke Excel

✓ File Excel berhasil disimpan: bertopic_evaluation_negative_top5_models.xlsx
  File berisi 5 worksheet untuk evaluasi manual
  Setiap worksheet berisi informasi parameter model dan sample dokumen per topik


'bertopic_evaluation_negative_top5_models.xlsx'

In [51]:
positive_top5_results = pd.read_excel("results/bertopic_evaluation_positive_top5_models.xlsx")
negative_top5_results = pd.read_excel("results/bertopic_evaluation_negative_top5_models.xlsx")

## Try with best models

In [51]:
def build_and_predict_topics(texts, df, sentiment_value, **params):
    df_filtered = df[df['sentiment'] == sentiment_value][['cleaned_text']].copy()

    umap_model = UMAP(
        n_neighbors=params["n_neighbors"],
        min_dist=params["min_dist"],
        metric=params["umap_metric"],
        random_state=42
    )

    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=params["min_cluster_size"],
        min_samples=params["min_samples"],
        cluster_selection_epsilon=params["cluster_selection_epsilon"],
        prediction_data=True
    )

    embedding_model = SentenceTransformer(
        params["embedding_model_name"],
    )

    # Create BERTopic model
    best_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        top_n_words=params["top_n_words"],
        nr_topics=params["nr_topics"],
        verbose=True,
    )

    # Train
    topics, probs = best_model.fit_transform(texts)

    # Build predictions DataFrame
    df_pred = df_filtered.copy()
    df_pred['topic'] = topics
    df_pred['topic_proba'] = probs if probs is not None else None

    # Add topic descriptions (include outliers)
    topic_info = best_model.get_topic_info()
    df_pred = df_pred.merge(
        topic_info[["Topic", "Name"]],
        left_on="topic",
        right_on="Topic",
        how="left"
    ).drop(columns=["Topic"]).rename(columns={"Name": "topic_name"})

    return best_model, df_pred

### Positive

indobenchmark/indobert-base-p1	{'n_neighbors': 15, 'min_dist': 0.3, 'metric': 'euclidean'}	{'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.3}	{'top_n_words': 5, 'nr_topics': 6}

#### Alice: 

In [13]:
config = {
    "embedding_model_name": "indobenchmark/indobert-base-p1",
    "n_neighbors": 5,
    "min_dist": 0.0,
    "umap_metric": "cosine",
    "min_cluster_size": 5,
    "min_samples": 3,
    "cluster_selection_epsilon": 0.5,
    "top_n_words": 5,
    "nr_topics": 4,
}

positive_best_model, df_positive_pred = build_and_predict_topics(texts_pos, df_modeling, sentiment_value=1, **config)

No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434

In [61]:
df_positive_pred['topic_name'].value_counts()

topic_name
2_teman_kampus_dosen_materi             285
0_diskusi_kelas_suasana_kesempatan        9
1_ujian_teman_bareng_nilai                5
-1_menyesuaikan_mal_iringi_disekitar      2
Name: count, dtype: int64

In [67]:
df_positive_pred.to_csv(
    'results/df_positive_pred.csv',
    index=False
)

#### Brans: 

In [22]:
config = {
    "embedding_model_name": "indobenchmark/indobert-base-p1",
    "n_neighbors": 15,
    "min_dist": 0.3,
    "umap_metric": "euclidean",
    "min_cluster_size": 10,
    "min_samples": 3,
    "cluster_selection_epsilon": 0.3,
    "top_n_words": 5,
    "nr_topics": 6,
}

positive_best_model, df_positive_pred = build_and_predict_topics(texts_pos, df_modeling, sentiment_value=1, **config)

No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.
2025-08-14 07:05:18,376 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2025-08-14 07:07:26,855 - BERTopic - Embedding - Completed ✓
2025-08-14 07:07:26,856 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-14 07:07:33,248 - BERTopic - Dimensionality - Completed ✓
2025-08-14 07:07:33,249 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-14 07:07:33,424 - BERTopic - Cluster - Completed ✓
2025-08-14 07:07:33,426 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-08-14 07:07:33,504 - BERTopic - Representation - Completed ✓
2025-08-14 07:07:33,505 - BERTopic - Topic reduction - Reducing number of topics
2025-08-14 07:07:33,506 - BERTopic - Topic reduction - Number of topics (6) is equal or higher than the clustered topics(3).
2025-08-14 07:07:33,507 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-14 07:07:45,143 - BERTopic - Representation - Completed ✓


In [23]:
df_positive_pred['topic_name'].value_counts()

topic_name
1_teman_dosen_materi_kampus           249
0_fasilitas_kampus_lengkap_makanan     28
-1_teman_fasilitas_makanan_kampus      24
Name: count, dtype: int64

In [24]:
df_positive_pred.to_csv(
    'results/df_positive_pred2.csv',
    index=False
)

In [26]:
import os

save_path = "src/model/bertopic/best_positive_model2"

os.makedirs(os.path.dirname(save_path), exist_ok=True)
positive_best_model.save(save_path)



### Negative

sentence-transformers/multi-qa-MiniLM-L6-cos-v1	{'n_neighbors': 5, 'min_dist': 0.0, 'metric': 'cosine'}	{'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.3}	{'top_n_words': 5, 'nr_topics': 5}

In [52]:
config = {
    "embedding_model_name": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    "n_neighbors": 5,
    "min_dist": 0.0,
    "umap_metric": "cosine",
    "min_cluster_size": 5,
    "min_samples": 3,
    "cluster_selection_epsilon": 0.3,
    "top_n_words": 5,
    "nr_topics": 5,
}

negative_best_model, df_negative_pred = build_and_predict_topics(
    texts_neg, 
    df_modeling, 
    sentiment_value=0, 
    **config
)

2025-08-18 20:48:08,257 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 10/10 [00:08<00:00,  1.19it/s]
2025-08-18 20:48:16,706 - BERTopic - Embedding - Completed ✓
2025-08-18 20:48:16,708 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-18 20:48:17,463 - BERTopic - Dimensionality - Completed ✓
2025-08-18 20:48:17,474 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-18 20:48:17,571 - BERTopic - Cluster - Completed ✓
2025-08-18 20:48:17,574 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-08-18 20:48:17,733 - BERTopic - Representation - Completed ✓
2025-08-18 20:48:17,738 - BERTopic - Topic reduction - Reducing number of topics
2025-08-18 20:48:17,788 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-18 20:48:17,879 - BERTopic - Representation - Completed ✓
2025-08-18 20:48:17,890 - BERTopic - Topic reduction - Redu

In [20]:
df_negative_pred['topic_name'].value_counts()

topic_name
0_kampus_dosen_mahasiswa_fasilitas         244
1_toilet_tisu_fasilitas_kampus              40
2_toilet_wifi_gedung_lambat                  9
3_tissue_toilet_jorok_fasilitas              7
-1_staf_konsultasi_diandalkan_konsisten      1
Name: count, dtype: int64

In [21]:
df_negative_pred.to_csv(
    'results/df_negative_pred.csv',
    index=False
)

### Save Best Model

In [53]:
import os

save_path = "src/models/bertopic/best_negative_model_try2"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

embedding_model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
negative_best_model.save(
    save_path, 
    serialization="safetensors", 
    save_ctfidf=True, 
    save_embedding_model=embedding_model
)

## Predict New Data

In [35]:
from utils.topic_prediction import prepare_dataset

In [54]:
def load_and_predict_topics(texts, model_path, df, sentiment_label):
    absolute_save_path = os.path.abspath(model_path)
    model = BERTopic.load(absolute_save_path)

    topics, probs = model.transform(texts)

    df_pred = df[df['sentiment'] == sentiment_label].copy()
    df_pred["topic"] = topics
    df_pred["topic_proba"] = probs

    topic_info = model.get_topic_info()
    df_pred = df_pred.merge(
        topic_info[["Topic", "Name"]],
        left_on="topic",
        right_on="Topic",
        how="left"
    ).drop(columns=["Topic"]).rename(columns={"Name": "topic_name"})

    return df_pred

In [38]:
from utils.sentiment_prediction import *

df_new = pd.read_csv(
    'src/data/dummy_test_data.csv'
)

df_new['sentiment'] = df_new['sentiment'].map({'positive': 1, 'negative': 0})
df_new['cleaned_text'] = df_new['text'].apply(text_cleansing)

new_texts_pos, new_texts_neg = prepare_dataset(df_new)

In [12]:
# Load and predict
df_new_pos_pred = load_and_predict_topics(
    texts=new_texts_pos,
    model_path="src/model/bertopic/best_positive_model",
    df=df_new,
    sentiment_label=1
)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
df_new_pos_pred['topic_name'].value_counts()

topic_name
-1_menyesuaikan_mal_iringi_disekitar    46
Name: count, dtype: int64

In [27]:
# Load and predict
df_new_pos_pred = load_and_predict_topics(
    texts=new_texts_pos,
    model_path="src/model/bertopic/best_positive_model2",
    df=df_new,
    sentiment_label=1
)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2025-08-14 07:10:28,142 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-08-14 07:10:28,259 - BERTopic - Dimensionality - Completed ✓
2025-08-14 07:10:28,260 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-08-14 07:10:28,263 - BERTopic - Cluster - Completed ✓


In [28]:
df_new_pos_pred['topic_name'].value_counts()

topic_name
-1_teman_fasilitas_makanan_kampus     27
1_teman_dosen_materi_kampus           16
0_fasilitas_kampus_lengkap_makanan     3
Name: count, dtype: int64

In [57]:
save_path = "src/models/bertopic/best_negative_model"
# Convert to absolute path to avoid relative path issues
absolute_save_path = os.path.abspath(save_path)
best = BERTopic.load(absolute_save_path)

In [58]:
# Load and predict
df_new_neg_pred = load_and_predict_topics(
    texts=new_texts_neg,
    model_path=absolute_save_path,
    df=df_new,
    sentiment_label=0
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
2025-08-18 22:06:33,992 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.


IndexError: pop from empty list

**From tuning:**

[['kampus', 'fasilitas', 'wifi', 'dosen', 'toilet'], 

['materi', 'dosen', 'tugas', 'diajarkan', 'memuaskan'], 

['dosen', 'mengajar', 'mengerti', 'materi', 'mengajarkan'], 

['tissue', 'toilet', 'jorok', 'fasilitas', 'nyaman']]

In [56]:
df_new_neg_pred['topic_name'].value_counts()

topic_name
0_kampus_dosen_mahasiswa_fasilitas    15
1_toilet_tisu_fasilitas_kampus         2
2_toilet_wifi_gedung_lambat            2
3_tissue_toilet_jorok_fasilitas        1
Name: count, dtype: int64