# **Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import time
import random

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
import torch

import nlp_id
from nlp_id.tokenizer import Tokenizer
from nlp_id.stopword import StopWord

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

import warnings
# Silence all warnings for this session and remove pandas' limits on
# column width and column count when displaying frames.
warnings.filterwarnings('ignore')
for _display_option in ("display.max_colwidth", "display.max_columns"):
    pd.set_option(_display_option, None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
# Set the tokenizers parallelism flag to "false" for this session.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Project root that the relative paths in this notebook are resolved against.
# Override it per machine with the THESIS_PROJECT_ROOT environment variable
# (e.g. "/home/jovyan/serpens_testing/runs") instead of editing this cell;
# when the variable is unset the original local path is used.
PROJECT_ROOT = os.environ.get(
    "THESIS_PROJECT_ROOT",
    "/Users/alicia.siahaya/Documents/Alice Tiket 2025/Thesis_Modeling/Thesis/",
)
os.chdir(PROJECT_ROOT)

In [3]:
# Apply one fixed seed to Python's `random`, NumPy and PyTorch
# (default generator plus all CUDA devices).
seed = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

In [14]:
def load_model(model_path: str, embedding_name: str, label: str, num: int) -> "BERTopic":
    """Load a saved BERTopic model together with its sentence embedding model.

    The model is read from ``<model_path>/<label>/best_<label>_model_top<num>``
    (made absolute before loading).

    Args:
        model_path: Directory that contains one sub-directory per label.
        embedding_name: Name or path handed to ``SentenceTransformer``.
        label: Label sub-directory, also embedded in the saved model's name
            (e.g. ``"positive"``).
        num: Suffix of the saved model (the ``N`` in ``..._topN``).

    Returns:
        The object returned by ``BERTopic.load``.

    Raises:
        FileNotFoundError: If nothing exists at the resolved model location.
    """
    # Keep the `model_path` argument untouched; the resolved location gets
    # its own name.
    saved_model = os.path.abspath(
        os.path.join(model_path, label, f'best_{label}_model_top{num}')
    )

    # Fail fast with a clear message, before the embedding model is
    # instantiated, when the saved model is missing.
    if not os.path.exists(saved_model):
        raise FileNotFoundError(f'No saved BERTopic model found at {saved_model!r}')

    embedding = SentenceTransformer(embedding_name)
    return BERTopic.load(saved_model, embedding_model=embedding)

# **Load Data**

In [4]:
# Input locations, given relative to the current working directory.
model_path = "src/models/bertopic_top5/"
data_path = "src/data/df_modeling_BERT.csv"

In [5]:
from utils.topic_prediction import prepare_dataset

# Read the modeling CSV, then hand the frame to `prepare_dataset`, whose
# result is unpacked into exactly two values.
df = pd.read_csv(filepath_or_buffer=data_path)
texts_pos, texts_neg = prepare_dataset(df)

In [17]:
# Path is relative to the working directory, like the other data/model
# paths in this notebook, rather than a user-specific absolute path.
pd.read_excel('results/bertopic_results/negative_topic_modeling_results.xlsx')

Unnamed: 0,Label,Embedding Model,UMAP,HDBSCAN,BERTopic Params,Num Topics,Topics List,c_v,u_mass,c_uci,c_npmi,IRBO
0,negative,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}","{'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}","{'top_n_words': 5, 'nr_topics': 3}",2,"[['dosen', 'mahasiswa', 'kampus', 'kuliah', 'materi'], ['wifi', 'toilet', 'kampus', 'fasilitas', 'tisu']]",0.580,-1.520,0.303,0.102,0.843
1,negative,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}","{'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}","{'top_n_words': 5, 'nr_topics': 4}",3,"[['dosen', 'mahasiswa', 'kuliah', 'materi', 'mata'], ['wifi', 'kampus', 'fasilitas', 'parkir', 'kelas'], ['toilet', 'tisu', 'fasilitas', 'kampus', 'wifi']]",0.594,-1.579,0.323,0.115,0.904
2,negative,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}","{'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}","{'top_n_words': 5, 'nr_topics': 5}",4,"[['dosen', 'kuliah', 'kampus', 'mahasiswa', 'hal'], ['toilet', 'tisu', 'fasilitas', 'kampus', 'bersih'], ['wifi', 'kampus', 'koneksi', 'susah', 'absensi'], ['dosen', 'ujian', 'pressure', 'materi', 'tugas']]",0.569,-3.956,-2.688,-0.007,0.868
3,negative,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}","{'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}","{'top_n_words': 5, 'nr_topics': 6}",5,"[['kampus', 'mahasiswa', 'kelas', 'kuliah', 'fasilitas'], ['toilet', 'tisu', 'fasilitas', 'kampus', 'bersih'], ['dosen', 'materi', 'belajar', 'tugas', 'mengajar'], ['wifi', 'kampus', 'koneksi', 'susah', 'absensi'], ['jadwal', 'deadline', 'pagi', 'tugas', 'dosen']]",0.548,-3.450,-1.507,0.037,0.939
4,negative,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}","{'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}","{'top_n_words': 10, 'nr_topics': 3}",2,"[['dosen', 'mahasiswa', 'kampus', 'kuliah', 'hal', 'materi', 'mata', 'tugas', 'kelas', 'perkuliahan'], ['wifi', 'toilet', 'kampus', 'fasilitas', 'tisu', 'bersih', 'terkoneksi', 'lambat', 'susah', 'kelas']]",0.476,-2.485,-0.539,0.039,0.847
...,...,...,...,...,...,...,...,...,...,...,...,...
571,negative,indobenchmark/indobert-base-p1,"{'n_neighbors': 15, 'min_dist': 0.3, 'metric': 'euclidean'}","{'min_cluster_size': 10, 'min_samples': 5, 'cluster_selection_epsilon': 0.5}","{'top_n_words': 5, 'nr_topics': 6}",2,"[['wifi', 'internet', 'kampus', 'terkoneksi', 'susah'], ['kampus', 'dosen', 'fasilitas', 'toilet', 'mahasiswa']]",0.559,-1.702,0.220,0.063,0.843
572,negative,indobenchmark/indobert-base-p1,"{'n_neighbors': 15, 'min_dist': 0.3, 'metric': 'euclidean'}","{'min_cluster_size': 10, 'min_samples': 5, 'cluster_selection_epsilon': 0.5}","{'top_n_words': 10, 'nr_topics': 3}",2,"[['wifi', 'internet', 'kampus', 'terkoneksi', 'susah', 'lambat', 'mengakses', 'absensi', 'kuota', 'daerah'], ['kampus', 'dosen', 'fasilitas', 'toilet', 'mahasiswa', 'kelas', 'kuliah', 'wifi', 'hal', 'tugas']]",0.309,-5.375,-2.368,-0.036,0.823
573,negative,indobenchmark/indobert-base-p1,"{'n_neighbors': 15, 'min_dist': 0.3, 'metric': 'euclidean'}","{'min_cluster_size': 10, 'min_samples': 5, 'cluster_selection_epsilon': 0.5}","{'top_n_words': 10, 'nr_topics': 4}",2,"[['kampus', 'dosen', 'fasilitas', 'toilet', 'mahasiswa', 'kelas', 'kuliah', 'wifi', 'hal', 'tugas'], ['wifi', 'internet', 'kampus', 'terkoneksi', 'lambat', 'mengakses', 'susah', 'absensi', 'daerah', 'sulit']]",0.320,-4.801,-2.148,-0.037,0.823
574,negative,indobenchmark/indobert-base-p1,"{'n_neighbors': 15, 'min_dist': 0.3, 'metric': 'euclidean'}","{'min_cluster_size': 10, 'min_samples': 5, 'cluster_selection_epsilon': 0.5}","{'top_n_words': 10, 'nr_topics': 5}",3,"[['wifi', 'internet', 'kampus', 'susah', 'terkoneksi', 'mengakses', 'kuota', 'daerah', 'lambat', 'stabil'], ['dosen', 'kuliah', 'materi', 'mata', 'mengajar', 'jadwal', 'mahasiswa', 'kelas', 'tugas', 'belajar'], ['kampus', 'fasilitas', 'toilet', 'wifi', 'mahasiswa', 'nyaman', 'tisu', 'hal', 'kelas', 'gedung']]",0.408,-4.112,-1.992,-0.010,0.893


In [10]:
# Build the location from the configured `model_path` instead of repeating
# the model directory inline, then load the saved model from its absolute path.
path = os.path.join(model_path, 'positive/best_positive_model_top1')
save_path = os.path.abspath(path)
BERTopic.load(save_path)



<bertopic._bertopic.BERTopic at 0x7f95b3164a60>

In [16]:
# NOTE(review): the recorded run of this cell stops with a ValueError asking
# for torch >= 2.6 (CVE-2025-32434); upgrade torch in the kernel environment
# before re-running.
embedding_name = 'indobenchmark/indobert-base-p1'
model_top1 = load_model(
    model_path=model_path,
    embedding_name=embedding_name,
    label="positive",
    num=1,
)

No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434