# **Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import time
import random

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap.umap_ import UMAP
import hdbscan
import torch

import nlp_id
from nlp_id.tokenizer import Tokenizer
from nlp_id.stopword import StopWord

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

In [2]:
import os

# Move the working directory one level up so the relative paths used below
# ('src/...', 'results/...') resolve from there.
# The target is resolved to an absolute path exactly once per kernel session:
# a bare os.chdir("..") would climb one directory higher every time this cell
# is re-run, silently breaking every relative path that follows.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.pardir)
os.chdir(PROJECT_ROOT)

# Must be set before the tokenizers library is first used.
# NOTE(review): presumably here to avoid the HuggingFace tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
SEED = 42

# Seed, in order: Python's stdlib RNG, NumPy's global RNG, PyTorch's default
# generator, and PyTorch's generators for all CUDA devices.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)

# **Prepare Dataset**

In [None]:
# Path is relative to the working directory set in the earlier os.chdir cell.
_modeling_csv_path = 'src/data/df_modeling_BERT.csv'

# NOTE(review): the preview shows an 'Unnamed: 0' column, which usually means
# the CSV was written with its index; left as-is because prepare_dataset's
# expectations about the columns are not visible here.
df_modeling = pd.read_csv(_modeling_csv_path)
df_modeling.head()

Unnamed: 0,cleaned_text,sentiment
0,saya suka materi yang sudah disiapkan oleh pihak kampus karena memudahkan mahasiswa saya juga menyukai program enrichment yang disediakan kampus saya sehingga mahasiswa dapat belajar di ruang lingkup yang lebih luas,positive
1,bisa bertemu dengan teman teman baru dan mendapatkan koneksi serta mendapatkan pelajaran yang berguna bagi saya kedepan nya,positive
2,saya suka dengan makanan yang ada di dalam kampus saya terutama bakmi efata selain itu disekitar kampus juga banyak makanan enak,positive
3,fasilitas kampus alam sutera sangat bagus pelajaran lab diajarkan oleh asisten yang sangat mengerti materi,positive
4,saya suka dengan pertemanan nya solid mau saling bantu satu sama lain bagi bagi kisi kisi pas ujian terus saling ngajarin,positive


In [8]:
# NOTE(review): project-local import; consider moving it to the import cell at
# the top so a fresh "Restart & Run All" surfaces missing modules early.
from utils.topic_prediction import prepare_dataset
# Produces the two text collections consumed by run_topic_modeling below.
# Presumably split by the `sentiment` column — confirm in utils/topic_prediction.py.
texts_pos, texts_neg = prepare_dataset(df_modeling)

# **Multiple Tuning**

In [11]:
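# Full hyper-parameter search grid (disabled). The captured outputs further
# down (e.g. n_neighbors=7, min_cluster_size=10) come from this grid, not from
# the reduced single-configuration grid in the next cell. Uncomment this cell
# and skip the next one to reproduce the full search.
# embedding_models = [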
#     "LazarusNLP/all-indo-e5-small-v4",
#     "paraphrase-multilingual-MiniLM-L12-v2",
#     "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
# ]

# umap_params = [
#     {"n_neighbors": 7, "min_dist": 0.0, "metric": "cosine"},
#     {"n_neighbors": 5, "min_dist": 0.1, "metric": "euclidean"},
#     {"n_neighbors": 5, "min_dist": 0.0, "metric": "cosine"},
#     {"n_neighbors": 7, "min_dist": 0.1, "metric": "euclidean"},
#     {"n_neighbors": 10, "min_dist": 0.1, "metric": "cosine"},
#     {"n_neighbors": 15, "min_dist": 0.3, "metric": "euclidean"}
# ]

# hdbscan_params = [
#     {"min_cluster_size": 10, "min_samples": 3, "cluster_selection_epsilon": 0.0},
#     {"min_cluster_size": 5, "min_samples": 3, "cluster_selection_epsilon": 0.5},
#     {"min_cluster_size": 5, "min_samples": 3, "cluster_selection_epsilon": 0.3},
#     {"min_cluster_size": 10, "min_samples": 3, "cluster_selection_epsilon": 0.3},
#     {"min_cluster_size": 15, "min_samples": 5, "cluster_selection_epsilon": 0.0},
#     {"min_cluster_size": 10, "min_samples": 5, "cluster_selection_epsilon": 0.5},
# ]

# bertopic_params = [
#     {"top_n_words": 5, "nr_topics": 3},
#     {"top_n_words": 5, "nr_topics": 4},
#     {"top_n_words": 5, "nr_topics": 5},
#     {"top_n_words": 5, "nr_topics": 6},
#     {"top_n_words": 10, "nr_topics": 3},
#     {"top_n_words": 10, "nr_topics": 4},
#     {"top_n_words": 10, "nr_topics": 5},
#     {"top_n_words": 10, "nr_topics": 6},
# ]

In [10]:
# Reduced search grid: one candidate per component. Each list is iterated by
# run_topic_modeling, so adding entries widens the search.

# Sentence-embedding checkpoints (best Indo-focused choice).
embedding_models = ["LazarusNLP/all-indo-e5-small-v4"]

# Keyword arguments forwarded to UMAP (random_state is added by the caller).
umap_params = [
    dict(n_neighbors=5, min_dist=0.0, metric="cosine"),
]

# Keyword arguments forwarded to hdbscan.HDBSCAN.
hdbscan_params = [
    dict(min_cluster_size=5, min_samples=3, cluster_selection_epsilon=0.0),
]

# BERTopic constructor settings read by key in run_topic_modeling.
bertopic_params = [
    dict(top_n_words=5, nr_topics=4),
]

In [16]:
from utils.topic_evaluation import calculate_coherence_score, calculate_irbo, evaluate_topics

def _build_topic_model(embedding_model, umap_kwargs, hdbscan_kwargs, bertopic_kwargs, random_state=42):
    """Assemble an unfitted BERTopic model for one hyper-parameter combination.

    Fresh UMAP and HDBSCAN instances are created on every call so that no
    fitted state is shared between two BERTopic models.

    Args:
        embedding_model: Loaded SentenceTransformer handed to BERTopic.
        umap_kwargs: Keyword arguments for UMAP, or None to let BERTopic use
            its default. Any "random_state" entry is overridden.
        hdbscan_kwargs: Keyword arguments for hdbscan.HDBSCAN, or None to let
            BERTopic use its default.
        bertopic_kwargs: Mapping with the keys "top_n_words" and "nr_topics".
        random_state: Seed forced onto UMAP.

    Returns:
        An unfitted BERTopic instance.
    """
    umap_model = None if umap_kwargs is None else UMAP(**{**umap_kwargs, "random_state": random_state})
    hdbscan_model = None if hdbscan_kwargs is None else hdbscan.HDBSCAN(**hdbscan_kwargs)
    return BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        top_n_words=bertopic_kwargs["top_n_words"],
        nr_topics=bertopic_kwargs["nr_topics"],
        verbose=False,
    )


def run_topic_modeling(label, top_k=3):
    """Grid-search BERTopic configurations for one sentiment and save the best.

    Every combination of the module-level `embedding_models`, `umap_params`,
    `hdbscan_params` and `bertopic_params` is fitted on the texts of the
    requested sentiment and scored with `evaluate_topics`. The `top_k`
    combinations with the highest `c_v` score are then refitted and written,
    together with their embedding model, under
    `src/models/bertopic_top{top_k}/{label}/`.

    Args:
        label: "positive" (uses `texts_pos`) or "negative" (uses `texts_neg`).
        top_k: Number of best-scoring combinations to refit and save.

    Returns:
        A list with one dict per evaluated combination, holding the
        parameters, the topic word lists, the coherence scores returned by
        `evaluate_topics`, the IRBO score and the fit time in seconds. The
        list is empty when the grid contains no combination.

    Raises:
        ValueError: If `label` is neither "positive" nor "negative".
    """
    # Fail fast: a mistyped label used to fall through to the negative texts.
    if label not in ("positive", "negative"):
        raise ValueError(f"label must be 'positive' or 'negative', got {label!r}")

    all_start_time = time.time()

    texts = texts_pos if label == "positive" else texts_neg
    results = []

    for emb in embedding_models:
        # Loaded once per checkpoint and shared by every combination below.
        embedding_model = SentenceTransformer(emb)

        for u in umap_params:
            for h in hdbscan_params:
                for b in bertopic_params:
                    model_start_time = time.time()

                    topic_model = _build_topic_model(embedding_model, u, h, b)
                    topic_model.fit(texts)
                    # coherence_scores is unpacked into the result row below, so it
                    # must be a mapping; it is expected to contain the key 'c_v'.
                    coherence_scores, irbo_score = evaluate_topics(texts, topic_model)

                    # Drop the outlier topic (-1) and topics without words.
                    topics_dict = topic_model.get_topics()
                    topics_filtered = {k: v for k, v in topics_dict.items() if k != -1 and v is not None}

                    model_time = round(time.time() - model_start_time, 2)

                    results.append({
                        "Label": label,
                        "Embedding Model": emb,
                        "UMAP": u,
                        "HDBSCAN": h,
                        "BERTopic Params": b,
                        "Num Topics": len(topics_filtered),
                        "Topics List": [[word for word, _ in words] for words in topics_filtered.values()],
                        **coherence_scores,
                        "IRBO": irbo_score,
                        "Model Time (s)": model_time
                    })

                    print(f"[✓] Finished model - Embedding: {emb}, UMAP: {u}, HDBSCAN: {h}, BERTopic Params: {b} | Time: {model_time}s")

    total_time = round(time.time() - all_start_time, 2)
    print(f"\nAll models completed in {total_time} seconds.")

    # An empty grid yields no 'c_v' column; return instead of failing in sort_values.
    if not results:
        print("No parameter combinations were evaluated; nothing to rank or save.")
        return results

    results_topk = pd.DataFrame(results).sort_values(by='c_v', ascending=False).head(top_k)
    top_configs = results_topk[["Embedding Model", "UMAP", "HDBSCAN", "BERTopic Params"]]

    # The winners are refitted rather than kept from the search so that only
    # one fitted model has to live in memory at a time during the grid pass.
    # Embedding models are cached by name so a checkpoint shared by several
    # winners is loaded only once.
    loaded_embedders = {}
    for rank, (emb, u, h, b) in enumerate(top_configs.itertuples(index=False, name=None), start=1):
        if emb not in loaded_embedders:
            loaded_embedders[emb] = SentenceTransformer(emb)
        embedding_model = loaded_embedders[emb]

        bertopic_model = _build_topic_model(embedding_model, u, h, b)
        bertopic_model.fit(texts)

        model_save_path = f'src/models/bertopic_top{top_k}/{label}/best_{label}_model_top{rank}'
        embedding_save_path = f'src/models/bertopic_top{top_k}/{label}/embeddings_top{rank}'

        # The embedding model is saved separately, next to the BERTopic model.
        embedding_model.save(embedding_save_path)
        bertopic_model.save(
            model_save_path,
            serialization="safetensors",
        )
        print(f"Model saved to {model_save_path}, embedding saved to {embedding_save_path}")

    return results

### **Positive**

In [13]:
# Run the grid search on the positive texts. With the default top_k=3, the
# three configurations with the highest c_v are refitted and saved to disk.
positive_results = run_topic_modeling("positive")

Coherence (c_v): 0.577
Coherence (u_mass): -2.922
Coherence (c_uci): 0.053
Coherence (c_npmi): 0.129
IRBO Topic Diversity: 0.843
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 3} | Time: 31.73s
Coherence (c_v): 0.564
Coherence (u_mass): -2.453
Coherence (c_uci): 0.087
Coherence (c_npmi): 0.1
IRBO Topic Diversity: 0.874
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 4} | Time: 11.88s
Coherence (c_v): 0.574
Coherence (u_mass): -2.815
Coherence (c_uci): -0.254
Coherence (c_npmi): 0.094
IRBO Topic Diversity: 0.954
[✓] Finished model - Embedding: LazarusNLP/all-ind

In [None]:
# One row per evaluated configuration.
df_pos = pd.DataFrame(positive_results)

# Store the nested topic word lists as their string representation.
df_pos["Topics List"] = df_pos["Topics List"].map(str)

save_path = 'results/bertopic_top3/'
os.makedirs(save_path, exist_ok=True)

# Rank by c_v coherence (best first) once, then write the same table to both formats.
df_pos_ranked = df_pos.sort_values(by="c_v", ascending=False)
df_pos_ranked.to_excel(os.path.join(save_path, 'positive_topic_modeling_results.xlsx'), index=False)
df_pos_ranked.to_csv(os.path.join(save_path, 'positive_topic_modeling_results.csv'), index=False)

### **Negative**

In [17]:
# Run the grid search on the negative texts. With the default top_k=3, the
# three configurations with the highest c_v are refitted and saved to disk.
negative_results = run_topic_modeling("negative")

Coherence (c_v): 0.629
Coherence (u_mass): -1.69
Coherence (c_uci): 0.412
Coherence (c_npmi): 0.132
IRBO Topic Diversity: 1.0
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 3} | Time: 15.38s
Coherence (c_v): 0.624
Coherence (u_mass): -1.92
Coherence (c_uci): 0.476
Coherence (c_npmi): 0.132
IRBO Topic Diversity: 0.914
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5-small-v4, UMAP: {'n_neighbors': 7, 'min_dist': 0.0, 'metric': 'cosine'}, HDBSCAN: {'min_cluster_size': 10, 'min_samples': 3, 'cluster_selection_epsilon': 0.0}, BERTopic Params: {'top_n_words': 5, 'nr_topics': 4} | Time: 13.1s
Coherence (c_v): 0.674
Coherence (u_mass): -1.802
Coherence (c_uci): 0.547
Coherence (c_npmi): 0.154
IRBO Topic Diversity: 0.922
[✓] Finished model - Embedding: LazarusNLP/all-indo-e5

In [None]:
# One row per evaluated configuration.
df_neg = pd.DataFrame(negative_results)

# Store the nested topic word lists as their string representation.
df_neg["Topics List"] = df_neg["Topics List"].map(str)

save_path = 'results/bertopic_top3/'
os.makedirs(save_path, exist_ok=True)

# Rank by c_v coherence (best first) once, then write the same table to both formats.
df_neg_ranked = df_neg.sort_values(by="c_v", ascending=False)
df_neg_ranked.to_excel(os.path.join(save_path, 'negative_topic_modeling_results.xlsx'), index=False)
df_neg_ranked.to_csv(os.path.join(save_path, 'negative_topic_modeling_results.csv'), index=False)

## **Summary**

The top-3 models for each sentiment will subsequently be evaluated by human judgement, which will determine the final model chosen for each sentiment.