# **Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import time
import random
import ast

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
import torch

import nlp_id
from nlp_id.tokenizer import Tokenizer
from nlp_id.stopword import StopWord

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.chdir("..")

In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [None]:
MODEL_PATH = 'src/models/bertopic_top3'
SAVE_RESULT_PATH = 'results/bertopic_top3/inference_results'

# **Functions**

In [None]:
from utils.topic_prediction import *

In [20]:
def load_model(label, topk):
    path = os.path.join(MODEL_PATH, f'{label}/best_{label}_model_top{topk}')
    model_path = os.path.abspath(path)

    path = os.path.join(MODEL_PATH, label, f'embeddings_top{topk}')
    embedding = SentenceTransformer(path)

    model = BERTopic.load(model_path, embedding_model=embedding)
    return model

In [41]:
def run_topk_topic_modeling(texts, df, label_name, topk): 
    if label_name == 'positive': 
        sentiment = 1
    else: 
        sentiment = 0
        
    model = load_model(label_name, topk)
    
    # Predict topics
    df_pred = predict_topics(texts, df, model, sentiment)

    print(df_pred['topic_name'].value_counts())

    save_path = os.path.join(SAVE_RESULT_PATH, label_name, f"{label_name}_top{topk}_prediction_result.csv")
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    
    df_pred.to_csv(save_path, index=False)
    print(f"Prediction saved to {save_path}")
    
    return df_pred


# **Prepare Dataset**

In [9]:
data_path = 'src/data/df_modeling_BERT.csv'
df = pd.read_csv(data_path)

In [10]:
texts_pos, texts_neg = prepare_dataset(df)

# **Experiment**

## **Positive**

### **Model Top 1**

In [42]:
df_pred_top1 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="positive", 
    topk=1    
)

topic_name
0_teman_dosen_kampus_materi              219
1_bareng_belajar_teman_ujian              30
-1_event_teman_organisasi_kampus          21
3_kegiatan_program_acara_seru             18
2_program_mahasiswa_magang_enrichment     13
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/positive/positive_top1_prediction_result.csv


### **Model Top 2**

In [12]:
df_pred_top2 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="positive", 
    topk=2
)

topic_name
0_teman_kampus_dosen_materi              202
2_bareng_belajar_teman_ujian              28
1_bertemu_teman_orang_koneksi             26
-1_event_teman_organisasi_kampus          17
4_kegiatan_program_acara_seru             16
3_program_mahasiswa_magang_enrichment     12
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/positive/positive_top2_prediction_result.csv


### **Model Top 3**

In [13]:
df_pred_top3 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="positive", 
    topk=3)

topic_name
0_teman_kampus_dosen_materi              240
1_bareng_teman_ujian_bagus                24
3_kegiatan_program_acara_seru             16
2_program_mahasiswa_magang_enrichment     13
-1_artikel_event_organisasi_menulis        8
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/positive/positive_top3_prediction_result.csv


### **Model Top 4**

In [14]:
df_pred_top4 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="positive", 
    topk=4
)

topic_name
0_teman_dosen_kampus_materi          228
2_belajar_bareng_teman_ujian          30
-1_event_teman_organisasi_kampus      22
1_program_mahasiswa_magang_kampus     21
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/positive/positive_top4_prediction_result.csv


### **Model Top 5**

In [16]:
df_pred_top5 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="positive", 
    topk=5
)

topic_name
0_teman_kampus_dosen_materi              233
-1_kegiatan_ukm_materi_mahasiswa          48
1_organisasi_soft_kegiatan_pengalaman     20
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/positive/positive_top5_prediction_result.csv


## **Negative**

### **Model Top 1**

In [21]:
df_pred_top1 = run_topk_topic_modeling(
    texts=texts_neg, 
    df=df, 
    label_name="negative", 
    topk=1
)

topic_name
0_kampus_wifi_fasilitas_toilet       140
1_dosen_materi_mahasiswa_mengajar     75
-1_fasilitas_kampus_dosen_kuliah      61
2_kuliah_mata_pagi_jam                13
3_tugas_ubah_menumpuk_bersamaan       12
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/negative/negative_top1_prediction_result.csv


### **Model Top 2**

In [22]:
df_pred_top2 = run_topk_topic_modeling(
    texts=texts_neg, 
    df=df, 
    label_name="negative", 
    topk=2
)

topic_name
0_kampus_wifi_fasilitas_toilet       122
1_dosen_materi_mahasiswa_mengajar     75
-1_fasilitas_kampus_dosen_kuliah      51
2_toilet_tissue_fasilitas_tangan      29
3_kuliah_mata_pagi_jam                13
4_tugas_ubah_menumpuk_bersamaan       11
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/negative/negative_top2_prediction_result.csv


### **Model Top 3**

In [23]:
df_pred_top3 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="negative", 
    topk=3
)

topic_name
1_dosen_materi_mahasiswa_mengajar    198
-1_hal_orang_kuliah_panas             44
0_kampus_wifi_fasilitas_toilet        43
2_kuliah_jam_mata_pagi                15
3_toilet_tissue_tangan_fasilitas       1
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/negative/negative_top3_prediction_result.csv


### **Model Top 4**

In [24]:
df_pred_top4 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="negative", 
    topk=4
)

topic_name
1_dosen_materi_mahasiswa_hal          196
0_kampus_fasilitas_wifi_toilet         77
2_jadwal_kuliah_mata_pagi              18
-1_kristiani_orang_arsitek_lulusan     10
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/negative/negative_top4_prediction_result.csv


### **Model Top 5**

In [25]:
df_pred_top5 = run_topk_topic_modeling(
    texts=texts_pos, 
    df=df, 
    label_name="negative", 
    topk=5
)

topic_name
1_dosen_materi_mahasiswa_hal          196
0_kampus_fasilitas_wifi_toilet         77
2_jadwal_kuliah_mata_pagi              18
-1_kristiani_orang_arsitek_lulusan     10
Name: count, dtype: int64
Prediction saved to results/bertopic_top3/inference_results/negative/negative_top5_prediction_result.csv
