## Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import time
import random

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan

import nlp_id
from nlp_id.lemmatizer import Lemmatizer
from nlp_id.tokenizer import Tokenizer
from nlp_id.stopword import StopWord
import stanza
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

if torch.cuda.is_available():
    print("CUDA is available!")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA NOT Available")

CUDA is available!
Number of GPUs: 1
GPU 0: NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [4]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.chdir("C:/BCA/BERTopic/runs")  

In [5]:
df_modeling = pd.read_csv('src/data/df_modeling_BERT.csv')
df_modeling['sentiment'] = df_modeling['sentiment'].map({'positive': 1, 'negative': 0})
df_modeling.head()

Unnamed: 0,cleaned_text,sentiment
0,saya suka materi yang sudah disiapkan oleh pihak kampus karena memudahkan mahasiswa saya juga menyukai program enrichment yang disediakan kampus saya sehingga mahasiswa dapat belajar di ruang lingkup yang lebih luas,1
1,bisa bertemu dengan teman teman baru dan mendapatkan koneksi serta mendapatkan pelajaran yang berguna bagi saya kedepan nya,1
2,saya suka dengan makanan yang ada di dalam kampus saya terutama bakmi efata selain itu disekitar kampus juga banyak makanan enak,1
3,fasilitas kampus alam sutera sangat bagus pelajaran lab diajarkan oleh asisten yang sangat mengerti materi,1
4,saya suka dengan pertemanan nya solid mau saling bantu satu sama lain bagi bagi kisi kisi pas ujian terus saling ngajarin,1


In [6]:
from src.dictionary.exclude_words import exclude_stopwords

stopword = StopWord()
tokenizer = Tokenizer()

# Menambahkan kata untuk stop words
stop_words = stopword.get_stopword()
stop_words.append(exclude_stopwords)

def clean_text(text, negation=True):
  if negation:
      for phrase in ['sangat tidak menyukai', 'tidak menyukai','sangat tidak suka', 'tidak suka', 'kurang suka', 'kurang menyukai', 'ga suka', 'gak suka', 'ga menyukai', 'gak menyukai']:
          text = text.replace(phrase, '')

  for phrase in ['suka', 'sangat suka', 'menyukai', 'sangat menyukai']:
      text = text.replace(phrase, '')

  text = re.sub(r'\s+', ' ', text).strip()
  return text

def text_preprocessing(text):

  # # Tokenisasi menggunakan Tokenizer dari nlp_id
  tokens = tokenizer.tokenize(text)

  filtered_tokens = [word for word in tokens if word not in stop_words]

  # Menghapus spasi yang berlebih
  text = re.sub(r'\s+', ' ', text).strip()

  processed_text = " ".join(filtered_tokens)

  return processed_text

def prepare_dataset(df_modeling):
    df_pos = df_modeling[df_modeling['sentiment'] == 1].copy()
    df_neg = df_modeling[df_modeling['sentiment'] == 0].copy()

    df_pos['cleaned_text'] = df_pos['cleaned_text'].apply(clean_text, negation=False)
    df_neg['cleaned_text'] = df_neg['cleaned_text'].apply(clean_text, negation=True)

    df_pos['processed_text'] = df_pos['cleaned_text'].apply(text_preprocessing)
    df_neg['processed_text'] = df_neg['cleaned_text'].apply(text_preprocessing)

    text_pos = df_pos['processed_text'].astype(str).tolist()
    text_neg = df_neg['processed_text'].astype(str).tolist()

    return text_pos, text_neg

In [7]:
texts_pos, texts_neg = prepare_dataset(df_modeling)

## Predict New Data

In [8]:
def load_and_predict_topics(model_path, df, text_column, embeddings):
    """Load BERTopic model and predict topics for given dataframe"""
    model = BERTopic.load(model_path)
    docs = df[text_column].tolist()

    print("Has embedding model:", hasattr(model, "embedding_model"))
    print("UMAP expects n_features:", getattr(model.umap_model, "n_features_in_", "Unknown"))
    print(dir(model.umap_model))
    topics, probs = model.transform(docs, embeddings)

    df_pred = df.copy()
    df_pred["topic"] = topics
    df_pred["topic_proba"] = probs

    topic_info = model.get_topic_info()
    df_pred = df_pred.merge(
        topic_info[["Topic", "Name"]],
        left_on="topic",
        right_on="Topic",
        how="left"
    ).drop(columns=["Topic"]).rename(columns={"Name": "topic_name"})

    return df_pred

In [9]:
def predict_topics_by_sentiment(df, positive_model_path, negative_model_path, text_column='processed_text'):
    """
    Predict topics for both positive and negative sentiment data using respective models
    
    Args:
        df: DataFrame with sentiment column (1 for positive, 0 for negative)
        positive_model_path: Path to the positive sentiment BERTopic model
        negative_model_path: Path to the negative sentiment BERTopic model
        text_column: Column name containing the text to analyze
    
    Returns:
        DataFrame with topic predictions for both sentiments
    """
    
    # Separate positive and negative data
    df_pos = df[df['sentiment'] == 1].copy().reset_index()  # Preserve index as a column
    df_neg = df[df['sentiment'] == 0].copy().reset_index()  # Preserve index as a column
    
    print(f"Processing {len(df_pos)} positive sentiment texts...")
    print(f"Processing {len(df_neg)} negative sentiment texts...")
    
    results = []
    
    # Process positive sentiment data
    if len(df_pos) > 0:
        print("\n--- Processing Positive Sentiment Data ---")
        embedding_model_pos = SentenceTransformer("indobenchmark/indobert-base-p1")
        embedding_pos = embedding_model_pos.encode(df_pos[text_column].tolist(), show_progress_bar=True)
        df_pos_pred = load_and_predict_topics(
            model_path=positive_model_path,
            df=df_pos,
            text_column=text_column,
            embeddings=embedding_pos
        )
        df_pos_pred['sentiment_type'] = 'positive'
        results.append(df_pos_pred)
        print(f"Positive topics found: {df_pos_pred['topic'].nunique()}")
        print("Topic distribution (positive):")
        print(df_pos_pred['topic_name'].value_counts().head())
    
    # Process negative sentiment data
    if len(df_neg) > 0:
        print("\n--- Processing Negative Sentiment Data ---")
        embedding_model_neg = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        embedding_neg = embedding_model_neg.encode(df_neg[text_column].tolist(), show_progress_bar=True)
        df_neg_pred = load_and_predict_topics(
            model_path=negative_model_path,
            df=df_neg,
            text_column=text_column,
            embeddings=embedding_neg
        )
        df_neg_pred['sentiment_type'] = 'negative'
        results.append(df_neg_pred)
        print(f"Negative topics found: {df_neg_pred['topic'].nunique()}")
        print("Topic distribution (negative):")
        print(df_neg_pred['topic_name'].value_counts().head())
    
    # Combine results
    if results:
        df_combined = pd.concat(results, ignore_index=False)  # Preserve index
        # Sort by the preserved 'index' column
        df_combined = df_combined.sort_values('index')
        return df_combined
    else:
        print("No data to process!")
        return pd.DataFrame()

In [10]:
df_new = pd.read_csv(
    'src/data/synthetic_100_reviews.csv'
)
df_new['sentiment'] = df_new['sentiment'].map({'positive': 1, 'negative': 0})
new_texts_pos, new_texts_neg = prepare_dataset(df_new)

In [11]:
# Apply preprocessing to the dataframe
df_new_processed = df_new.copy()

# Apply same preprocessing logic as in prepare_dataset
df_pos_temp = df_new_processed[df_new_processed['sentiment'] == 1].copy()
df_neg_temp = df_new_processed[df_new_processed['sentiment'] == 0].copy()

if len(df_pos_temp) > 0:
    df_pos_temp['cleaned_text'] = df_pos_temp['cleaned_text'].apply(clean_text, negation=False)
    df_pos_temp['processed_text'] = df_pos_temp['cleaned_text'].apply(text_preprocessing)

if len(df_neg_temp) > 0:
    df_neg_temp['cleaned_text'] = df_neg_temp['cleaned_text'].apply(clean_text, negation=True)
    df_neg_temp['processed_text'] = df_neg_temp['cleaned_text'].apply(text_preprocessing)

# Combine back
df_new_processed = pd.concat([df_pos_temp, df_neg_temp], ignore_index=True) if len(df_pos_temp) > 0 and len(df_neg_temp) > 0 else (df_pos_temp if len(df_pos_temp) > 0 else df_neg_temp)

In [12]:
# Predict topics using respective models
print("\n" + "="*50)
print("STARTING TOPIC PREDICTION")
print("="*50)

df_predictions = predict_topics_by_sentiment(
    df=df_new_processed,
    positive_model_path="src/model/bertopic/best_positive_model",
    negative_model_path="src/model/bertopic/best_negative_model",
    text_column='processed_text'
)

print("\n" + "="*50)
print("PREDICTION RESULTS SUMMARY")
print("="*50)

if len(df_predictions) > 0:
    print(f"Total predictions: {len(df_predictions)}")
    print(f"Positive sentiment: {len(df_predictions[df_predictions['sentiment'] == 1])}")
    print(f"Negative sentiment: {len(df_predictions[df_predictions['sentiment'] == 0])}")
    
    print("\nOverall topic distribution:")
    topic_dist = df_predictions.groupby(['sentiment_type', 'topic_name']).size().reset_index(name='count')
    print(topic_dist.sort_values('count', ascending=False))
    
    print("\nSample predictions:")
    print(df_predictions[['cleaned_text', 'sentiment_type', 'topic', 'topic_name', 'topic_proba']].head(10))
    
    # Save results
    output_path = 'results/topic_predictions.csv'
    df_predictions.to_csv(output_path, index=False)
    print(f"\nResults saved to: {output_path}")
    
    # Additional analysis
    print("\n" + "="*30)
    print("DETAILED ANALYSIS")
    print("="*30)
    
    # Top topics for each sentiment
    print("\nTop 5 topics for POSITIVE sentiment:")
    pos_topics = df_predictions[df_predictions['sentiment'] == 1]['topic_name'].value_counts().head()
    for topic, count in pos_topics.items():
        print(f"  {topic}: {count} documents")
    
    print("\nTop 5 topics for NEGATIVE sentiment:")
    neg_topics = df_predictions[df_predictions['sentiment'] == 0]['topic_name'].value_counts().head()
    for topic, count in neg_topics.items():
        print(f"  {topic}: {count} documents")
    
    # Average topic probability by sentiment
    print(f"\nAverage topic confidence:")
    avg_proba = df_predictions.groupby('sentiment_type')['topic_proba'].mean()
    for sentiment, prob in avg_proba.items():
        print(f"  {sentiment}: {prob:.3f}")
        
else:
    print("No predictions generated. Please check your data and model paths.")


STARTING TOPIC PREDICTION
Processing 46 positive sentiment texts...
Processing 54 negative sentiment texts...

--- Processing Positive Sentiment Data ---


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]


Has embedding model: True
UMAP expects n_features: Unknown
['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__sklearn_tags__', '__str__', '__sub__', '__subclasshook__', '__weakref__', '_a', '_b', '_build_request_for_signature', '_check_custom_metric', '_check_feature_names', '_check_n_features', '_densmap_kwds', '_disconnection_distance', '_doc_link_module', '_doc_link_template', '_doc_link_url_param_generator', '_fit_embed_data', '_get_default_requests', '_get_doc_link', '_get_metadata_request', '_get_param_names', '_get_tags', '_initial_alpha', '_input_distance_func', '_input_hash', '_inverse_distance_func', '_metric_kwds', '_more_tags', '_n_features_out', '_n_nei

Batches: 100%|██████████| 2/2 [00:00<00:00,  7.23it/s]


Has embedding model: True
UMAP expects n_features: Unknown
['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__sklearn_tags__', '__str__', '__sub__', '__subclasshook__', '__weakref__', '_a', '_b', '_build_request_for_signature', '_check_custom_metric', '_check_feature_names', '_check_n_features', '_densmap_kwds', '_disconnection_distance', '_doc_link_module', '_doc_link_template', '_doc_link_url_param_generator', '_fit_embed_data', '_get_default_requests', '_get_doc_link', '_get_metadata_request', '_get_param_names', '_get_tags', '_initial_alpha', '_input_distance_func', '_input_hash', '_inverse_distance_func', '_metric_kwds', '_more_tags', '_n_features_out', '_n_nei