# 1. Install and Import Libraries

In [None]:
# Run this first, then restart runtime
!pip install bertopic

In [None]:
!pip install gensim

In [None]:
!pip install Sastrawi

In [None]:
import os
import json
import math
import random
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

from umap import UMAP
from hdbscan import HDBSCAN

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

from wordcloud import WordCloud

import torch
from google.colab import drive, files

warnings.filterwarnings("ignore", category=DeprecationWarning)

# 2. Set Up Environment

## 2.1. Set Seed for Reproducibility

In [None]:
# Single seed shared by the RNG seeding cell below and, through the default
# argument of build_bertopic_from_params, by UMAP's random_state.
SEED = 42

In [None]:
# Seed the global RNGs of Python, NumPy and PyTorch with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it
# here presumably only reaches child processes, not this running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed every CUDA device as well when a GPU is available.
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(SEED)

## 2.2. Set GPU

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

## 2.3. Manage Google Drive

### 2.3.1. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; every path configured below lives under it.
drive.mount('/content/drive')

### 2.3.2. Set Folder & File Path

In [None]:
# Every artefact lives under one Drive folder. Deriving all paths from a single
# root keeps the spelling of the mount point consistent (previously the inputs
# used "My Drive" while the outputs used "MyDrive").
DRIVE_ROOT = "/content/drive/MyDrive/Thesis_NS"

# Dataset & Stopwords File
EMOJI_WORDS_FILE_PATH = f"{DRIVE_ROOT}/Dataset/emoji_words.csv"
NO_EMOJI_FILE_PATH = f"{DRIVE_ROOT}/Dataset/no_emoji.csv"
STOPWORDS_PATH = f"{DRIVE_ROOT}/tala-stopwords-indonesia.txt"

# Results
SAVE_ROOT = f"{DRIVE_ROOT}/Models/TM_Check"          # output root passed to pipeline_per_sent
EMBED_SAVE_ROOT = f"{DRIVE_ROOT}/Models/TM_Tuned_2"  # pre-computed embeddings are loaded from here

## 2.4. Load Dataset

In [None]:
# Load both CSV files (emoji_words.csv and no_emoji.csv) into DataFrames.
df_emoji_words = pd.read_csv(EMOJI_WORDS_FILE_PATH)
df_no_emoji = pd.read_csv(NO_EMOJI_FILE_PATH)

In [None]:
# Drop columns that contain no values at all.
df_emoji_words = df_emoji_words.dropna(axis=1, how='all')
df_no_emoji = df_no_emoji.dropna(axis=1, how='all')

In [None]:
# Drop rows in which every column is empty.
df_emoji_words = df_emoji_words.dropna(axis=0, how='all')
df_no_emoji = df_no_emoji.dropna(axis=0, how='all')

In [None]:
# Renumber rows from 0 after the drops so both frames carry a clean positional index.
df_emoji_words = df_emoji_words.reset_index(drop=True)
df_no_emoji = df_no_emoji.reset_index(drop=True)

In [None]:
# Sanity check: print the row count and the last two rows of each frame.
print(f"Emoji Dataset Rows: {len(df_emoji_words)}")
print(f"No Emoji Dataset Rows: {len(df_no_emoji)}")

print("\n--- Last 2 rows of Emoji Dataset ---")
print(df_emoji_words.tail(2))

print("\n--- Last 2 rows of No Emoji Dataset ---")
print(df_no_emoji.tail(2))

In [None]:
# Guard: the two datasets are treated as parallel, so their lengths must agree.
assert len(df_emoji_words) == len(df_no_emoji), "Datasets must have the same number of rows!"

In [None]:
# Sentiment labels for which the pipeline is run further down.
# NOTE(review): this list is not referenced by any of the visible cells.
sentiments = ["positive", "negative"]

# 3. Topic Modelling using BERTopic

## 3.1. Set Up Components

### 3.1.1. Embedding Models
Convert text data to numerical vectors

In [None]:
# Sentence-embedding model placed on DEVICE; it is handed to BERTopic as its
# embedding_model, while document embeddings are loaded from disk in pipeline_per_sent.
embed_allindo = SentenceTransformer('LazarusNLP/all-indo-e5-small-v4', device = DEVICE)

### 3.1.2. Set Vectorizer Model
Convert the documents into a bag-of-words matrix of term counts (unigrams and bigrams, stopwords removed), from which each topic's word importance is derived

In [None]:
def load_stopwords(filepath):
    """Build the combined stopword list handed to the vectorizer.

    Three sources are merged and de-duplicated:
      1. the Tala stopword file at ``filepath`` (one word per line),
      2. the stopwords returned by Sastrawi's ``StopWordRemoverFactory``,
      3. a hand-curated list of additional words.

    Args:
        filepath: Path of a UTF-8 text file holding one stopword per line.

    Returns:
        list[str]: The de-duplicated stopwords in sorted order. When the file
        does not exist a message is printed and the list is built from the
        Sastrawi and manual sources only, instead of returning an empty list.
    """
    stopwords = set()

    # Add from Tala
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            # Strip surrounding whitespace and skip blank lines.
            stopwords.update(line.strip() for line in f if line.strip())
    else:
        # Best effort: a missing file must not throw away the other two sources.
        print(f"❌ Error: Stopwords file not found at {filepath}")
        print("   Continuing with the Sastrawi and manual stopwords only.")

    # Add from Sastrawi
    factory = StopWordRemoverFactory()
    stopwords.update(factory.get_stop_words())

    # List of stopwords found after manually checking the words frequency list AFTER stopwords removal (ADDITIONAL)
    manual_add = ['nya', 'ya', 'ap', 'ok', 'sih', 'deh', 'tau', 'gue', 'kak', 'eh', 'gua', 'tuh', 'lu', 'the', 'by', 'hadeh', 'ku', 'jis', 'an', 'dah', 'mah', 'loh', 'iya', 'you', 'ayo', 'wow', 'jos', 'sip', 'aduh', 'anjir', 'and', 'apatu', 'ah', 'si', 'duh', 'mbak', 'kah', 'amin', 'this', 'mu', 'baiknya', 'berkali', 'kali', 'kurangnya', 'mata', 'olah', 'sekurang', 'setidak', 'tama', 'tidaknya', 'banget', 'pas', 'kayak', 'oke']
    stopwords.update(manual_add)

    # Sorting makes the returned list independent of set iteration order.
    return sorted(stopwords)

In [None]:
# Merge the stopword file, the Sastrawi list and the manual additions into one list.
stopwords = load_stopwords(STOPWORDS_PATH)

In [None]:
# Count unigrams and bigrams, ignoring the stopwords loaded above.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)

### 3.1.3. Class TF-IDF Model
Weight terms by their relevance to a specific topic rather than to the overall corpus

In [None]:
# Class-based TF-IDF transformer used by BERTopic to weight topic terms.
# NOTE(review): reduce_frequent_words=True is meant to dampen very frequent words —
# see the ClassTfidfTransformer documentation for the exact formula.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

## 3.2. Helper Functions

### 3.2.3. Calculate Model Performance

In [None]:
def calculate_topic_coherence(topic_model, docs, top_n=10):
    """Compute the c_v coherence of a fitted topic model with Gensim.

    Args:
        topic_model: Fitted model whose ``get_topics()`` returns
            ``{topic_id: [(word, weight), ...]}``.
        docs: Iterable of document strings; each is tokenised on whitespace.
        top_n: Number of leading words per topic that are scored.

    Returns:
        float: The c_v coherence, or ``np.nan`` when there is no topic to score
        or when Gensim fails to compute the score (the reason is printed).
    """
    print("Calculating topic coherence with Gensim...")

    # Top words per topic, skipping the outlier topic (-1) and empty topics.
    top_n_words = [
        [w for w, _ in words[:top_n]]
        for topic_id, words in topic_model.get_topics().items()
        if topic_id != -1 and len(words) > 0
    ]
    if not top_n_words:
        print("Warning: no non-outlier topics with words to score; coherence set to NaN.")
        return np.nan

    # NOTE(review): documents are split on whitespace, so multi-word topic terms
    # (the vectorizer in this notebook also emits bigrams) cannot occur in the
    # dictionary — confirm how Gensim treats such terms.
    texts = [doc.split() for doc in docs]
    dictionary = Dictionary(texts)

    try:
        return CoherenceModel(
            topics=top_n_words, texts=texts, dictionary=dictionary, coherence="c_v"
        ).get_coherence()
    except Exception as exc:
        # Best effort: keep the sweep running, but say why the score is missing.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print(f"Warning: coherence computation failed ({type(exc).__name__}: {exc}); returning NaN.")
        return np.nan

In [None]:
def calculate_topic_diversity(topic_model, top_n=10):
    """Score how different the topics are from one another.

    The score is ``1 - mean pairwise Jaccard similarity`` of the topics'
    top-``top_n`` word sets; the outlier topic (-1) is left out.

    Args:
        topic_model: Fitted model exposing ``get_topics()`` and ``get_topic(topic_id)``.
        top_n: Number of leading words taken from every topic.

    Returns:
        The diversity score, or ``0`` when no pair of topics could be compared.
    """
    print("Calculating topic diversity...")

    # One set of top words per real topic.
    word_sets = [
        {term for term, _ in topic_model.get_topic(topic_id)[:top_n]}
        for topic_id in topic_model.get_topics()
        if topic_id != -1
    ]

    # Jaccard similarity of every unordered pair of topics.
    jaccard_scores = []
    for position, left in enumerate(word_sets):
        for right in word_sets[position + 1:]:
            combined = left | right
            if not combined:
                # Two empty topics have no defined similarity; leave the pair out.
                continue
            jaccard_scores.append(len(left & right) / len(combined))

    if jaccard_scores:
        diversity_score = 1 - (sum(jaccard_scores) / len(jaccard_scores))
    else:
        diversity_score = 0
    print(f"Topic Diversity: {diversity_score}")

    return diversity_score

### 3.2.4. Generate Topics and Visualizations

In [None]:
def create_wordcloud(topic_model, model_folder, N_COLS=3, top_n=30):
    """Draw one word cloud per topic in a grid and save it as ``wordcloud.png``.

    Args:
        topic_model: Fitted model exposing ``get_topic_info()`` (a frame with
            ``Topic`` and ``Name`` columns) and ``get_topic(topic_id)``.
        model_folder: Output directory; created when it does not exist.
        N_COLS: Number of columns of the grid.
        top_n: Maximum number of words per topic fed to the word cloud.

    Returns:
        None. Returns early, without writing a file, when the model has no
        topic other than the outlier topic -1.
    """
    print("Generating wordcloud for all topics...")

    topic_info = topic_model.get_topic_info()
    topics_to_plot = [t for t in topic_info["Topic"].tolist() if t != -1]

    n_topics = len(topics_to_plot)
    if n_topics == 0:
        print("No topics found (excluding -1).")
        return

    n_rows = math.ceil(n_topics / N_COLS)
    # squeeze=False always returns a 2-D array of axes, so a 1x1 grid needs no special case.
    fig, axes = plt.subplots(
        n_rows,
        N_COLS,
        figsize=(N_COLS * 6, n_rows * 4),
        constrained_layout=True,
        squeeze=False,
    )
    axes = axes.flatten()

    try:
        for ax, topic_id in zip(axes, topics_to_plot):
            topic_words = [word for word, _ in topic_model.get_topic(topic_id)][:top_n]
            text_for_wordcloud = " ".join(topic_words)

            try:
                wc = WordCloud(background_color="white", collocations=False).generate(text_for_wordcloud)
            except ValueError as exc:
                # WordCloud raises ValueError when no drawable word is left; keep this
                # panel empty instead of aborting the whole figure.
                print(f"Skipping word cloud for topic {topic_id}: {exc}")
                wc = None

            if wc is not None:
                ax.imshow(wc, interpolation="bilinear")
            ax.axis("off")

            name_series = topic_info.loc[topic_info["Topic"] == topic_id, "Name"]
            topic_name = name_series.values[0] if len(name_series) > 0 else ""
            ax.set_title(f"Topic {topic_id}: {topic_name}", fontsize=12)

        # Remove the unused trailing panels of the grid.
        for unused_ax in axes[n_topics:]:
            fig.delaxes(unused_ax)

        model_folder = Path(model_folder)
        model_folder.mkdir(parents=True, exist_ok=True)
        wordcloud_path = model_folder / "wordcloud.png"
        fig.savefig(wordcloud_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when rendering or saving failed.
        plt.close(fig)

    print(f"✅ WordCloud saved to {wordcloud_path}")

In [None]:
def save_topics_to_txt(topic_model, model_folder, top_n=20):
    """Write every topic's leading words and weights to ``topics.txt``.

    Each topic returned by ``topic_model.get_topics()`` (the outlier topic
    included) becomes one line: ``Topic <id>: <word>: <weight>, ...`` with
    weights shown to four decimals.

    Args:
        topic_model: Fitted model whose ``get_topics()`` returns
            ``{topic_id: [(word, weight), ...]}``.
        model_folder: Output directory; created when it does not exist.
        top_n: Maximum number of words written per topic.
    """
    print("Saving topic words to text file...")

    def _render(topic_id, weighted_terms):
        # "word: 0.1234" entries, comma-separated, for the first top_n terms.
        entries = ", ".join(
            f"{term}: {score:.4f}" for term, score in weighted_terms[:top_n]
        )
        return f"Topic {topic_id}: {entries}\n"

    rendered = [
        _render(topic_id, weighted_terms)
        for topic_id, weighted_terms in topic_model.get_topics().items()
    ]

    model_folder = Path(model_folder)
    model_folder.mkdir(parents=True, exist_ok=True)
    target = model_folder / "topics.txt"

    with open(target, "w", encoding="utf-8") as handle:
        handle.write("".join(rendered))

    print(f"✅ Topics saved to {target}")

### 3.2.5. Build Model

In [None]:
def merge_params(best_params, override_params):
    """Return a new dict of ``best_params`` with ``override_params`` applied on top.

    Neither input is modified; keys present in both take the override's value.
    """
    return {**best_params, **override_params}

In [None]:
def build_bertopic_from_params(params, embedding_model, vectorizer_model, ctfidf_model, nr_topics, seed=SEED):
    """Assemble an unfitted BERTopic model from a hyperparameter mapping.

    Args:
        params: Mapping with the keys ``n_neighbors``, ``n_components``
            (UMAP), ``min_cluster_size``, ``min_samples`` (HDBSCAN) and
            ``min_topic_size`` (BERTopic).
        embedding_model: Passed to BERTopic as ``embedding_model``.
        vectorizer_model: Passed to BERTopic as ``vectorizer_model``.
        ctfidf_model: Passed to BERTopic as ``ctfidf_model``.
        nr_topics: Passed to BERTopic as ``nr_topics``.
        seed: ``random_state`` of the UMAP reducer.

    Returns:
        The configured (not yet fitted) BERTopic instance.

    NOTE(review): the BERTopic documentation states that ``min_topic_size`` is
    not used when a custom ``hdbscan_model`` is supplied, as is done here —
    confirm that varying ``min_topic_size`` actually changes the result.
    """
    # Dimensionality reduction: cosine distance, points packed tightly (min_dist=0).
    reducer_settings = {
        "n_neighbors": params["n_neighbors"],
        "n_components": params["n_components"],
        "min_dist": 0.0,
        "metric": "cosine",
        "random_state": seed,
    }
    reducer = UMAP(**reducer_settings)

    # Clustering of the reduced vectors.
    clusterer_settings = {
        "min_cluster_size": params["min_cluster_size"],
        "min_samples": params["min_samples"],
        "metric": "euclidean",
        "cluster_selection_method": "eom",
        "prediction_data": True,
    }
    clusterer = HDBSCAN(**clusterer_settings)

    return BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        umap_model=reducer,
        hdbscan_model=clusterer,
        min_topic_size=params["min_topic_size"],
        nr_topics=nr_topics,
        calculate_probabilities=False,
        verbose=False,
    )

In [None]:
def train_manual_config(config_id, params, docs_clean, embeddings, embedding_model, vectorizer_model, ctfidf_model, save_root, nr_topics,
                        embedding_model_name="LazarusNLP/all-indo-e5-small-v4"):
    """Fit, score and persist one BERTopic configuration.

    Everything is written to ``<save_root>/manual_config_<config_id>/``:
    ``metrics.csv``, ``params.json``, the serialised model (``bertopic/``),
    ``representative_docs.json``, ``topics.txt`` and ``wordcloud.png``.

    Args:
        config_id: Identifier used in the output folder name and log lines.
        params: Hyperparameter mapping consumed by ``build_bertopic_from_params``.
        docs_clean: List of document strings to fit on.
        embeddings: Pre-computed document embeddings passed to ``fit_transform``.
        embedding_model: Embedding model handed to BERTopic.
        vectorizer_model: Vectorizer handed to BERTopic.
        ctfidf_model: c-TF-IDF transformer handed to BERTopic.
        save_root: Directory under which the run folder is created.
        nr_topics: Value passed to BERTopic as ``nr_topics``.
        embedding_model_name: Name stored with the saved model as
            ``save_embedding_model``. Defaults to the model used in this
            notebook; pass another name when ``embedding_model`` differs.

    Returns:
        dict: ``{"c_v": ..., "diversity": ..., **params}``.
    """
    print(f"\nRunning Check: {config_id}")
    print("Params:", params)

    topic_model = build_bertopic_from_params(
        params,
        embedding_model,
        vectorizer_model,
        ctfidf_model,
        nr_topics
    )

    # The returned topic assignments are not needed here; the fitted model holds them.
    topic_model.fit_transform(docs_clean, embeddings)

    # Metrics
    coh = calculate_topic_coherence(topic_model, docs_clean)
    div = calculate_topic_diversity(topic_model)

    metrics = {
        "c_v": coh,
        "diversity": div,
        **params
    }

    save_path = Path(save_root) / f"manual_config_{config_id}"
    save_path.mkdir(parents=True, exist_ok=True)

    # Persist the small, costly-to-recompute results first, so a failure while
    # serialising the model or drawing figures cannot lose them.
    pd.DataFrame([metrics]).to_csv(save_path / "metrics.csv", index=False)

    with open(save_path / "params.json", "w", encoding="utf-8") as f:
        json.dump(params, f, indent=2)

    hf_model_path = save_path / "bertopic"

    topic_model.save(
        path=str(hf_model_path),
        serialization="safetensors",
        save_ctfidf=True,
        save_embedding_model=embedding_model_name
    )

    print(f"✅ BERTopic safetensors model saved to: {hf_model_path}")

    # JSON object keys must be strings, so topic ids are converted.
    rep_docs_serializable = {
        str(topic_id): docs
        for topic_id, docs in topic_model.get_representative_docs().items()
    }

    rep_docs_path = save_path / "representative_docs.json"

    with open(rep_docs_path, "w", encoding="utf-8") as f:
        json.dump(rep_docs_serializable, f, ensure_ascii=False, indent=2)

    print(f"✅ Representative docs saved to: {rep_docs_path}")

    save_topics_to_txt(topic_model, save_path)
    create_wordcloud(topic_model, save_path)

    print(f"✅ Run {config_id} saved | Coherence={coh:.4f}")

    return metrics

In [None]:
def pipeline_per_sent(
    sentiment, df_emoji, df_clean, embed_model, vectorizer_model, ctfidf_model,
    optuna_best_params, new_hyperparams, save_root, nr_topics, embed_root=None
):
    """Run every hyperparameter override for the documents of one sentiment.

    Documents are the ``cleaned_content`` values of ``df_clean`` whose
    ``sentiment`` equals ``sentiment`` and that have more than three
    whitespace-separated tokens. Their pre-computed embeddings are loaded from
    ``<embed_root>/embeddings/indo-e5_<sentiment>_emoji_context.npy`` and must
    contain exactly one row per kept document.

    Args:
        sentiment: Label selected from ``df_clean["sentiment"]``.
        df_emoji: Kept for interface compatibility; not read by this function.
            NOTE(review): the original code extracted documents from it but
            never used them — confirm nothing further was intended.
        df_clean: Frame with ``sentiment`` and ``cleaned_content`` columns.
        embed_model: Embedding model forwarded to ``train_manual_config``.
        vectorizer_model: Vectorizer forwarded to ``train_manual_config``.
        ctfidf_model: c-TF-IDF transformer forwarded to ``train_manual_config``.
        optuna_best_params: Baseline hyperparameter dict.
        new_hyperparams: Iterable of override dicts, each merged on top of the baseline.
        save_root: Output root; results go to ``<save_root>/<sentiment>/``.
        nr_topics: Forwarded to ``train_manual_config``.
        embed_root: Folder containing the ``embeddings`` directory. Defaults to
            the notebook-level ``EMBED_SAVE_ROOT``.

    Returns:
        dict: ``{config_id: metrics}`` with zero-padded two-digit config ids.

    Raises:
        AssertionError: If the number of documents and embedding rows differ.
    """
    print("="*50)
    print(f"Sentiment: {sentiment}")
    print("="*50)

    mask = df_clean["sentiment"] == sentiment
    docs_clean_raw = df_clean.loc[mask, "cleaned_content"].astype(str).tolist()

    # Keep documents with more than three whitespace-separated tokens.
    docs_clean = [doc for doc in docs_clean_raw if len(doc.split()) > 3]

    if embed_root is None:
        embed_root = EMBED_SAVE_ROOT
    emb_path = Path(embed_root) / "embeddings" / f"indo-e5_{sentiment}_emoji_context.npy"
    embeddings = np.load(emb_path)
    assert len(docs_clean) == embeddings.shape[0], \
    f"Docs ({len(docs_clean)}) and embeddings ({embeddings.shape[0]}) mismatch"

    sentiment_root = Path(save_root) / sentiment
    sentiment_root.mkdir(parents=True, exist_ok=True)
    summary_path = sentiment_root / "manual_hp_summary.csv"

    results = {}

    for i, override in enumerate(new_hyperparams):
        merged_params = merge_params(optuna_best_params, override)
        config_id = f"{i:02d}"

        metrics = train_manual_config(
            config_id=config_id,
            params=merged_params,
            docs_clean=docs_clean,
            embeddings=embeddings,
            embedding_model=embed_model,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            save_root=sentiment_root,
            nr_topics=nr_topics
        )

        results[config_id] = metrics

        # Rewrite the summary after every run so finished configurations
        # survive an interrupted sweep (e.g. a runtime disconnect).
        pd.DataFrame.from_dict(results, orient="index").to_csv(summary_path)

    # Final write; also covers an empty sweep, for which no file was written above.
    pd.DataFrame.from_dict(results, orient="index").to_csv(summary_path)

    return results

In [None]:
# Baseline hyperparameters for the positive-sentiment model; every entry of
# hp_list_pos overrides exactly one of these values.
optuna_best_params_pos = dict(
    n_neighbors=17,
    n_components=9,
    min_cluster_size=49,
    min_samples=15,
    min_topic_size=87,
)

In [None]:
# One-at-a-time overrides for the positive model: each value below becomes a
# single-key dict that replaces one baseline hyperparameter per run.
# NOTE(review): BERTopic documents min_topic_size as unused when a custom
# hdbscan_model is supplied — confirm those runs differ from the baseline.
hp_list_pos = [
    {name: value}
    for name, values in {
        'n_neighbors': [5, 10, 30, 35, 40],
        'n_components': [3, 15],
        'min_cluster_size': [35, 20, 70, 80, 95],
        'min_samples': [5, 20],
        'min_topic_size': [30, 45, 60, 75, 110, 120],
    }.items()
    for value in values
]

In [None]:
# Run the one-at-a-time hyperparameter check on the positive documents.
# Each entry of hp_list_pos is merged over optuna_best_params_pos; results are
# written under SAVE_ROOT/positive.
pipeline_per_sent(
    sentiment="positive",
    df_emoji=df_emoji_words,
    df_clean=df_no_emoji,
    embed_model=embed_allindo,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    optuna_best_params=optuna_best_params_pos,
    new_hyperparams=hp_list_pos,
    save_root=SAVE_ROOT,
    nr_topics=13  # forwarded to BERTopic's nr_topics
)

In [None]:
# Baseline hyperparameters for the negative-sentiment model; every entry of
# hp_list_neg overrides exactly one of these values.
optuna_best_params_neg = dict(
    n_neighbors=28,
    n_components=9,
    min_cluster_size=65,
    min_samples=6,
    min_topic_size=51,
)

In [None]:
# One-at-a-time overrides for the negative model: each value below becomes a
# single-key dict that replaces one baseline hyperparameter per run.
# NOTE(review): BERTopic documents min_topic_size as unused when a custom
# hdbscan_model is supplied — confirm those runs differ from the baseline.
hp_list_neg = [
    {name: value}
    for name, values in {
        'n_neighbors': [20, 15, 10, 35, 40, 50],
        'n_components': [5, 15],
        'min_cluster_size': [50, 35, 20, 75, 85, 95],
        'min_samples': [3, 15, 20],
        'min_topic_size': [30, 45, 65, 80, 110, 120],
    }.items()
    for value in values
]

In [None]:
# Run the one-at-a-time hyperparameter check on the negative documents.
# Each entry of hp_list_neg is merged over optuna_best_params_neg; results are
# written under SAVE_ROOT/negative.
pipeline_per_sent(
    sentiment="negative",
    df_emoji=df_emoji_words,
    df_clean=df_no_emoji,
    embed_model=embed_allindo,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    optuna_best_params=optuna_best_params_neg,
    new_hyperparams=hp_list_neg,
    save_root=SAVE_ROOT,
    nr_topics=8  # forwarded to BERTopic's nr_topics
)