In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from tqdm import tqdm, trange
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
import pandas as pd
import numpy as np

In [None]:
# Dimensionality reducer configured for the topic pipeline: 5 output
# components, cosine distance.
# UMAP is stochastic; a fixed random_state makes the projection repeatable
# across Restart & Run All (umap-learn runs single-threaded when a seed is set).
# NOTE(review): the `umap_model=umap_reducer` argument in the BERTopic cell is
# commented out, so this object is currently not used by the model.
umap_reducer = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [None]:
# Clustering stage configuration: clusters need at least 150 members,
# Euclidean distance, 'eom' cluster selection, prediction data retained.
# NOTE(review): the `hdbscan_model=hdbscan_clusterer` argument in the BERTopic
# cell is commented out, so this object is currently not used by the model.
hdbscan_clusterer = HDBSCAN(
    metric='euclidean',
    min_cluster_size=150,
    cluster_selection_method='eom',
    prediction_data=True,
)

In [None]:
# Bag-of-words vectorizer handed to BERTopic: unigrams and bigrams, keeping
# only terms whose document frequency is at least 20%.
# NOTE(review): BERTopic presumably fits this on one concatenated document per
# topic, so a float min_df would be a fraction of topics rather than of
# abstracts — confirm 0.20 is the intended threshold.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=0.20,
)

In [None]:
# Topic representation: a single KeyBERT-inspired model registered under
# the "KeyBERT" aspect name.
keybert_model = KeyBERTInspired()
representation_model = dict(KeyBERT=keybert_model)

In [None]:
# Sentence encoder handed to BERTopic as its embedding backend.
sentence_encoder = SentenceTransformer(model_name_or_path='paraphrase-multilingual-MiniLM-L12-v2')

# Assemble the topic model. The custom UMAP / HDBSCAN stages are left
# commented out, so BERTopic falls back to its own defaults for those steps.
# NOTE(review): per BERTopic's docs `language` is presumably ignored once an
# explicit embedding_model is supplied — confirm whether it is still needed.
topic_model = BERTopic(
    # Hyperparameters
    language='multilingual',
    top_n_words=10,
    verbose=True,

    # Pipeline models
    embedding_model=sentence_encoder,
    # umap_model=umap_reducer,
    # hdbscan_model=hdbscan_clusterer,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

In [None]:
prefix_dataset = 'teknik-informatika'

# Load the preprocessed abstracts and their precomputed embeddings; both files
# are keyed by the same dataset prefix.
abstracts_df = pd.read_csv(f'../data/preprocessed/{prefix_dataset}-preprocessed.csv', index_col=0)
embeddings = np.load(f'../data/embeddings/{prefix_dataset}.npy')

# Rows cannot be dropped here without misaligning them from the precomputed
# embeddings, so a missing abstract is reported instead of silently handled.
if abstracts_df['abstract'].isna().any():
    raise ValueError("'abstract' column contains missing values; fix the preprocessed CSV")

# BERTopic documents its input as a list of strings. A plain list also avoids
# carrying the CSV's index into the model, where positional lookups such as
# docs[i] would otherwise be resolved against index labels.
data = abstracts_df['abstract'].astype(str).tolist()

if len(data) != len(embeddings):
    raise ValueError(
        f"{len(data)} abstracts but {len(embeddings)} embedding rows; "
        "the CSV and .npy files are out of sync"
    )

# Fit on the precomputed embeddings; returns the topic assignment per document
# and the associated probabilities.
topics, probs = topic_model.fit_transform(data, embeddings)

In [None]:
# Show the summary table for the fitted topics.
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# Inspect every representation (full=True) of topic -1, presumably the
# outlier topic under BERTopic's numbering convention.
outlier_topic_id = -1
topic_model.get_topic(outlier_topic_id, full=True)

In [None]:
# Render the topic hierarchy figure.
# NOTE(review): custom_labels=True is requested, but no call that sets custom
# topic labels is visible in this notebook section — confirm labels are set
# elsewhere, otherwise the flag presumably has no effect.
hierarchy_fig = topic_model.visualize_hierarchy(custom_labels=True)
hierarchy_fig

In [None]:
# Project the document embeddings to 2-D for plotting only; this is a separate
# UMAP fit from whatever reduction the topic model ran internally.
# A fixed random_state keeps the scatter layout identical between runs.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

In [None]:
# Plot the documents on the precomputed 2-D projection.
# The documents are passed as a positional list: if `data` is a pandas Series
# that still carries the CSV's index, integer lookups inside the plot helper
# would presumably be resolved by label instead of by position.
topic_model.visualize_documents(
    list(data),
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
)