In [1]:
PATH_FILE_PREPROCESSED = './results/ds.parquet'
PATH_FILE_ELAPSE_TIME = './results/elapse_time.csv'
PATH_PREFIX_MODEL_LDA = './results/models_lda/'
PATH_PREFIX_MODEL_BERTOPIC = './results/models_bertopic/'
USE_GPU = True

In [2]:
import pandas as pd

ds = pd.read_parquet(PATH_FILE_PREPROCESSED)
elapse_time = pd.read_csv(PATH_FILE_ELAPSE_TIME)

In [3]:
ds['dB'].head()

0    Akhir Kisah Misterius Kematian Satu Keluarga d...
1    Direktur Reserse Kriminal Umum Polda Metro Jay...
2    Proses penyelidikan yang telah berjalan selama...
3    Polisi juga tidak menemukan minimal dua barang...
4    Motif bunuh diri atau pembunuhan juga tidak ad...
Name: dB, dtype: object

In [4]:
elapse_time.head()

Unnamed: 0,variant,tokenizing,preprocessing
0,dB,56.51815,0.004319
1,DB,56.51815,0.00432
2,dT,56.51815,0.032136
3,DT,56.51815,0.032136
4,dCLNP,56.51815,0.240506


In [5]:
from time import time

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from bertopic import BERTopic
from gensim.corpora.dictionary import Dictionary
from utils import get_coherence, get_diversity
from utils import get_topics_bertopic

if USE_GPU:
    from cuml.cluster import HDBSCAN
    from cuml.manifold import UMAP

def get_bertopic(v, verbose=False, get_model=False, get_docs=False, get_score=False):
    start = time()
    docs = ds[v].dropna()
    docs = [' '.join(doc) for doc in docs] if 'B' not in v else docs
    umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
    hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
    model = BERTopic(
        language='multilingual',
        hdbscan_model=hdbscan_model,
        n_gram_range=(1,3),
        nr_topics='auto',
        umap_model=umap_model,
        verbose=verbose,
    )
    topics_d, probs = model.fit_transform(docs)
    topics_w = get_topics_bertopic(model, all=True)
    end = time()
    result = {
        'topics_d': topics_d,
        'topics_w': topics_w,
        'time': end - start
    }
    if get_model: result['model'] = model
    if get_docs: result['docs'] = docs
    if get_score:
        analyzer = model.vectorizer_model.build_analyzer()
        texts = [analyzer(doc) for doc in docs]
        dictionary = Dictionary(texts)
        try:
            c = get_coherence(topics=topics_w, texts=texts, dictionary=dictionary)
        except ValueError:
            c = -1
        d = get_diversity(topics=topics_w)
        end = time()
        result['coherence'] = c
        result['diversity'] = d
        result['score'] = c * d
        result['time'] = end - start
    return result

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [6]:
for v in ds.columns:
    model = get_bertopic(v, get_score=True, verbose=False)
    print(f"{v}{model['score']}")

dB0.6717075907201822
DB0.5936742696832032
dT0.6382638689972987
DT0.5961457224556992
dCLNP0.40251067781481303
DCLNP0.7985681390027958
dCLNG0.5325374112900989
DCLNG0.6648666828236153
dCLWP0.4370102290388285
DCLWP0.862278809710521
CUDA call='cudaEventDestroy(event_)' at file=/home/lab_sc/mambaforge/envs/lba/include/raft/core/resource/cuda_event.hpp line=33 failed with initialization error
CUDA call='cudaEventDestroy(event_)' at file=/home/lab_sc/mambaforge/envs/lba/include/raft/core/resource/cuda_event.hpp line=33 failed with initialization error
CUDA call='cudaEventDestroy(event_)' at file=/home/lab_sc/mambaforge/envs/lba/include/raft/core/resource/cuda_event.hpp line=33 failed with initialization error
CUDA call='cudaEventDestroy(event_)' at file=/home/lab_sc/mambaforge/envs/lba/include/raft/core/resource/cuda_event.hpp line=33 failed with initialization error
dCLWG0.7444082863528788
DCLWG0.5333630044619821
dCSNP0.41785881660622054
DCSNP0.6494524776256811
dCSNG0.5639688760967217
DCSNG0.

In [7]:
ds.columns

Index(['dB', 'DB', 'dT', 'DT', 'dCLNP', 'DCLNP', 'dCLNG', 'DCLNG', 'dCLWP',
       'DCLWP', 'dCLWG', 'DCLWG', 'dCSNP', 'DCSNP', 'dCSNG', 'DCSNG', 'dCSWP',
       'DCSWP', 'dCSWG', 'DCSWG'],
      dtype='object')