In [1]:
# Saved-model locations; load_eval appends the variant name to these prefixes.
PATH_PREFIX_MODEL_LDA = './results/models_lda/'
PATH_PREFIX_MODEL_BERTOPIC = './results/models_bertopic/'

# Tables read at the start of the notebook (the timing table is also
# rewritten in the final cell).
PATH_FILE_PREPROCESSED = './results/ds.parquet'
PATH_FILE_ELAPSE_TIME = './results/elapse_time.csv'

# Evaluation tables written in the final cell.
PATH_FILE_EVALUATION_LDA = './results/evaluation_lda.csv'
PATH_FILE_EVALUATION_BERTOPIC = './results/evaluation_bertopic.csv'

In [2]:
import pandas as pd

# Timing table: one row per variant; this notebook adds evaluation columns.
elapse_time = pd.read_csv(PATH_FILE_ELAPSE_TIME)

# Document table: one column per variant, evaluated column by column below.
ds = pd.read_parquet(PATH_FILE_PREPROCESSED)

In [3]:
from time import time

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from bertopic import BERTopic

from utils import get_coherence, get_diversity, get_topics_lda, get_topics_bertopic


def load_eval(variant: str, bertopic: bool = False):
    """Load the saved model for ``variant`` and score its topics.

    Args:
        variant: Name of a column in ``ds``; it is also appended to the model
            path prefix to locate the saved model.
        bertopic: When true, evaluate the saved BERTopic model; otherwise
            evaluate the saved LDA model.

    Returns:
        A ``(scores, seconds)`` tuple. ``scores`` is a dict with the keys
        ``variant``, ``coherence``, ``diversity`` and ``score`` (coherence
        multiplied by diversity). ``seconds`` is the wall-clock time spent
        loading the model and computing the metrics.
    """
    t_start = time()
    docs = ds[variant].dropna()
    if bertopic:
        model = BERTopic.load(f'{PATH_PREFIX_MODEL_BERTOPIC}{variant}')
        topics = get_topics_bertopic(model)
        analyzer = model.vectorizer_model.build_analyzer()
        # Decide from the *argument*, not from a loop variable that happens to
        # exist in the notebook's global scope. Variants whose name contains
        # 'B' are handed to the analyzer unchanged; all others are joined with
        # spaces first (presumably they hold token lists -- confirm upstream).
        if 'B' in variant:
            raw_docs = docs
        else:
            raw_docs = [' '.join(doc) for doc in docs]
        # Re-tokenize with the model's own analyzer and build the dictionary
        # from those tokens.
        texts = [analyzer(doc) for doc in raw_docs]
        dictionary = Dictionary(texts)
    else:
        model = LdaMulticore.load(f'{PATH_PREFIX_MODEL_LDA}{variant}')
        topics = get_topics_lda(model, model.id2word)
        # The LDA path uses the column as-is and reuses the model's vocabulary.
        texts = docs
        dictionary = model.id2word
    c = get_coherence(
        topics=topics,
        texts=texts,
        dictionary=dictionary
    )
    d = get_diversity(topics)
    total_time = time() - t_start
    return ({
        'variant': variant,
        'coherence': c,
        'diversity': d,
        'score': c*d
    }, total_time)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [4]:
from tqdm import tqdm

# Evaluate the saved LDA model of every column in `ds`, skipping columns whose
# name contains 'B'.
lda_eval = []
elapse_time_lda = []

for v in (tds := tqdm(ds.columns)):
    tds.set_description(f'Evaluating LDA {v}')
    if 'B' in v: continue
    lda_score, lda_time = load_eval(v)
    lda_eval.append(lda_score)
    elapse_time_lda.append(lda_time)

# Pair each timing with the variant it belongs to. Assigning a plain
# positional Series would shift timings onto the wrong rows of `elapse_time`
# as soon as any column is skipped above.
lda_time_by_variant = dict(
    zip((row['variant'] for row in lda_eval), elapse_time_lda)
)

lda_eval = pd.DataFrame(lda_eval)
# Variants that were not evaluated get NaN.
elapse_time['lda_evaluation'] = elapse_time['variant'].map(lda_time_by_variant)

Evaluating LDA DLWN: 100%|██████████| 2/2 [00:05<00:00,  2.65s/it]


In [5]:
from tqdm import tqdm

# Evaluate the saved BERTopic model of every column in `ds`, skipping columns
# whose name contains 'T'.
bertopic_eval = []
elapse_time_bertopic = []

for v in (tds := tqdm(ds.columns)):
    tds.set_description(f'Evaluating BERTopic {v}')
    if 'T' in v: continue
    bertopic_score, bertopic_time = load_eval(v, bertopic=True)
    bertopic_eval.append(bertopic_score)
    elapse_time_bertopic.append(bertopic_time)

# Pair each timing with the variant it belongs to. Assigning a plain
# positional Series would shift timings onto the wrong rows of `elapse_time`
# as soon as any column is skipped above.
bertopic_time_by_variant = dict(
    zip((row['variant'] for row in bertopic_eval), elapse_time_bertopic)
)

bertopic_eval = pd.DataFrame(bertopic_eval)
# Variants that were not evaluated get NaN.
elapse_time['bertopic_evaluation'] = elapse_time['variant'].map(bertopic_time_by_variant)

Evaluating BERTopic DLWN: 100%|██████████| 2/2 [00:14<00:00,  7.11s/it]


In [6]:
# Show the per-variant LDA metrics: coherence, diversity and their product (score).
lda_eval

Unnamed: 0,variant,coherence,diversity,score
0,dLWN,0.33359,0.28,0.093405
1,DLWN,0.42329,0.344944,0.146011


In [7]:
# Show the per-variant BERTopic metrics: coherence, diversity and their product (score).
bertopic_eval

Unnamed: 0,variant,coherence,diversity,score
0,dLWN,0.486004,0.901282,0.438026
1,DLWN,0.626806,1.0,0.626806


In [8]:
# Show the timing table, now including the lda_evaluation and bertopic_evaluation columns.
elapse_time

Unnamed: 0,variant,nlp,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dLWN,267.22045,0.790498,13.949261,285.131878,2.156977,8.765084
1,DLWN,267.22045,1.186545,12.185707,30.66199,3.115971,4.547993


In [9]:
# Write each result table to its CSV path without the index column.
# Note that the timing table overwrites the file it was loaded from.
for table, destination in (
    (elapse_time, PATH_FILE_ELAPSE_TIME),
    (lda_eval, PATH_FILE_EVALUATION_LDA),
    (bertopic_eval, PATH_FILE_EVALUATION_BERTOPIC),
):
    table.to_csv(destination, index=False)