In [1]:
# Every input and output of this notebook lives under one results directory.
_RESULTS_DIR = './results/'

# Data files read and/or written by the cells below.
PATH_FILE_PREPROCESSED = _RESULTS_DIR + 'ds.parquet'
PATH_FILE_ELAPSE_TIME = _RESULTS_DIR + 'elapse_time.csv'
PATH_FILE_EVALUATION_LDA = _RESULTS_DIR + 'evaluation_lda.csv'
PATH_FILE_EVALUATION_BERTOPIC = _RESULTS_DIR + 'evaluation_bertopic.csv'

# Directory prefixes; the variant name is appended to locate a saved model.
PATH_PREFIX_MODEL_LDA = _RESULTS_DIR + 'models_lda/'
PATH_PREFIX_MODEL_BERTOPIC = _RESULTS_DIR + 'models_bertopic/'

In [2]:
import pandas as pd

# Corpus table: each column is one variant, selected later by column name.
ds = pd.read_parquet(path=PATH_FILE_PREPROCESSED)
# Timing table; this notebook adds evaluation-time columns to it and
# writes it back to the same file.
elapse_time = pd.read_csv(filepath_or_buffer=PATH_FILE_ELAPSE_TIME)

In [3]:
from time import time

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from bertopic import BERTopic

from utils import get_coherence, get_diversity, get_topics_lda, get_topics_bertopic


def load_eval(variant, bertopic=False):
    """Load the saved topic model for ``variant`` and score it.

    Parameters
    ----------
    variant : str
        Name of a column in the global ``ds`` frame. It is also appended to
        the model path prefix to locate the saved model on disk.
    bertopic : bool, default False
        If True, load and score the BERTopic model; otherwise the LDA model.

    Returns
    -------
    tuple[dict, float]
        ``(scores, seconds)``. ``scores`` has the keys ``'variant'``,
        ``'coherence'``, ``'diversity'`` and ``'score'`` (coherence times
        diversity). ``seconds`` is the wall-clock time of the whole call,
        including loading the model from disk.
    """
    t_start = time()
    # Rows with a missing value in this column are excluded from scoring.
    docs = ds[variant].dropna()
    if bertopic:
        model = BERTopic.load(f'{PATH_PREFIX_MODEL_BERTOPIC}{variant}')
        topics = get_topics_bertopic(model)
        # Tokenise with the model's own vectorizer analyzer so the coherence
        # vocabulary is built the same way the model built its own.
        analyzer = model.vectorizer_model.build_analyzer()
        # Fix: this used to test the global loop variable ``v`` instead of the
        # ``variant`` argument, so the result depended on the caller's loop.
        # Documents are joined into one string before analysis unless the
        # variant name contains 'B'.
        # NOTE(review): presumably 'B' variants already hold raw strings - confirm.
        texts = [' '.join(doc) for doc in docs] if 'B' not in variant else docs
        texts = [analyzer(doc) for doc in texts]
        dictionary = Dictionary(texts)
    else:
        model = LdaMulticore.load(f'{PATH_PREFIX_MODEL_LDA}{variant}')
        topics = get_topics_lda(model, model.id2word)
        # LDA is scored on the stored documents as-is, with the model's own
        # dictionary.
        texts = docs
        dictionary = model.id2word
    c = get_coherence(
        topics=topics,
        texts=texts,
        dictionary=dictionary
    )
    d = get_diversity(topics)
    total_time = time() - t_start
    return ({
        'variant': variant,
        'coherence': c,
        'diversity': d,
        'score': c*d
    }, total_time)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [4]:
from tqdm import tqdm

# Score the saved LDA model of every variant and record how long each took.
lda_records = []
elapse_time_lda = []

progress = tqdm(ds.columns)
for v in progress:
    progress.set_description(f'Evaluating LDA {v}')
    if 'B' not in v:
        lda_score, lda_time = load_eval(v)
        lda_records.append(lda_score)
        elapse_time_lda.append(lda_time)
    else:
        # Variants with 'B' in the name are not evaluated with LDA; a 0
        # keeps the timing list the same length as ds.columns.
        elapse_time_lda.append(0)

lda_eval = pd.DataFrame(lda_records)
# Added by position: entry i of the list lands on row i of elapse_time.
elapse_time['lda_evaluation'] = pd.Series(elapse_time_lda)

Evaluating LDA DCLW: 100%|██████████| 4/4 [00:04<00:00,  1.22s/it]


In [5]:
from tqdm import tqdm

# Score the saved BERTopic model of every variant and record how long each took.
bertopic_records = []
elapse_time_bertopic = []

progress = tqdm(ds.columns)
for v in progress:
    progress.set_description(f'Evaluating BERTopic {v}')
    if 'T' not in v:
        bertopic_score, bertopic_time = load_eval(v, bertopic=True)
        bertopic_records.append(bertopic_score)
        elapse_time_bertopic.append(bertopic_time)
    else:
        # Variants with 'T' in the name are not evaluated with BERTopic; a 0
        # keeps the timing list the same length as ds.columns.
        elapse_time_bertopic.append(0)

bertopic_eval = pd.DataFrame(bertopic_records)
# Added by position: entry i of the list lands on row i of elapse_time.
elapse_time['bertopic_evaluation'] = pd.Series(elapse_time_bertopic)

Evaluating BERTopic DCLW: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]


In [6]:
# Display the LDA scores: one row per variant with coherence, diversity and score.
lda_eval

Unnamed: 0,variant,coherence,diversity,score
0,dCSW,0.366997,0.043056,0.015801
1,DCSW,0.417495,0.437895,0.182819
2,dCLW,0.342679,0.308571,0.105741
3,DCLW,0.405463,0.167391,0.067871


In [7]:
# Display the BERTopic scores: one row per variant with coherence, diversity and score.
bertopic_eval

Unnamed: 0,variant,coherence,diversity,score
0,dCSW,0.646014,0.928916,0.600092
1,DCSW,0.598728,0.95,0.568792
2,dCLW,0.640805,0.914118,0.585771
3,DCLW,0.584094,0.95,0.554889


In [8]:
# Display the timing table, now including the two evaluation-time columns added above.
elapse_time

Unnamed: 0,variant,nlp,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dCSW,40.981253,1.658864,3.349057,6.923768,0.699681,1.822078
1,DCSW,40.981253,1.658865,7.551509,2.469865,2.293074,0.270443
2,dCLW,40.981253,0.075823,6.950676,6.947025,1.037146,1.932556
3,DCLW,40.981253,0.075824,3.171497,2.534088,0.839975,0.272668


In [9]:
# Write the timing table back to the file it was loaded from, and save both
# evaluation tables; the row index is left out of every CSV.
for _frame, _target in (
    (elapse_time, PATH_FILE_ELAPSE_TIME),
    (lda_eval, PATH_FILE_EVALUATION_LDA),
    (bertopic_eval, PATH_FILE_EVALUATION_BERTOPIC),
):
    _frame.to_csv(_target, index=False)