In [1]:
# Single root for every artefact this notebook reads or writes; change it here
# to relocate the whole results tree. The trailing slash is required because
# the paths below are built by plain string concatenation.
PATH_DIR_RESULTS = './results/'

# Parquet file loaded into `ds`.
PATH_FILE_PREPROCESSED = PATH_DIR_RESULTS + 'ds.parquet'
# CSV of per-variant timings; read at the start and rewritten at the end.
PATH_FILE_ELAPSE_TIME = PATH_DIR_RESULTS + 'elapse_time.csv'
# Destination CSVs for the evaluation tables.
PATH_FILE_EVALUATION_LDA = PATH_DIR_RESULTS + 'evaluation_lda.csv'
PATH_FILE_EVALUATION_BERTOPIC = PATH_DIR_RESULTS + 'evaluation_bertopic.csv'
# Model path prefixes: the variant name is appended directly to these, so each
# must keep its trailing slash.
PATH_PREFIX_MODEL_LDA = PATH_DIR_RESULTS + 'models_lda/'
PATH_PREFIX_MODEL_BERTOPIC = PATH_DIR_RESULTS + 'models_bertopic/'

In [2]:
import pandas as pd

# Documents to evaluate: one column per variant (columns are iterated as
# variant names further down).
ds = pd.read_parquet(path=PATH_FILE_PREPROCESSED)

# Timing table; evaluation timings are added to it as new columns later on.
elapse_time = pd.read_csv(filepath_or_buffer=PATH_FILE_ELAPSE_TIME)

In [10]:
from time import time

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from bertopic import BERTopic

from utils import get_coherence, get_diversity, get_topics_lda, get_topics_bertopic


def load_eval(variant, bertopic=False):
    """Load the saved topic model for ``variant`` and score its topics.

    The documents come from column ``variant`` of the module-level ``ds``
    frame, with missing entries dropped. Depending on ``bertopic`` either the
    BERTopic or the LDA model stored under the matching path prefix is loaded,
    and its topics are scored with the ``utils`` helpers.

    Args:
        variant: Column name in ``ds``; it is also appended to the model path
            prefix to locate the saved model.
        bertopic: If true, evaluate the BERTopic model, otherwise the LDA model.

    Returns:
        A ``(scores, seconds)`` tuple. ``scores`` is a dict with the keys
        ``variant``, ``coherence``, ``diversity`` and ``score`` (the product
        of coherence and diversity). ``seconds`` is the wall-clock duration of
        the whole call, model loading included.
    """
    started_at = time()
    documents = ds[variant].dropna()

    if not bertopic:
        model = LdaMulticore.load(f'{PATH_PREFIX_MODEL_LDA}{variant}')
        # The LDA model carries its own vocabulary; reuse it for scoring.
        vocabulary = model.id2word
        topics = get_topics_lda(model, vocabulary)
        texts = documents
    else:
        model = BERTopic.load(f'{PATH_PREFIX_MODEL_BERTOPIC}{variant}')
        topics = get_topics_bertopic(model)
        tokenize = model.vectorizer_model.build_analyzer()
        # Each document is joined into one space-separated string and then
        # re-tokenised with the model's own analyzer, so the texts and the
        # dictionary are built from that analyzer's output.
        texts = [tokenize(' '.join(tokens)) for tokens in documents]
        vocabulary = Dictionary(texts)

    coherence = get_coherence(topics=topics, texts=texts, dictionary=vocabulary)
    diversity = get_diversity(topics)
    elapsed = time() - started_at

    scores = {
        'variant': variant,
        'coherence': coherence,
        'diversity': diversity,
        'score': coherence * diversity,
    }
    return scores, elapsed

In [4]:
from tqdm import tqdm

# Evaluate the saved LDA model of every variant (column of `ds`), collecting
# one score record and one timing per variant.
lda_records = []
elapse_time_lda = []

progress = tqdm(ds.columns)
for variant in progress:
    progress.set_description(f'Evaluating LDA {variant}')
    if 'B' not in variant:
        scores, seconds = load_eval(variant)
        lda_records.append(scores)
        elapse_time_lda.append(seconds)
    else:
        # Variants whose name contains 'B' are skipped here (presumably they
        # are BERTopic-only — confirm). The 0 placeholder keeps the timing
        # list positionally aligned with `ds.columns`.
        elapse_time_lda.append(0)

lda_eval = pd.DataFrame(lda_records)
# Assigned by index: assumes the rows of `elapse_time` are in the same order
# as `ds.columns`.
elapse_time['lda_evaluation'] = pd.Series(elapse_time_lda)

Evaluating LDA DSWP: 100%|██████████| 4/4 [00:05<00:00,  1.36s/it]


In [11]:
from tqdm import tqdm

# Evaluate the saved BERTopic model of every variant (column of `ds`),
# collecting one score record and one timing per variant.
bertopic_records = []
elapse_time_bertopic = []

progress = tqdm(ds.columns)
for variant in progress:
    progress.set_description(f'Evaluating BERTopic {variant}')
    if 'T' not in variant:
        scores, seconds = load_eval(variant, bertopic=True)
        bertopic_records.append(scores)
        elapse_time_bertopic.append(seconds)
    else:
        # Variants whose name contains 'T' are skipped here (presumably they
        # are LDA-only — confirm). The 0 placeholder keeps the timing list
        # positionally aligned with `ds.columns`.
        elapse_time_bertopic.append(0)

bertopic_eval = pd.DataFrame(bertopic_records)
# Assigned by index: assumes the rows of `elapse_time` are in the same order
# as `ds.columns`.
elapse_time['bertopic_evaluation'] = pd.Series(elapse_time_bertopic)

Evaluating BERTopic DSWP: 100%|██████████| 4/4 [00:06<00:00,  1.59s/it]


In [6]:
# LDA results: one row per evaluated variant with its coherence, diversity
# and combined score (coherence * diversity).
lda_eval

Unnamed: 0,variant,coherence,diversity,score
0,dSG,0.415888,0.0175,0.007278
1,DSG,0.343492,0.02,0.00687
2,dSWP,0.375896,0.614754,0.231084
3,DSWP,0.469698,0.496875,0.233381


In [12]:
# BERTopic results: one row per evaluated variant with its coherence,
# diversity and combined score (coherence * diversity).
bertopic_eval

Unnamed: 0,variant,coherence,diversity,score
0,dSG,0.702989,0.928788,0.652928
1,DSG,0.78529,0.7,0.549703
2,dSWP,0.578578,0.955814,0.553013
3,DSWP,0.54396,1.0,0.54396


In [8]:
# Timing table as loaded from disk, now extended with the `lda_evaluation`
# and `bertopic_evaluation` columns (seconds measured inside `load_eval`;
# 0 marks a variant that was skipped).
elapse_time

Unnamed: 0,variant,tokenizing,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dSG,4.591405,1.53467,4.760596,12.345006,0.793283,2.095778
1,DSG,4.591405,1.534672,4.620546,3.080706,1.06545,1.221303
2,dSWP,4.591405,0.573513,7.186311,6.88452,1.121705,2.247856
3,DSWP,4.591405,0.702334,7.947597,2.630113,2.438741,0.304405


In [9]:
# Persist the results without the index column. The timing table is written
# back to the same file it was loaded from, so that file gains the two
# evaluation columns.
for frame, destination in (
    (elapse_time, PATH_FILE_ELAPSE_TIME),
    (lda_eval, PATH_FILE_EVALUATION_LDA),
    (bertopic_eval, PATH_FILE_EVALUATION_BERTOPIC),
):
    frame.to_csv(destination, index=False)