In [1]:
# Parquet file loaded into `ds` by this notebook.
PATH_FILE_PREPROCESSED = './results/ds.parquet'
# Timing table: read at the start and written back at the end with the
# evaluation timing columns added.
PATH_FILE_ELAPSE_TIME = './results/elapse_time.csv'
# Destination CSVs for the per-variant evaluation tables.
PATH_FILE_EVALUATION_LDA = './results/evaluation_lda.csv'
PATH_FILE_EVALUATION_BERTOPIC = './results/evaluation_bertopic.csv'
# Path prefixes for saved models; the variant name is appended directly,
# so the trailing slash is required.
PATH_PREFIX_MODEL_LDA = './results/models_lda/'
PATH_PREFIX_MODEL_BERTOPIC = './results/models_bertopic/'

In [2]:
import pandas as pd

# `ds` is indexed by variant name: load_eval() reads its documents from
# ds[variant], and the evaluation loops iterate over ds.columns.
ds = pd.read_parquet(PATH_FILE_PREPROCESSED)
# Existing timing table; presumably produced by earlier pipeline steps
# (TODO confirm). The evaluation timings are added to it as new columns below.
elapse_time = pd.read_csv(PATH_FILE_ELAPSE_TIME)

In [3]:
from time import time

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from bertopic import BERTopic

from utils import get_coherence, get_diversity, get_topics_lda, get_topics_bertopic


def load_eval(variant, bertopic=False):
    """Load the saved topic model for ``variant`` and score its topics.

    Parameters
    ----------
    variant : str
        Column name in ``ds``; also the file name appended to the model
        path prefix to locate the saved model.
    bertopic : bool, default False
        If True, evaluate the BERTopic model for this variant; otherwise
        evaluate the LDA model.

    Returns
    -------
    tuple[dict, float]
        ``(scores, elapsed_seconds)`` where ``scores`` has the keys
        ``'variant'``, ``'coherence'``, ``'diversity'`` and ``'score'``
        (coherence * diversity). The elapsed time covers model loading as
        well as the metric computation.
    """
    t_start = time()
    docs = ds[variant].dropna()
    if bertopic:
        model = BERTopic.load(f'{PATH_PREFIX_MODEL_BERTOPIC}{variant}')
        topics = get_topics_bertopic(model)
        analyzer = model.vectorizer_model.build_analyzer()
        # Decide on the *parameter*, not on a notebook-global loop variable:
        # the original tested `v`, which raises NameError on a fresh kernel
        # and silently picks the wrong branch when `v` is stale.
        # Variants whose name contains 'B' are passed to the analyzer as-is
        # (presumably already plain strings -- TODO confirm); every other
        # variant is joined with spaces first.
        if 'B' in variant:
            raw_docs = docs
        else:
            raw_docs = [' '.join(doc) for doc in docs]
        # Re-tokenise with the model's own analyzer and build the dictionary
        # from those tokens, so coherence is computed on that tokenisation.
        texts = [analyzer(doc) for doc in raw_docs]
        dictionary = Dictionary(texts)
    else:
        model = LdaMulticore.load(f'{PATH_PREFIX_MODEL_LDA}{variant}')
        topics = get_topics_lda(model, model.id2word)
        # The LDA documents are used as stored, with the model's own dictionary.
        texts = docs
        dictionary = model.id2word
    c = get_coherence(
        topics=topics,
        texts=texts,
        dictionary=dictionary
    )
    d = get_diversity(topics)
    total_time = time() - t_start
    return ({
        'variant': variant,
        'coherence': c,
        'diversity': d,
        'score': c*d
    }, total_time)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [10]:
from tqdm import tqdm

# Score the saved LDA model of every variant and record how long each took.
lda_eval = []
elapse_time_lda = []

tds = tqdm(ds.columns)
for v in tds:
    tds.set_description(f'Evaluating LDA {v}')
    if 'B' not in v:
        lda_score, lda_time = load_eval(v)
        lda_eval.append(lda_score)
        elapse_time_lda.append(lda_time)
    else:
        # Variants containing 'B' are not evaluated with LDA; a 0 placeholder
        # keeps the timing list the same length as ds.columns.
        elapse_time_lda.append(0)

lda_eval = pd.DataFrame(lda_eval)
# NOTE(review): the Series is aligned on index labels 0..n-1, so this assumes
# elapse_time has a default integer index whose row order matches ds.columns.
elapse_time['lda_evaluation'] = pd.Series(elapse_time_lda)

Evaluating LDA dC:   0%|          | 0/110 [00:00<?, ?it/s]

Evaluating LDA DCSWP: 100%|██████████| 110/110 [02:41<00:00,  1.47s/it]


In [11]:
from tqdm import tqdm

# Score the saved BERTopic model of every variant and record how long each took.
bertopic_eval = []
elapse_time_bertopic = []

tds = tqdm(ds.columns)
for v in tds:
    tds.set_description(f'Evaluating BERTopic {v}')
    if 'T' not in v:
        bertopic_score, bertopic_time = load_eval(v, bertopic=True)
        bertopic_eval.append(bertopic_score)
        elapse_time_bertopic.append(bertopic_time)
    else:
        # Variants containing 'T' are not evaluated with BERTopic; a 0
        # placeholder keeps the timing list the same length as ds.columns.
        elapse_time_bertopic.append(0)

bertopic_eval = pd.DataFrame(bertopic_eval)
# NOTE(review): the Series is aligned on index labels 0..n-1, so this assumes
# elapse_time has a default integer index whose row order matches ds.columns.
elapse_time['bertopic_evaluation'] = pd.Series(elapse_time_bertopic)

Evaluating BERTopic DCSWP: 100%|██████████| 110/110 [02:51<00:00,  1.56s/it]


In [12]:
# Show the LDA evaluation table (one row per evaluated variant).
lda_eval

Unnamed: 0,variant,coherence,diversity,score
0,dC,0.430169,0.115556,0.049708
1,DC,0.334811,0.021795,0.007297
2,dG,0.443337,0.016327,0.007238
3,DG,0.356225,0.148000,0.052721
4,dL,0.447723,0.146429,0.065559
...,...,...,...,...
103,DCSNP,0.401387,0.542254,0.217653
104,dCSWG,0.365443,0.038710,0.014146
105,DCSWG,0.427185,0.439773,0.187864
106,dCSWP,0.298078,0.045098,0.013443


In [13]:
# Show the BERTopic evaluation table (one row per evaluated variant).
bertopic_eval

Unnamed: 0,variant,coherence,diversity,score
0,dB,0.599876,0.919718,0.551717
1,DB,0.371063,0.666667,0.247376
2,dC,0.609708,0.919178,0.560430
3,DC,0.344039,0.633333,0.217891
4,dG,0.616213,0.901408,0.555460
...,...,...,...,...
103,DCSNP,0.599114,1.000000,0.599114
104,dCSWG,0.641822,0.929268,0.596425
105,DCSWG,0.598728,0.950000,0.568792
106,dCSWP,0.588604,0.954118,0.561597


In [14]:
# Show the timing table, now including the two evaluation columns added above.
elapse_time

Unnamed: 0,variant,nlp,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dB,212.552485,0.001777,0.000000,18.465299,0.000000,3.152477
1,DB,212.552485,0.001778,0.000000,3.485061,0.000000,0.908185
2,dC,212.552485,0.043353,8.620242,8.633718,1.217710,2.765370
3,DC,212.552485,0.043354,6.784524,4.135852,1.887860,0.761577
4,dG,212.552485,0.035941,6.479580,9.766098,1.576258,2.770796
...,...,...,...,...,...,...,...
105,DCSNP,212.552485,0.759058,7.060609,4.202968,2.239136,0.263625
106,dCSWG,212.552485,0.640932,5.686913,8.407741,1.241497,2.359222
107,DCSWG,212.552485,0.640937,12.209638,4.086326,4.199260,0.346498
108,dCSWP,212.552485,0.627230,4.129492,8.897246,1.137989,2.732410


In [15]:
# Persist the results. Note that the timing table overwrites the same file it
# was loaded from, now with the evaluation columns included.
elapse_time.to_csv(PATH_FILE_ELAPSE_TIME, index=False)
# index=False keeps the row index out of the CSV files.
lda_eval.to_csv(PATH_FILE_EVALUATION_LDA, index=False)
bertopic_eval.to_csv(PATH_FILE_EVALUATION_BERTOPIC, index=False)