In [1]:
# Locations of every artifact this notebook reads or writes (all under ./results).
# Preprocessed dataset; each column is looked up by variant name during evaluation.
PATH_FILE_PREPROCESSED = './results/ds.parquet'
# Timing table: loaded below, extended with evaluation timings, then written back.
PATH_FILE_ELAPSE_TIME = './results/elapse_time.csv'
# Output CSVs for the per-variant evaluation metrics of each model family.
PATH_FILE_EVALUATION_LDA = './results/evaluation_lda.csv'
PATH_FILE_EVALUATION_BERTOPIC = './results/evaluation_bertopic.csv'
# Path prefixes of the saved models; the variant name is appended to form the full path.
PATH_PREFIX_MODEL_LDA = './results/models_lda/'
PATH_PREFIX_MODEL_BERTOPIC = './results/models_bertopic/'

In [2]:
import pandas as pd

# Preprocessed documents; the evaluation loops iterate over ds.columns as variant names.
ds = pd.read_parquet(PATH_FILE_PREPROCESSED)
# Existing timing table; evaluation timings are added to it as new columns further down.
elapse_time = pd.read_csv(PATH_FILE_ELAPSE_TIME)

In [3]:
from time import time

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from bertopic import BERTopic

from utils import get_coherence, get_diversity, get_topics_lda, get_topics_bertopic


def load_eval(variant, bertopic=False):
    """Load the saved model for ``variant`` and score its topics.

    Args:
        variant: Column name in ``ds``; also appended to the model path prefix
            to locate the saved model.
        bertopic: When true, load and evaluate the BERTopic model; otherwise
            the LDA model.

    Returns:
        A ``(metrics, seconds)`` tuple. ``metrics`` is a dict with the keys
        ``variant``, ``coherence``, ``diversity`` and ``score`` (coherence
        multiplied by diversity). ``seconds`` is the wall-clock time spent in
        this call, model loading included.
    """
    started_at = time()
    column = ds[variant].dropna()

    if not bertopic:
        # LDA: the model's own id2word serves as the dictionary, and the
        # documents are passed to the coherence computation unchanged.
        lda = LdaMulticore.load(f'{PATH_PREFIX_MODEL_LDA}{variant}')
        vocabulary = lda.id2word
        topic_words = get_topics_lda(lda, vocabulary)
        corpus = column
    else:
        topic_model = BERTopic.load(f'{PATH_PREFIX_MODEL_BERTOPIC}{variant}')
        topic_words = get_topics_bertopic(topic_model)
        tokenize = topic_model.vectorizer_model.build_analyzer()
        if 'B' in variant:
            # Used as-is; presumably these documents are already plain
            # strings -- TODO confirm against the preprocessing notebook.
            raw_docs = column
        else:
            # Each document is joined with spaces into one string before
            # being run through the model's analyzer.
            raw_docs = [' '.join(doc) for doc in column]
        corpus = [tokenize(doc) for doc in raw_docs]
        vocabulary = Dictionary(corpus)

    coherence = get_coherence(
        topics=topic_words,
        texts=corpus,
        dictionary=vocabulary
    )
    diversity = get_diversity(topic_words)
    elapsed = time() - started_at

    metrics = {
        'variant': variant,
        'coherence': coherence,
        'diversity': diversity,
        'score': coherence * diversity
    }
    return metrics, elapsed

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [19]:
from tqdm import tqdm

lda_eval = []
elapse_time_lda = []

# Evaluate the saved LDA model of every dataset variant.
# Variants whose name contains 'B' or 'G' are skipped (presumably no LDA model
# was trained for them -- TODO confirm) and recorded as an all-zero placeholder
# so that every variant still has a row in the results.
for v in (tds := tqdm(ds.columns)):
    tds.set_description(f'Evaluating LDA {v}')
    if 'B' in v or 'G' in v:
        lda_score, lda_time = {
            'variant': v,
            'coherence': 0,
            'diversity': 0,
            'score': 0
        }, 0
    else:
        lda_score, lda_time = load_eval(v)
    lda_eval.append(lda_score)
    elapse_time_lda.append(lda_time)

lda_eval = pd.DataFrame(lda_eval)

# Attach timings by variant name instead of by row position: the timings are
# collected in ds.columns order, and the rows of elapse_time come from a CSV
# whose order/length is not guaranteed to match. Positional assignment would
# silently mislabel, truncate or NaN-pad the timings in that case.
lda_time_by_variant = dict(zip(ds.columns, elapse_time_lda))
elapse_time['lda_evaluation'] = elapse_time['variant'].map(lda_time_by_variant)

Evaluating LDA DT: 100%|██████████| 12/12 [00:04<00:00,  2.40it/s]


In [14]:
from tqdm import tqdm

bertopic_eval = []
elapse_time_bertopic = []

# Evaluate the saved BERTopic model of every dataset variant.
# Variants whose name contains 'T' are skipped (presumably no BERTopic model
# exists for them -- TODO confirm) and recorded as an all-zero placeholder so
# that every variant still has a row in the results.
for v in (tds := tqdm(ds.columns)):
    tds.set_description(f'Evaluating BERTopic {v}')
    if 'T' in v:
        bertopic_score, bertopic_time = {
            'variant': v,
            'coherence': 0,
            'diversity': 0,
            'score': 0
        }, 0
    else:
        bertopic_score, bertopic_time = load_eval(v, bertopic=True)
    bertopic_eval.append(bertopic_score)
    elapse_time_bertopic.append(bertopic_time)

bertopic_eval = pd.DataFrame(bertopic_eval)

# Attach timings by variant name instead of by row position: the timings are
# collected in ds.columns order, and the rows of elapse_time come from a CSV
# whose order/length is not guaranteed to match. Positional assignment would
# silently mislabel, truncate or NaN-pad the timings in that case.
bertopic_time_by_variant = dict(zip(ds.columns, elapse_time_bertopic))
elapse_time['bertopic_evaluation'] = elapse_time['variant'].map(bertopic_time_by_variant)

Evaluating BERTopic DT: 100%|██████████| 12/12 [00:25<00:00,  2.16s/it]


In [20]:
# Show the LDA metrics, one row per variant; skipped variants appear as all-zero rows.
lda_eval

Unnamed: 0,variant,coherence,diversity,score
0,dLG,0.0,0.0,0.0
1,DLG,0.0,0.0,0.0
2,dSP,0.456344,0.081818,0.037337
3,DSP,0.334429,0.040426,0.013519
4,dLN,0.394823,0.053488,0.021118
5,DLN,0.339342,0.371605,0.126101
6,dLW,0.478372,0.04,0.019135
7,DLW,0.403932,0.24507,0.098992
8,dB,0.0,0.0,0.0
9,DB,0.0,0.0,0.0


In [16]:
# Show the BERTopic metrics, one row per variant; skipped variants appear as all-zero rows.
bertopic_eval

Unnamed: 0,variant,coherence,diversity,score
0,dLG,0.656973,0.882143,0.579544
1,DLG,0.742686,0.611111,0.453864
2,dSP,0.655239,0.653333,0.428089
3,DSP,0.373837,0.65,0.242994
4,dLN,0.507532,0.8312,0.42186
5,DLN,0.626244,0.955556,0.598411
6,dLW,0.760707,0.515789,0.392365
7,DLW,0.619123,0.875,0.541733
8,dB,0.569956,0.863866,0.492365
9,DB,0.365551,0.7,0.255886


In [17]:
# Show the timing table, including the lda_evaluation / bertopic_evaluation columns set above.
elapse_time

Unnamed: 0,variant,tokenizing,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dLG,2.061464,0.076667,0.0,10.91504,0.0,6.067646
1,DLG,2.061464,0.076667,0.0,3.236732,0.0,4.375084
2,dSP,2.061464,2.339595,1.55544,5.950782,0.49657,1.409607
3,DSP,2.061464,2.743002,6.910429,1.89604,0.590538,1.423903
4,dLN,2.061464,0.080067,1.499174,5.1091,0.53597,2.693821
5,DLN,2.061464,0.080067,9.134171,1.716769,0.839682,1.345748
6,dLW,2.061464,0.155874,1.580441,5.535383,0.528876,1.431065
7,DLW,2.061464,0.155875,6.300422,1.939281,0.759805,1.355839
8,dB,2.061464,0.006221,0.0,5.962377,0.0,2.803576
9,DB,2.061464,0.006221,0.0,1.91178,0.0,1.393406


In [18]:
# Write all three result tables to disk without the index column.
# Note that the timing table overwrites the same CSV it was loaded from.
for result_frame, csv_path in (
    (elapse_time, PATH_FILE_ELAPSE_TIME),
    (lda_eval, PATH_FILE_EVALUATION_LDA),
    (bertopic_eval, PATH_FILE_EVALUATION_BERTOPIC),
):
    result_frame.to_csv(csv_path, index=False)