In [1]:
import pandas as pd

# `ds` holds one column per dataset variant (its column names are used as
# variant keys below); `elapse_time` is the timing table that the evaluation
# cells extend with new columns.
ds = pd.read_parquet('./results/ds.parquet')
elapse_time = pd.read_csv('./results/elapse_time.csv')

In [None]:
from bertopic import BERTopic
from gensim.models.ldamulticore import LdaMulticore
from tqdm import tqdm


def load_ldas():
    """Load the persisted LDA model for each applicable variant.

    Walks the column names of the global ``ds``; every name that does not
    contain ``'B'`` is loaded from ``./results/models_lda/<name>``.

    Returns:
        dict mapping variant name to the loaded ``LdaMulticore`` model.
    """
    loaded = {}
    progress = tqdm(ds.columns)
    for variant in progress:
        progress.set_description(f'Loading LDA {variant}')
        # Names containing 'B' have no LDA model on disk to load here.
        if 'B' not in variant:
            loaded[variant] = LdaMulticore.load(f'./results/models_lda/{variant}')
    return loaded

def load_bertopics():
    """Load the persisted BERTopic model for each applicable variant.

    Walks the column names of the global ``ds``; every name that does not
    contain ``'T'`` is loaded from ``./results/models_bertopic/<name>``.

    Returns:
        dict mapping variant name to the loaded ``BERTopic`` model.
    """
    loaded = {}
    progress = tqdm(ds.columns)
    for variant in progress:
        progress.set_description(f'Loading BERTopic {variant}')
        # Names containing 'T' are skipped by this loader.
        if 'T' not in variant:
            loaded[variant] = BERTopic.load('./results/models_bertopic/' + variant)
    return loaded

In [3]:
# Load every saved model once up front; both dicts are keyed by variant name
# and are passed to evaluate_models in later cells.
ldas = load_ldas()
bertopics = load_bertopics()

Loading LDA DSFN: 100%|██████████| 4/4 [00:00<00:00, 29.85it/s]
Loading BERTopic DSFN: 100%|██████████| 4/4 [00:10<00:00,  2.74s/it]


In [11]:
from time import time
from gensim.corpora.dictionary import Dictionary
from utils import get_topics_lda, get_topics_bertopic, get_coherence, get_diversity

def evaluate_models(models, lda=False):
    """Score each applicable variant's topic model on coherence and diversity.

    Iterates over the column names of the global ``ds``. A variant is skipped
    when its name contains ``'B'`` (LDA mode) or ``'T'`` (BERTopic mode); a
    skipped variant still contributes an elapsed time of ``0`` so the timing
    list has exactly one entry per column of ``ds``.

    Side effect:
        Stores the per-variant wall-clock times in the global ``elapse_time``
        frame under the column ``'lda_evaluation'`` or
        ``'bertopic_evaluation'``. The values are assigned as a ``pd.Series``
        with a default integer index, so this assumes ``elapse_time`` has a
        default 0..n-1 index whose rows follow ``ds.columns`` order
        (NOTE(review): confirm against how elapse_time.csv is produced).

    Args:
        models: Mapping from variant name to a fitted model. LDA models must
            expose ``id2word``; BERTopic models must expose
            ``vectorizer_model``.
        lda: ``True`` to evaluate ``models`` as LDA models, ``False``
            (default) to evaluate them as BERTopic models.

    Returns:
        pd.DataFrame with one row per evaluated variant and the columns
        ``variant``, ``coherence``, ``diversity`` and ``score``
        (``coherence * diversity``).
    """
    algorithm = 'lda' if lda else 'bertopic'
    label = 'LDA' if lda else 'BERTopic'
    skip_marker = 'B' if lda else 'T'

    rows = []
    timings = []
    for v in (tds := tqdm(ds.columns)):
        tds.set_description(f'Evaluating {label} {v}')
        if skip_marker in v:
            # Keep one timing entry per column even for skipped variants.
            timings.append(0)
            continue

        t_start = time()
        docs = ds[v].dropna()
        model = models[v]
        if lda:
            dictionary = model.id2word
            topics = get_topics_lda(model, dictionary)
            texts = docs
        else:
            # Use the model that was passed in. The previous code read the
            # notebook-global `bertopics` here, which tied the function to
            # that global and could mix topics from one object with the
            # analyzer of another.
            topics = get_topics_bertopic(model)
            analyzer = model.vectorizer_model.build_analyzer()
            # Variants without 'B' in their name are joined into strings
            # before analysis (presumably they hold token lists -- confirm).
            raw_texts = docs if 'B' in v else [' '.join(doc) for doc in docs]
            texts = [analyzer(doc) for doc in raw_texts]
            dictionary = Dictionary(texts)

        c = get_coherence(
            topics=topics,
            texts=texts,
            dictionary=dictionary
        )
        d = get_diversity(topics)
        timings.append(time() - t_start)
        rows.append({
            'variant': v,
            'coherence': c,
            'diversity': d,
            'score': c * d
        })

    elapse_time[f'{algorithm}_evaluation'] = pd.Series(timings)
    return pd.DataFrame(rows)

In [5]:
import pandas as pd

# evaluate_models already returns a DataFrame, so no extra pd.DataFrame(...)
# wrap is needed. The call also adds the 'lda_evaluation' column to elapse_time.
evaluation_lda = evaluate_models(ldas, lda=True)

Evaluating LDA DSFN: 100%|██████████| 4/4 [00:25<00:00,  6.46s/it]


In [6]:
# evaluate_models already returns a DataFrame, so no extra pd.DataFrame(...)
# wrap is needed. The call also adds the 'bertopic_evaluation' column to
# elapse_time.
evaluation_bertopic = evaluate_models(bertopics)

Evaluating BERTopic DSFN: 100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


In [7]:
# Show the timing table, including the evaluation columns written by
# evaluate_models.
elapse_time

Unnamed: 0,variant,nlp,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dLWN,224.184062,7.460976,31.896072,259.104044,5.766991,3.903425
1,DLWN,224.184062,8.284982,32.851871,59.923101,6.319564,4.111557
2,dSFN,224.184062,1.6997,35.446574,178.02453,6.702214,4.079586
3,DSFN,224.184062,2.410262,31.569406,59.789309,7.053438,3.986603


In [8]:
# Show per-variant coherence, diversity and score (coherence * diversity) for LDA.
evaluation_lda

Unnamed: 0,variant,coherence,diversity,score
0,dLWN,0.313952,0.371831,0.116737
1,DLWN,0.321021,0.371831,0.119366
2,dSFN,0.315888,0.39375,0.124381
3,DSFN,0.31996,0.39375,0.125984


In [9]:
# Show per-variant coherence, diversity and score (coherence * diversity) for BERTopic.
evaluation_bertopic

Unnamed: 0,variant,coherence,diversity,score
0,dLWN,0.548878,0.966667,0.530582
1,DLWN,0.623632,0.966667,0.602845
2,dSFN,0.4523,0.9,0.40707
3,DSFN,0.58603,0.9,0.527427


In [10]:
# Persist the results. index=False keeps the row index out of the CSV files.
# Note: elapse_time.csv is the same file read at the top of the notebook, so
# it is overwritten with the added evaluation columns.
elapse_time.to_csv('./results/elapse_time.csv', index=False)
evaluation_lda.to_csv('./results/evaluation_lda.csv', index=False)
evaluation_bertopic.to_csv('./results/evaluation_bertopic.csv', index=False)