In [1]:
import pandas as pd

# Frames produced by earlier pipeline stages and read back from ./results.
# HAS_p: iterated column-by-column in evaluate_models (one column per variant).
# HAS_t: timing table; evaluate_models adds a '<algorithm>_evaluation' column to it.
HAS_p = pd.read_parquet('./results/HAS_p.parquet')
HAS_t = pd.read_parquet('./results/HAS_t.parquet')

In [2]:
from bertopic import BERTopic
from gensim.models.ldamulticore import LdaMulticore
from utils import e_variant

def load_ldas():
    """Load the persisted LDA model of every variant.

    Returns:
        dict: maps each value yielded by ``e_variant()`` to the
        ``LdaMulticore`` instance loaded from
        ``./results/models_lda/<variant>``.
    """
    return {
        variant: LdaMulticore.load(f'./results/models_lda/{variant}')
        for variant in e_variant()
    }

def load_bertopics():
    """Load the persisted BERTopic model of every variant.

    Returns:
        dict: maps each value yielded by ``e_variant()`` to the ``BERTopic``
        instance loaded from ``./results/models_bertopic/<variant>``.
    """
    path_prefix = './results/models_bertopic/'
    # The previous version first stored an empty dict under each key and then
    # immediately overwrote it with the loaded model; that dead store is gone.
    return {v: BERTopic.load(f'{path_prefix}{v}') for v in e_variant()}

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [3]:
# Load every saved model once; both dicts are keyed by variant and are passed
# to evaluate_models in the cells below.
ldas = load_ldas()
bertopics = load_bertopics()

In [4]:
from time import time
from gensim.corpora.dictionary import Dictionary
from utils import get_topics_lda, get_topics_bertopic, get_coherence, get_diversity

def evaluate_models(models, lda=False):
    """Score one topic model per variant and record how long scoring took.

    For every column ``v`` of the module-level ``HAS_p`` frame, the topics of
    ``models[v]`` are extracted and passed to ``get_coherence`` and
    ``get_diversity``; ``score`` is the product of the two values.

    Side effect:
        Adds (or overwrites) the column ``'lda_evaluation'`` or
        ``'bertopic_evaluation'`` on the module-level ``HAS_t`` frame with the
        wall-clock seconds spent evaluating each variant.

    Args:
        models: mapping from variant name to a fitted model. Models must
            expose ``id2word`` when ``lda`` is true, and
            ``vectorizer_model.build_analyzer()`` otherwise.
        lda: True to treat ``models`` as LDA models, False (default) to treat
            them as BERTopic models.

    Returns:
        pandas.DataFrame with one row per variant and the columns
        ``variant``, ``coherence``, ``diversity`` and ``score``.

    Raises:
        KeyError: if ``models`` has no entry for a column of ``HAS_p``.
    """
    r = []
    r_time = []
    for v in HAS_p:
        t_start = time()
        model = models[v]
        docs = HAS_p[v].dropna()
        if lda:
            dictionary = model.id2word
            topics = get_topics_lda(model, dictionary)
            texts = docs
        else:
            # Use the model that was passed in. The earlier code read the
            # global `bertopics` here and silently ignored `models`.
            topics = get_topics_bertopic(model)
            analyzer = model.vectorizer_model.build_analyzer()
            # Each document is joined into one string and re-tokenised with
            # the model's own analyzer so texts and topics share a vocabulary.
            # Assumes every doc is an iterable of string tokens — TODO confirm.
            texts = [analyzer(' '.join(doc)) for doc in docs]
            dictionary = Dictionary(texts)
        c = get_coherence(
            topics=topics,
            texts=texts,
            dictionary=dictionary,
        )
        d = get_diversity(topics)
        r_time.append(time() - t_start)
        r.append({
            'variant': v,
            'coherence': c,
            'diversity': d,
            'score': c * d
        })
    algorithm = 'lda' if lda else 'bertopic'
    column = f'{algorithm}_evaluation'
    if 'variant' in HAS_t.columns:
        # Attach timings by variant name so a different row order in HAS_t
        # cannot assign a duration to the wrong variant.
        seconds_by_variant = {row['variant']: t for row, t in zip(r, r_time)}
        HAS_t[column] = HAS_t['variant'].map(seconds_by_variant)
    else:
        # No variant column to join on: keep the original positional
        # (index-aligned) assignment.
        HAS_t[column] = pd.Series(r_time)
    return pd.DataFrame(r)

In [5]:
import pandas as pd

# evaluate_models already returns a DataFrame, so it is used as-is instead of
# being wrapped in a second pd.DataFrame(...) call.
evaluation_lda = evaluate_models(ldas, lda=True)

In [6]:
# evaluate_models already returns a DataFrame, so it is used as-is instead of
# being wrapped in a second pd.DataFrame(...) call.
evaluation_bertopic = evaluate_models(bertopics)

In [7]:
# Timing table after both evaluation runs; the *_evaluation columns hold the
# elapsed seconds measured inside evaluate_models.
# NOTE(review): in the saved output, `segmentation` is ~1.69e9 for AWN and
# ACLWN, which looks like a raw epoch timestamp rather than an elapsed
# duration — verify the upstream code that produced that column.
HAS_t

Unnamed: 0,variant,segmentation,preprocess,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,HWN,0.0,1.334094,14.32213,15.278703,4.064417,1.736998
1,HCLWN,0.0,1.333092,12.726939,5.869205,2.12673,1.806417
2,AWN,1692513000.0,33.333024,27.309271,9.887508,4.943865,3.837187
3,ACLWN,1692513000.0,33.355028,22.753088,10.352225,4.756558,3.669254
4,SWN,26.37917,31.43803,25.11792,36.455737,6.476206,4.51146
5,SCLWN,26.37917,31.439039,26.927279,32.640802,6.026889,4.078358


In [8]:
# Per-variant LDA results: coherence, diversity and score (= coherence * diversity).
evaluation_lda

Unnamed: 0,variant,coherence,diversity,score
0,HWN,0.528739,0.431667,0.228239
1,HCLWN,0.407186,0.94,0.382755
2,AWN,0.460583,0.455738,0.209905
3,ACLWN,0.447095,0.430508,0.192478
4,SWN,0.374761,0.668421,0.250498
5,SCLWN,0.434822,0.647368,0.28149


In [9]:
# Per-variant BERTopic results: coherence, diversity and score (= coherence * diversity).
evaluation_bertopic

Unnamed: 0,variant,coherence,diversity,score
0,HWN,0.510371,1.0,0.510371
1,HCLWN,0.532636,1.0,0.532636
2,AWN,0.292424,1.0,0.292424
3,ACLWN,0.342356,1.0,0.342356
4,SWN,0.49123,0.974286,0.478598
5,SCLWN,0.496585,0.948148,0.470837
