In [2]:
import pickle
import pandas as pd

# HAS_e is iterated by variant further down and indexed with the keys
# 'id2word', 'T' and 'text'.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this project's own pipeline.
with open('./results/HAS_e.pickle', 'rb') as pickle_file:
    HAS_e = pickle.load(pickle_file)

# HAS_t is the per-variant table that evaluate_models later extends with
# `<algorithm>_evaluation` timing columns.
HAS_t = pd.read_parquet('./results/HAS_t.parquet')

In [3]:
from bertopic import BERTopic
from gensim.models.ldamulticore import LdaMulticore
from utils import e_variant

def load_ldas():
    """Load the saved LDA model of every variant.

    Returns:
        dict mapping each value yielded by ``e_variant()`` to the model
        returned by ``LdaMulticore.load`` for
        ``./results/models_lda/<variant>``.
    """
    return {
        variant: LdaMulticore.load(f'./results/models_lda/{variant}')
        for variant in e_variant()
    }

def load_bertopics():
    """Load the saved BERTopic model of every variant.

    Returns:
        dict mapping each value yielded by ``e_variant()`` to the model
        returned by ``BERTopic.load`` for
        ``./results/models_bertopic/<variant>``.
    """
    path_prefix = './results/models_bertopic/'
    # One assignment per variant: the loaded model itself is the value.
    return {v: BERTopic.load(f'{path_prefix}{v}') for v in e_variant()}

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [4]:
# Load every saved model once up front; both dicts are keyed by variant.
ldas = load_ldas()
bertopics = load_bertopics()

In [11]:
from time import time
from gensim.corpora.dictionary import Dictionary
from utils import get_topics_lda, get_topics_bertopic, get_coherence, get_diversity

def evaluate_models(models, lda=False):
    """Score one topic model per variant on coherence and diversity.

    For every variant key in the global ``HAS_e``, the topics of
    ``models[variant]`` are extracted and passed to ``get_coherence`` and
    ``get_diversity``; ``score`` is the product of the two.

    Args:
        models: Mapping from variant to a fitted model. These are treated as
            LDA models when ``lda`` is true and as BERTopic models otherwise.
        lda: Selects the LDA code path (and the ``lda_evaluation`` timing
            column) instead of the BERTopic one.

    Returns:
        pandas.DataFrame with one row per variant and the columns
        ``variant``, ``coherence``, ``diversity`` and ``score``.

    Side effects:
        Stores the elapsed evaluation time per variant (differences of
        ``time()`` calls) in the global ``HAS_t`` as column
        ``lda_evaluation`` or ``bertopic_evaluation``.
    """
    rows = []
    elapsed = []
    for v in HAS_e:
        t_start = time()
        model = models[v]
        if lda:
            topics = get_topics_lda(model, HAS_e[v]['id2word'])
            texts = HAS_e[v]['T']
            dictionary = model.id2word
        else:
            # Use the model that was passed in, not the global `bertopics`,
            # so topics and analyzer always come from the same model.
            topics = get_topics_bertopic(model)
            analyzer = model.vectorizer_model.build_analyzer()
            texts = [analyzer(doc) for doc in HAS_e[v]['text']]
            dictionary = Dictionary(texts)
        c = get_coherence(
            topics=topics,
            texts=texts,
            dictionary=dictionary,
        )
        d = get_diversity(topics)
        elapsed.append(time() - t_start)
        rows.append({
            'variant': v,
            'coherence': c,
            'diversity': d,
            'score': c * d
        })
    algorithm = 'lda' if lda else 'bertopic'
    # The Series gets a default 0..n-1 index and is aligned on HAS_t's index.
    # NOTE(review): assumes HAS_t rows follow HAS_e's iteration order and use
    # a default integer index -- TODO confirm.
    HAS_t[f'{algorithm}_evaluation'] = pd.Series(elapsed)
    return pd.DataFrame(rows)

In [12]:
import pandas as pd

# evaluate_models already returns a DataFrame, so it is used directly.
evaluation_lda = evaluate_models(ldas, lda=True)

In [13]:
# evaluate_models already returns a DataFrame, so it is used directly.
evaluation_bertopic = evaluate_models(bertopics)

In [14]:
# Show the per-variant table, which now includes the lda_evaluation and
# bertopic_evaluation columns written by evaluate_models.
# NOTE(review): in the saved output, `segmentation` is ~1.69e9 for the A*
# variants, which looks like a raw epoch timestamp rather than a duration --
# verify the upstream timing code.
HAS_t

Unnamed: 0,variant,segmentation,preprocess,lda_embedding,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,HT,0.005998611,422.800828,0.006001,23.647413,21.27378,3.135896,1.658663
1,HCL,0.005998611,422.799833,0.003976,22.379721,9.136819,2.581465,1.736873
2,HCLW,0.005998611,422.800833,0.004001,25.818747,9.151057,8.28134,1.604785
3,HCLWN,0.005998611,422.812833,0.003005,22.219262,8.910228,2.929007,1.561921
4,AT,1692424000.0,422.819834,0.091998,33.566865,20.32113,6.854658,3.908763
5,ACL,1692424000.0,422.831834,0.081002,33.911662,20.947009,5.51877,3.916295
6,ACLW,1692424000.0,422.838855,0.039994,28.696265,19.44947,5.15884,3.519449
7,ACLWN,1692424000.0,423.086812,0.048004,32.202545,18.919627,5.18817,3.520646
8,ST,42.04551,422.824833,0.110003,36.66292,94.256775,3.911486,4.610502
9,SCL,42.04551,422.838843,0.107988,37.488737,90.736179,4.243901,4.380032


In [15]:
# LDA results per variant: coherence, diversity and score (coherence * diversity).
evaluation_lda

Unnamed: 0,variant,coherence,diversity,score
0,HT,0.425508,0.690909,0.293987
1,HCL,0.444845,0.726923,0.323368
2,HCLW,0.469844,0.513793,0.241403
3,HCLWN,0.57447,0.382927,0.21998
4,AT,0.357058,0.130208,0.046492
5,ACL,0.344275,0.137037,0.047178
6,ACLW,0.358567,0.257692,0.0924
7,ACLWN,0.362153,0.319737,0.115794
8,ST,0.429257,0.117241,0.050327
9,SCL,0.422644,0.103571,0.043774


In [16]:
# BERTopic results per variant: coherence, diversity and score (coherence * diversity).
evaluation_bertopic

Unnamed: 0,variant,coherence,diversity,score
0,HT,0.478207,0.9,0.430386
1,HCL,0.472179,0.95,0.44857
2,HCLW,0.463233,1.0,0.463233
3,HCLWN,0.344389,1.0,0.344389
4,AT,0.375608,0.7,0.262926
5,ACL,0.37704,0.7,0.263928
6,ACLW,0.476604,0.9,0.428943
7,ACLWN,0.567167,0.9,0.51045
8,ST,0.638747,0.916,0.585092
9,SCL,0.588247,0.894,0.525893
