In [23]:
import pickle

# Load the three pickled embedding containers (e_H, e_A, e_S) from
# ./datasets/small/embeddings/. Later cells look variants up in them with
# `variant in e_X` and then index `e_X[variant]`, so each is expected to be a
# mapping keyed by variant name — presumably one container per dataset
# (H, A, S); TODO confirm against the notebook that wrote these files.
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself.
with open('./datasets/small/embeddings/e_H.pickle', 'rb') as f:
    e_H = pickle.load(f)
with open('./datasets/small/embeddings/e_A.pickle', 'rb') as f:
    e_A = pickle.load(f)
with open('./datasets/small/embeddings/e_S.pickle', 'rb') as f:
    e_S = pickle.load(f)

In [24]:
from bertopic import BERTopic
import pandas as pd

def load_ldas(variant):
    """Unpickle the stored LDA result for every requested variant.

    Args:
        variant: iterable of variant names; each name ``v`` is read from
            ``./datasets/small/models_lda/{v}.pickle``.

    Returns:
        dict mapping each variant name to whatever object its pickle file
        contained, in the order the names were given.
    """
    loaded = {}
    for name in variant:
        pickle_path = f'./datasets/small/models_lda/{name}.pickle'
        with open(pickle_path, 'rb') as handle:
            # NOTE: pickle.load can execute arbitrary code; only use it on
            # files produced by this project.
            loaded[name] = pickle.load(handle)
    return loaded

def load_bertopics(variant):
    """Load the saved BERTopic model and its recorded time for each variant.

    Args:
        variant: iterable of variant names; each name ``v`` refers to the
            directory ``./datasets/small/models_bertopic/{v}``.

    Returns:
        dict mapping each variant name to a dict with two entries:
        ``'model'`` (the result of ``BERTopic.load`` on that directory) and
        ``'time'`` (the first value of the ``time`` column of the
        ``time.csv`` file inside that directory).
    """
    path_prefix = './datasets/small/models_bertopic/'
    loaded = {}
    for name in variant:
        model_location = f'{path_prefix}{name}'
        topic_model = BERTopic.load(model_location)
        # time.csv holds a single 'time' column; only its first row is used.
        timings = pd.read_csv(f'{path_prefix}{name}/time.csv')
        loaded[name] = {
            'model': topic_model,
            'time': timings['time'][0],
        }
    return loaded

In [25]:
def e_variant(dataset=('H', 'A'), variant=('T', 'CLW')):
    """Build the list of variant names used to load and evaluate models.

    Every name is a dataset prefix immediately followed by a variant suffix
    (for example ``'H'`` + ``'CLW'`` gives ``'HCLW'``). Names are produced in
    dataset-major order: all suffixes for the first dataset, then all
    suffixes for the second, and so on.

    Args:
        dataset: iterable of dataset prefixes. Defaults to ``('H', 'A')``;
            the full set noted for this notebook is ``'H'``, ``'A'``, ``'S'``.
        variant: iterable of variant suffixes. Defaults to ``('T', 'CLW')``;
            the full set noted for this notebook is ``'T'``, ``'C'``,
            ``'CL'``, ``'CLW'``, ``'CW'``, ``'L'``, ``'LW'``, ``'W'``.

    Returns:
        list of str with one entry per (dataset, variant) combination. With
        the defaults this is ``['HT', 'HCLW', 'AT', 'ACLW']``.
    """
    # Materialise the suffixes so a one-shot iterator can be reused for
    # every dataset prefix.
    variant = list(variant)
    return [f'{e}{v}' for e in dataset for v in variant]

# Load the stored LDA and BERTopic results for the same set of variant names
# (e_variant() is called once per loader and yields the same list each time).
ldas = load_ldas(e_variant())
bertopics = load_bertopics(e_variant())

In [26]:
from utils import get_topics_lda, get_topics_bertopic, get_diversity, get_coherence

def find_embeddings(v):
    """Return the embedding container that holds variant ``v``.

    The notebook-level containers ``e_H``, ``e_A`` and ``e_S`` are searched
    with a membership test (``v in container``).

    Args:
        v: variant name to look up (e.g. ``'HT'``).

    Returns:
        The whole container (not ``container[v]``) in which ``v`` was found.

    Raises:
        KeyError: if ``v`` is in none of the three containers. Previously
            this case surfaced as an unhelpful ``UnboundLocalError``.
    """
    # Searched in reverse load order so that, should a name ever occur in
    # more than one container, the later one still wins (e_S over e_A over
    # e_H), exactly as the original chain of independent `if`s behaved.
    for embeddings in (e_S, e_A, e_H):
        if v in embeddings:
            return embeddings
    raise KeyError(f'no embeddings found for variant {v!r} in e_H, e_A or e_S')

def evaluate_ldas(ldas):
    """Compute evaluation records for every LDA entry in ``ldas``.

    Args:
        ldas: mapping of variant name to an entry exposing ``'model'`` and
            ``'time'`` keys.

    Returns:
        list of dicts, one per variant in iteration order, with the keys
        ``'variant'``, ``'training_time'``, ``'coherence'`` and
        ``'diversity'``.
    """
    records = []
    for name in ldas:
        source = find_embeddings(name)
        fitted = ldas[name]
        # Topics are extracted using the id2word dictionary stored alongside
        # this variant's embeddings.
        topic_words = get_topics_lda(fitted['model'], source[name]['id2word'])
        elapsed = fitted['time']
        coherence_score = get_coherence(
            topics=topic_words,
            texts=source[name]['T'],
            dictionary=source[name]['id2word'],
        )
        diversity_score = get_diversity(topic_words)
        records.append({
            'variant': name,
            'training_time': elapsed,
            'coherence': coherence_score,
            'diversity': diversity_score,
        })
    return records

def evaluate_bertopics(bertopics):
    """Compute evaluation records for every BERTopic entry in ``bertopics``.

    Args:
        bertopics: mapping of variant name to an entry exposing ``'model'``
            and ``'time'`` keys.

    Returns:
        list of dicts, one per variant in iteration order, with the keys
        ``'variant'``, ``'training_time'``, ``'coherence'`` and
        ``'diversity'``.
    """
    rows = []
    for key in bertopics:
        container = find_embeddings(key)
        entry = bertopics[key]
        extracted = get_topics_bertopic(entry['model'])
        row = {'variant': key, 'training_time': entry['time']}
        # Coherence is scored against the 'T' texts and id2word dictionary
        # stored for this variant in its embedding container.
        row['coherence'] = get_coherence(
            topics=extracted,
            texts=container[key]['T'],
            dictionary=container[key]['id2word'],
        )
        row['diversity'] = get_diversity(extracted)
        rows.append(row)
    return rows

In [27]:
# Evaluate every loaded model and tabulate the results: each evaluate_*
# call returns a list of per-variant dicts, which becomes one DataFrame row
# per variant (columns: variant, training_time, coherence, diversity).
evaluation_ldas = pd.DataFrame(evaluate_ldas(ldas))
evaluation_bertopics = pd.DataFrame(evaluate_bertopics(bertopics))

HT
HCLW
AT
ACLW
HT
HCLW
AT
ACLW


In [28]:
evaluation_ldas

Unnamed: 0,variant,training_time,coherence,diversity
0,HT,19.499277,0.510733,0.62
1,HCLW,22.788054,0.416104,0.816667
2,AT,27.645021,0.31224,0.109091
3,ACLW,31.318855,0.348385,0.364179


In [29]:
evaluation_bertopics

Unnamed: 0,variant,training_time,coherence,diversity
0,HT,8.07888,0.832488,0.94
1,HCLW,8.611414,0.318059,1.0
2,AT,24.725611,0.338568,0.633333
3,ACLW,26.733876,0.599488,0.95
