In [22]:
import pandas as pd

# Load the document table from the project's datasets folder.
df = pd.read_csv(filepath_or_buffer='./datasets/df.csv')


In [23]:
# Keep a random subset of 100 rows (presumably to keep the runs below fast).
# random_state pins the draw so a fresh Restart & Run All evaluates the same
# rows every time; without it the reported scores are not reproducible.
df = df.sample(n=100, random_state=42)


In [24]:
def get_topics(bertopic):
    """Extract the words of every topic from a topic model.

    Args:
        bertopic: Object exposing ``get_topics()`` that returns a mapping
            whose values are sequences of indexable entries (presumably
            ``(word, score)`` pairs from a fitted BERTopic model).

    Returns:
        dict with a single key ``'topics'`` holding one list per topic, each
        containing the first element of every entry, in mapping order.
    """
    topic_words = []
    for entries in bertopic.get_topics().values():
        # Keep only the leading element of each entry; the rest is dropped.
        topic_words.append([entry[0] for entry in entries])
    return {'topics': topic_words}


In [25]:
import time

from bertopic import BERTopic
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

from utils import tokenize


def evaluate_bertopic(df, column_name):
    """Fit BERTopic on one text column and score the resulting topics.

    Args:
        df: DataFrame holding the documents.
        column_name: Name of the column in ``df`` whose texts are modelled;
            it is also reported as the ``'dataset'`` label of the result.

    Returns:
        dict with the keys ``'dataset'``, ``'elapse_time'`` (seconds spent in
        ``BERTopic().fit``), ``'coherence_score'`` (OCTIS ``Coherence`` with
        ``measure='c_v'``) and ``'diversity_score'`` (OCTIS
        ``TopicDiversity``).
    """
    documents = df[column_name]

    # Time only the model fit, using a monotonic high-resolution clock.
    # Tokenising and building the coherence metric are evaluation overhead;
    # previously they were counted in the elapsed time while scoring was not.
    start = time.perf_counter()
    model = BERTopic().fit(documents)
    elapsed = time.perf_counter() - start

    topics = get_topics(model)
    tc = Coherence(texts=tokenize(documents), measure='c_v')
    td = TopicDiversity()
    return {
        'dataset': column_name,
        'elapse_time': elapsed,
        'coherence_score': tc.score(topics),
        'diversity_score': td.score(topics)
    }

In [26]:
# Run the evaluation once per text column; each call contributes one row.
bertopic_evaluation = pd.DataFrame(
    [
        evaluate_bertopic(df, column_name)
        for column_name in ('short_raw', 'long_raw', 'short', 'long')
    ]
)

In [None]:
# Persist the evaluation table. Create the output directory first so the
# write does not fail on a checkout where ./results does not exist yet.
import os

os.makedirs('./results', exist_ok=True)
bertopic_evaluation.to_csv('./results/bertopic_evaluation.csv', index=False)

In [27]:
# Show the per-dataset timing, coherence and diversity results.
bertopic_evaluation

Unnamed: 0,dataset,elapse_time,coherence_score,diversity_score
0,short_raw,3.979897,0.33946,1.0
1,long_raw,12.219148,0.336528,1.0
2,short,3.69423,0.577694,1.0
3,long,13.095908,0.302297,1.0
