In [1]:
import pandas as pd

# Load the two input frames from ./datasets; both reads happen before either
# name is bound.
df_short, df_long = (
    pd.read_csv('./datasets/df_short.csv'),
    pd.read_csv('./datasets/df_long.csv'),
)

In [2]:
# Optional: uncomment to run the rest of the notebook on a seeded 100-row sample of each frame.
# df_short = df_short.sample(100, random_state=99)
# df_long = df_long.sample(100, random_state=99)

In [3]:
from gensim.corpora.dictionary import Dictionary
from utils import tokenize

def build_dataset(df, column_name):
    """Bundle everything the LDA pipeline needs for one text column.

    Args:
        df: container indexed by ``column_name`` (a DataFrame in this notebook).
        column_name: column whose texts are tokenized; also stored as the
            dataset's ``'name'``.

    Returns:
        dict with the keys ``'name'``, ``'tokenized_texts'`` (whatever
        ``tokenize`` returns), ``'dictionary'`` (a gensim ``Dictionary`` built
        from those tokens) and ``'corpus'`` (one ``doc2bow`` result per
        tokenized text).
    """
    # NOTE(review): the tokens are iterated twice (Dictionary, then doc2bow),
    # so `tokenize` presumably returns a re-iterable sequence — confirm.
    tokens = tokenize(df[column_name])
    vocabulary = Dictionary(tokens)
    bag_of_words = list(map(vocabulary.doc2bow, tokens))
    return {
        'name': column_name,
        'tokenized_texts': tokens,
        'dictionary': vocabulary,
        'corpus': bag_of_words,
    }

def get_topics(lda, dictionary):
    """Translate every topic of a fitted model into a list of words.

    Args:
        lda: model exposing ``num_topics`` and ``get_topic_terms(i)``; each
            returned entry must carry the term id in position 0.
        dictionary: mapping from term id to word.

    Returns:
        ``{'topics': [[word, ...], ...]}`` with one inner list per topic, in
        topic-index order. The number of words per topic is whatever
        ``get_topic_terms`` yields with its default arguments.
    """
    topic_words = [
        [dictionary[entry[0]] for entry in lda.get_topic_terms(topic_index)]
        for topic_index in range(lda.num_topics)
    ]
    return {'topics': topic_words}

In [4]:
# One dataset dict per text column, built in the same order as the names on
# the left-hand side: three columns of df_short, then three of df_long.
dataset_short, dataset_short_no_sw, dataset_short_no_sw_ngram = (
    build_dataset(df_short, column)
    for column in ('short', 'short_no_sw', 'short_no_sw_ngram')
)
dataset_long, dataset_long_no_sw, dataset_long_no_sw_ngram = (
    build_dataset(df_long, column)
    for column in ('long', 'long_no_sw', 'long_no_sw_ngram')
)

In [5]:
# dataset_short
# dataset_short_no_sw
# dataset_short_no_sw_ngram
# dataset_long
# dataset_long_no_sw
# dataset_long_no_sw_ngram

In [6]:
from gensim.models.ldamulticore import LdaMulticore
from octis.evaluation_metrics.coherence_metrics import Coherence
import optuna

def create_lda(d):
    """Return a ``trial -> LdaMulticore`` factory bound to one dataset.

    Args:
        d: dataset dict; ``d['corpus']`` and ``d['dictionary']`` are passed to
            ``LdaMulticore``.

    Returns:
        A callable that asks ``trial`` for hyperparameters and trains a model
        with ``random_state=99``. Parameters are requested in a fixed order:
        ``num_topics`` (int, 5..40), ``alpha_categorical``, ``eta_categorical``,
        then ``alpha`` and/or ``eta`` (float, 0.001..1) only when the matching
        categorical choice is ``'scalar'``.
    """
    def lda(trial):
        topic_count = trial.suggest_int('num_topics', 5, 40)
        alpha_choice = trial.suggest_categorical('alpha_categorical', ['symmetric', 'asymmetric', 'scalar'])
        eta_choice = trial.suggest_categorical('eta_categorical', ['symmetric', 'scalar'])

        # 'scalar' is a placeholder meaning "draw a numeric prior"; any other
        # choice is handed to the model as the string itself.
        alpha_prior = trial.suggest_float('alpha', 0.001, 1) if alpha_choice == 'scalar' else alpha_choice
        eta_prior = trial.suggest_float('eta', 0.001, 1) if eta_choice == 'scalar' else eta_choice

        return LdaMulticore(
            d['corpus'],
            topic_count,
            d['dictionary'],
            alpha=alpha_prior,
            eta=eta_prior,
            random_state=99,
        )
    return lda

def create_objective(d):
    """Return an Optuna objective that scores one trial by c_v coherence.

    Args:
        d: dataset dict; ``d['tokenized_texts']`` and ``d['dictionary']`` are
            read here and the whole dict is forwarded to ``create_lda``.

    Returns:
        A callable ``objective(trial)`` that trains a model for the trial via
        ``create_lda`` and returns the ``Coherence`` score of its topics.
    """
    def objective(trial):
        model = create_lda(d)(trial)
        scorer = Coherence(texts=d['tokenized_texts'], measure='c_v')
        model_topics = get_topics(model, d['dictionary'])
        return scorer.score(model_topics)
    return objective


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def get_best_trial(dataset, n_trials=20, seed=99):
    """Run an Optuna search for one dataset and return its best trial.

    Args:
        dataset: dataset dict forwarded to ``create_objective``.
        n_trials: number of trials to run (default 20).
        seed: seed for the TPE sampler, so repeated runs propose the same
            hyperparameters. Pass ``None`` for an unseeded sampler.

    Returns:
        ``study.best_trial`` of an in-memory study that maximizes the
        objective.
    """
    # TPE is the sampler create_study uses by default for single-objective
    # studies; it is constructed explicitly only so that it can be seeded.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(create_objective(dataset), n_trials=n_trials)
    return study.best_trial

def get_best_lda(dataset):
    """Search hyperparameters for ``dataset`` and train a model with the best ones.

    The best trial from ``get_best_trial`` is passed to the factory returned
    by ``create_lda``, which builds a new model from that trial.

    Args:
        dataset: dataset dict forwarded to ``create_lda`` and ``get_best_trial``.

    Returns:
        The model built from the best trial.
    """
    model_factory = create_lda(dataset)
    return model_factory(get_best_trial(dataset))

In [8]:
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time

def evaluate_lda(dataset):
    """Tune an LDA model for one dataset and report its quality metrics.

    Args:
        dataset: dataset dict; the keys ``'name'``, ``'dictionary'`` and
            ``'tokenized_texts'`` are read here and the whole dict is
            forwarded to ``get_best_lda``.

    Returns:
        dict with the keys ``'dataset'``, ``'elapse_time'`` (float seconds),
        ``'coherence_score'`` (c_v ``Coherence`` score) and
        ``'diversity_score'`` (``TopicDiversity`` score).
    """
    # perf_counter is monotonic, so a system clock adjustment during the
    # long-running search cannot distort the measured duration.
    started = time.perf_counter()
    lda = get_best_lda(dataset)
    topics = get_topics(lda, dataset['dictionary'])
    tc = Coherence(dataset['tokenized_texts'], measure='c_v')
    td = TopicDiversity()
    # The clock stops here: the timing covers the search, the final model and
    # metric construction, but not the two score() calls below.
    elapsed = time.perf_counter() - started
    return {
        'dataset': dataset['name'],
        'elapse_time': elapsed,
        'coherence_score': tc.score(topics),
        'diversity_score': td.score(topics)
    }

In [9]:
# One result row per dataset, evaluated in the listed order.
lda_evaluation = pd.DataFrame([
    evaluate_lda(candidate)
    for candidate in (
        dataset_short,
        dataset_short_no_sw,
        dataset_short_no_sw_ngram,
        dataset_long,
        dataset_long_no_sw,
        dataset_long_no_sw_ngram,
    )
])

[I 2023-08-08 18:31:00,282] A new study created in memory with name: no-name-e822db33-f751-4ae9-9270-8651b4b8d495


[I 2023-08-08 18:32:15,488] Trial 0 finished with value: 0.44871791682364237 and parameters: {'num_topics': 9, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'symmetric'}. Best is trial 0 with value: 0.44871791682364237.
[I 2023-08-08 18:33:36,019] Trial 1 finished with value: 0.4511914566639019 and parameters: {'num_topics': 12, 'alpha_categorical': 'scalar', 'eta_categorical': 'symmetric', 'alpha': 0.46179603779944806}. Best is trial 1 with value: 0.4511914566639019.
[I 2023-08-08 18:34:45,201] Trial 2 finished with value: 0.49580002413331625 and parameters: {'num_topics': 21, 'alpha_categorical': 'symmetric', 'eta_categorical': 'symmetric'}. Best is trial 2 with value: 0.49580002413331625.
[I 2023-08-08 18:36:02,578] Trial 3 finished with value: 0.48680948340149693 and parameters: {'num_topics': 36, 'alpha_categorical': 'scalar', 'eta_categorical': 'symmetric', 'alpha': 0.39803831218212027}. Best is trial 2 with value: 0.49580002413331625.
[I 2023-08-08 18:37:14,262] Trial 4 

In [10]:
# Persist the evaluation table. The output directory is created first so that
# a missing ./results folder cannot fail the write once the evaluation has
# already been computed.
import os

os.makedirs('./results', exist_ok=True)
lda_evaluation.to_csv('./results/lda_evaluation.csv')
lda_evaluation

Unnamed: 0,dataset,elapse_time,coherence_score,diversity_score
0,short,1455.07284,0.53108,0.519231
1,short_no_sw,1042.684266,0.558469,0.868182
2,short_no_sw_ngram,1053.043575,0.456649,0.8
3,long,1758.18679,0.345782,0.078261
4,long_no_sw,829.289281,0.412297,0.190625
5,long_no_sw_ngram,699.489569,0.373327,0.171429
