In [1]:
import pandas as pd

# Read the two source tables (short texts first, then long texts) from ./datasets.
df_short, df_long = (
    pd.read_csv(csv_path)
    for csv_path in ('./datasets/df_short.csv', './datasets/df_long.csv')
)

In [2]:
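# Optional: uncomment both lines to replace each frame with a fixed 100-row sample
# (random_state=99) instead of running on the full data.
# df_short = df_short.sample(100, random_state=99)
# df_long = df_long.sample(100, random_state=99)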

In [3]:
from gensim.corpora.dictionary import Dictionary
from utils import tokenize

def build_dataset(df, column_name):
    """Bundle one text column with the artefacts needed to fit and score LDA.

    Args:
        df: Table holding the text column (indexed as ``df[column_name]``).
        column_name: Column to tokenize; also stored as the dataset's name.

    Returns:
        dict with the keys ``'name'``, ``'tokenized_texts'``, ``'dictionary'``
        and ``'corpus'`` (one ``doc2bow`` result per tokenized text).
    """
    # NOTE(review): the tokenizer output is iterated twice (once by Dictionary,
    # once for doc2bow), so this assumes `tokenize` returns a re-iterable
    # sequence rather than a one-shot generator -- confirm in utils.tokenize.
    tokenized_texts = tokenize(df[column_name])
    dictionary = Dictionary(tokenized_texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]
    return {
        'name': column_name,
        'tokenized_texts': tokenized_texts,
        'dictionary': dictionary,
        'corpus': corpus,
    }

def get_topics(lda, dictionary):
    """Translate every topic of a fitted model into a list of tokens.

    Args:
        lda: Model exposing ``num_topics`` and ``get_topic_terms(topic_index)``;
            each returned entry is indexed with ``[0]`` to obtain a term id.
        dictionary: Mapping from term id to token (indexed as ``dictionary[id]``).

    Returns:
        ``{'topics': [[token, ...], ...]}`` with one inner list per topic, in
        topic-index order. ``get_topic_terms`` is called with its default
        arguments, so the number of terms per topic is the model's default.
    """
    topic_count = lda.num_topics
    return {
        'topics': [
            [dictionary[entry[0]] for entry in lda.get_topic_terms(topic_index)]
            for topic_index in range(topic_count)
        ]
    }

In [4]:
# One dataset per preprocessing variant, grouped by the frame the column lives in.
dataset_short, dataset_short_no_sw, dataset_short_no_sw_ngram = (
    build_dataset(df_short, column)
    for column in ('short', 'short_no_sw', 'short_no_sw_ngram')
)
dataset_long, dataset_long_no_sw, dataset_long_no_sw_ngram = (
    build_dataset(df_long, column)
    for column in ('long', 'long_no_sw', 'long_no_sw_ngram')
)

In [5]:
# Inspection helper: uncomment a single name below to show that dataset dict
# as the cell output.
# dataset_short
# dataset_short_no_sw
# dataset_short_no_sw_ngram
# dataset_long
# dataset_long_no_sw
# dataset_long_no_sw_ngram

In [6]:
from gensim.models.ldamulticore import LdaMulticore
from octis.evaluation_metrics.coherence_metrics import Coherence
import optuna

def create_lda(d):
    """Return a ``trial -> LdaMulticore`` factory bound to dataset ``d``.

    Args:
        d: Dataset dict; the factory reads ``d['corpus']`` and ``d['dictionary']``.

    Returns:
        A function that asks ``trial`` for hyperparameters and fits a model with
        them. It always requests ``'num_topics'`` (5..40), ``'alpha_categorical'``
        and ``'eta_categorical'``, in that order; ``'alpha'`` and/or ``'eta'``
        (0.001..1) are requested only when the matching categorical is ``'scalar'``.
    """
    def lda(trial):
        topic_count = trial.suggest_int('num_topics', 5, 40)
        alpha_choice = trial.suggest_categorical('alpha_categorical', ['symmetric', 'asymmetric', 'scalar'])
        eta_choice = trial.suggest_categorical('eta_categorical', ['symmetric', 'scalar'])
        # 'scalar' is a placeholder choice: it is swapped for a sampled float,
        # while the other choices are handed to the model unchanged.
        alpha_prior = trial.suggest_float('alpha', 0.001, 1) if alpha_choice == 'scalar' else alpha_choice
        eta_prior = trial.suggest_float('eta', 0.001, 1) if eta_choice == 'scalar' else eta_choice
        return LdaMulticore(
            corpus=d['corpus'],
            num_topics=topic_count,
            id2word=d['dictionary'],
            alpha=alpha_prior,
            eta=eta_prior,
            random_state=99,
        )
    return lda

def create_objective(d):
    """Build the Optuna objective for dataset ``d``.

    Args:
        d: Dataset dict; reads ``d['tokenized_texts']`` and ``d['dictionary']``
            here, plus whatever ``create_lda(d)`` reads.

    Returns:
        A function ``objective(trial)`` that fits a model for the trial and
        returns its ``c_v`` coherence score.
    """
    build_model = create_lda(d)
    # The scorer's arguments (texts and measure) are the same for every trial,
    # so it is constructed once per objective instead of once per trial.
    tc = Coherence(texts=d['tokenized_texts'], measure='c_v')

    def objective(trial):
        lda = build_model(trial)
        return tc.score(get_topics(lda, d['dictionary']))
    return objective


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def get_best_trial(dataset, n_trials=20, seed=99):
    """Run a maximizing Optuna study on ``dataset`` and return its best trial.

    Args:
        dataset: Dataset dict passed on to ``create_objective``.
        n_trials: Number of trials to run (defaults to the previous fixed 20).
        seed: Seed for the TPE sampler so repeated runs propose the same
            hyperparameters; pass ``None`` for an unseeded sampler.

    Returns:
        ``study.best_trial`` of the finished study.
    """
    # Seeding the sampler makes the search itself repeatable, matching the
    # fixed random_state=99 the notebook already uses for the LDA models.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(create_objective(dataset), n_trials=n_trials)
    return study.best_trial

def get_best_lda(dataset):
    """Search hyperparameters for ``dataset`` and fit a model from the winner.

    The best trial from ``get_best_trial`` is handed to the factory returned by
    ``create_lda``, so a new model is fitted here rather than reusing the one
    trained during the search.
    NOTE(review): this relies on the stored best trial answering the
    ``suggest_*`` calls with its recorded values -- confirm against Optuna's
    FrozenTrial documentation.

    Args:
        dataset: Dataset dict as produced by ``build_dataset``.

    Returns:
        The model built by the ``create_lda`` factory.
    """
    return create_lda(dataset)(get_best_trial(dataset))

In [8]:
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time

def evaluate_lda(dataset):
    """Tune, fit and score LDA on one dataset.

    Args:
        dataset: Dataset dict; reads ``'name'``, ``'dictionary'`` and
            ``'tokenized_texts'`` (plus whatever ``get_best_lda`` reads).

    Returns:
        dict with ``'dataset'`` (the dataset name), ``'elapse_time'`` (seconds),
        ``'coherence_score'`` (``c_v``) and ``'diversity_score'``.
        ``'elapse_time'`` covers the search, model fit, topic extraction and
        metric construction; the two final ``score`` calls are not included.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by wall-clock
    # adjustments during a run that lasts many minutes.
    start = time.perf_counter()
    lda = get_best_lda(dataset)
    topics = get_topics(lda, dataset['dictionary'])
    tc = Coherence(dataset['tokenized_texts'], measure='c_v')
    td = TopicDiversity()
    # Stop the clock explicitly here, before scoring, instead of relying on the
    # evaluation order of the dict literal below.
    elapsed = time.perf_counter() - start
    return {
        'dataset': dataset['name'],
        'elapse_time': elapsed,
        'coherence_score': tc.score(topics),
        'diversity_score': td.score(topics),
    }

In [9]:
# One result row per dataset variant, in the order: short variants, then long variants.
lda_evaluation = pd.DataFrame([
    evaluate_lda(dataset)
    for dataset in (
        dataset_short,
        dataset_short_no_sw,
        dataset_short_no_sw_ngram,
        dataset_long,
        dataset_long_no_sw,
        dataset_long_no_sw_ngram,
    )
])

[I 2023-08-08 11:27:49,253] A new study created in memory with name: no-name-0f62c1d5-7132-49c2-9a96-cd4276e3a2a3
[I 2023-08-08 11:28:47,973] Trial 0 finished with value: 0.4845828155390037 and parameters: {'num_topics': 23, 'alpha_categorical': 'symmetric', 'eta_categorical': 'symmetric'}. Best is trial 0 with value: 0.4845828155390037.
[I 2023-08-08 11:29:45,416] Trial 1 finished with value: 0.4639980553048674 and parameters: {'num_topics': 36, 'alpha_categorical': 'symmetric', 'eta_categorical': 'scalar', 'eta': 0.8621157310351236}. Best is trial 0 with value: 0.4845828155390037.
[I 2023-08-08 11:30:31,281] Trial 2 finished with value: 0.4077356650205924 and parameters: {'num_topics': 40, 'alpha_categorical': 'scalar', 'eta_categorical': 'symmetric', 'alpha': 0.7109916597934982}. Best is trial 0 with value: 0.4845828155390037.
[I 2023-08-08 11:31:22,068] Trial 3 finished with value: 0.5253537135897393 and parameters: {'num_topics': 19, 'alpha_categorical': 'asymmetric', 'eta_categor

In [10]:
# Export is currently disabled; uncomment the next line to write the results
# table to ./results/lda_evaluation.csv.
# lda_evaluation.to_csv('./results/lda_evaluation.csv')
# Bare expression so the notebook renders the results table as the cell output.
lda_evaluation

Unnamed: 0,dataset,elapse_time,coherence_score,diversity_score
0,short,1108.221275,0.525361,0.684211
1,short_no_sw,793.591307,0.551511,0.817241
2,short_no_sw_ngram,710.703611,0.425615,0.82
3,long,1290.741158,0.346353,0.1
4,long_no_sw,684.014806,0.414956,0.163636
5,long_no_sw_ngram,574.258045,0.372561,0.159091
