In [84]:
import pandas as pd

df = pd.read_csv('./datasets/df.csv')

In [85]:
df = df.sample(100)

In [86]:
from gensim.corpora.dictionary import Dictionary

def tokenize(df):
    return [s.split(' ') for s in df]

def build_dataset(df, column_name):
    result = {}
    result['name'] = column_name
    result['tokenized_texts'] = tokenize(df[column_name])
    result['dictionary'] = Dictionary(result['tokenized_texts'])
    result['corpus'] = [result['dictionary'].doc2bow(s) for s in result['tokenized_texts']]
    return result

def topics(lda, dictionary):
    k = lda.num_topics
    result = []
    for i in range(k):
        terms = lda.get_topic_terms(i)
        ki = [dictionary[t[0]] for t in terms]
        result.append(ki)
    return { 'topics': result }

In [87]:
dataset_short_raw = build_dataset(df, 'short_raw')
dataset_long_raw = build_dataset(df, 'long_raw')
dataset_short = build_dataset(df, 'short')
dataset_long = build_dataset(df, 'long')

In [88]:
from gensim.models.ldamulticore import LdaMulticore
from octis.evaluation_metrics.coherence_metrics import Coherence
import optuna

def create_lda(d):
    def lda(trial):
        num_topics = trial.suggest_int('num_topics', 5, 100)
        alpha = trial.suggest_float('alpha', 0.001, 1)
        eta = trial.suggest_float('eta', 0.001, 1)
        return LdaMulticore(d['corpus'], num_topics, d['dictionary'], alpha=alpha, eta=eta, random_state=99)
    return lda

def create_objective(d):
    def objective(trial):
        lda = create_lda(d)(trial)
        tc = Coherence(texts=d['tokenized_texts'], measure='c_v')
        tc_score = tc.score(topics(lda, d['dictionary']))
        return tc_score
    return objective


In [89]:
def get_best_trial(dataset):
    study = optuna.create_study(direction='maximize')
    study.optimize(create_objective(dataset), n_trials=2)
    return study.best_trial

def get_best_lda(dataset):
    best_trial = get_best_trial(dataset)
    lda = create_lda(dataset)(best_trial)
    return lda

In [90]:
lda_short_raw = get_best_lda(dataset_short_raw)
lda_long_raw = get_best_lda(dataset_long_raw)
lda_short = get_best_lda(dataset_short)
lda_long = get_best_lda(dataset_long)

[I 2023-08-07 09:06:48,482] A new study created in memory with name: no-name-5a0f3cc1-fa74-4254-882c-cfa1c1aa5f06
[I 2023-08-07 09:06:48,962] Trial 0 finished with value: 0.3455066340633149 and parameters: {'num_topics': 27, 'alpha': 0.7121709176406774, 'eta': 0.06767307422724697}. Best is trial 0 with value: 0.3455066340633149.
[I 2023-08-07 09:06:49,302] Trial 1 finished with value: 0.3472274441488347 and parameters: {'num_topics': 18, 'alpha': 0.6909705913141139, 'eta': 0.520860690538879}. Best is trial 1 with value: 0.3472274441488347.
[I 2023-08-07 09:06:49,458] A new study created in memory with name: no-name-f79c3c14-aba2-4c4c-9dc1-30ecb472c1f8
[I 2023-08-07 09:06:50,729] Trial 0 finished with value: 0.3220898129821048 and parameters: {'num_topics': 9, 'alpha': 0.12483532975643523, 'eta': 0.9835803091086519}. Best is trial 0 with value: 0.3220898129821048.
[I 2023-08-07 09:06:52,563] Trial 1 finished with value: 0.3238114049669333 and parameters: {'num_topics': 33, 'alpha': 0.65

In [91]:
topics_short_raw = topics(lda_short_raw, dataset_short_raw['dictionary'])
topics_long_raw = topics(lda_long_raw, dataset_long_raw['dictionary'])
topics_short = topics(lda_short, dataset_short['dictionary'])
topics_long = topics(lda_long, dataset_long['dictionary'])

In [96]:
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time

def evaluate_lda(dataset):
    start = time.time()
    lda = get_best_lda(dataset)
    top_words = topics(lda, dataset['dictionary'])
    tc = Coherence(dataset['tokenized_texts'], measure='c_v')
    td = TopicDiversity()
    return {
        'dataset': dataset['name'],
        'elapse_time': time.time() - start,
        'coherence_score': tc.score(top_words),
        'diversity_score': td.score(top_words)
    }

In [106]:
lda_evaluation = [
    evaluate_lda(dataset_short_raw),
    evaluate_lda(dataset_long_raw),
    evaluate_lda(dataset_short),
    evaluate_lda(dataset_long),
]

[I 2023-08-07 09:17:40,130] A new study created in memory with name: no-name-26e4af12-63dd-4796-af21-8df77727aca1
[I 2023-08-07 09:17:40,903] Trial 0 finished with value: 0.34759894608259123 and parameters: {'num_topics': 71, 'alpha': 0.9437656212326068, 'eta': 0.1518932833319723}. Best is trial 0 with value: 0.34759894608259123.
[I 2023-08-07 09:17:41,564] Trial 1 finished with value: 0.35025966964777183 and parameters: {'num_topics': 69, 'alpha': 0.9379596337737225, 'eta': 0.7546015458893324}. Best is trial 1 with value: 0.35025966964777183.
[I 2023-08-07 09:17:42,167] A new study created in memory with name: no-name-bc2a7de7-dc60-4caf-b166-032fdf2b2368
[I 2023-08-07 09:17:45,075] Trial 0 finished with value: 0.32662537361207317 and parameters: {'num_topics': 60, 'alpha': 0.08774514572388033, 'eta': 0.08763545443747113}. Best is trial 0 with value: 0.32662537361207317.
[I 2023-08-07 09:17:47,507] Trial 1 finished with value: 0.3222324464722678 and parameters: {'num_topics': 34, 'alph

In [107]:
lda_evaluation_df = pd.DataFrame(lda_evaluation)
lda_evaluation_df.to_csv('./results/lda_evaluation.csv', index=False)

In [108]:
lda_evaluation_df

Unnamed: 0,dataset,elapse_time,coherence_score,diversity_score
0,short_raw,1.55935,0.35026,0.02029
1,long_raw,6.92933,0.326625,0.105
2,short,1.234416,0.503736,0.038636
3,long,6.049585,0.384452,0.364179
