In [11]:
import pandas as pd

# Load the source table; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./datasets/df.csv')

In [12]:
# Work on a small random subset so the whole notebook runs quickly.
# - random_state pins the subset (99 matches the seed already used for the LDA
#   models), so re-running the notebook evaluates the same rows.
# - n is capped at len(df) because sampling without replacement raises when
#   more rows are requested than the frame holds.
df = df.sample(n=min(100, len(df)), random_state=99)

In [13]:
from gensim.corpora.dictionary import Dictionary
from utils import tokenize

def build_dataset(df, column_name):
    """Bundle everything the LDA pipeline needs for one text column.

    Args:
        df: Table-like object indexable by ``column_name``.
        column_name: Name of the column holding the texts; also stored as
            the dataset's ``'name'``.

    Returns:
        dict with the keys ``'name'``, ``'tokenized_texts'`` (whatever
        ``tokenize`` returns for the column), ``'dictionary'`` (a
        ``Dictionary`` built from the tokenized texts) and ``'corpus'``
        (one ``doc2bow`` result per tokenized text).
    """
    tokenized_texts = tokenize(df[column_name])
    vocabulary = Dictionary(tokenized_texts)
    bag_of_words = [vocabulary.doc2bow(tokens) for tokens in tokenized_texts]
    return {
        'name': column_name,
        'tokenized_texts': tokenized_texts,
        'dictionary': vocabulary,
        'corpus': bag_of_words,
    }

def get_topics(lda, dictionary):
    """Translate each topic of ``lda`` into its list of words.

    Args:
        lda: Model exposing ``num_topics`` and ``get_topic_terms(topic_id)``,
            where the latter yields ``(term_id, weight)`` pairs.
        dictionary: Mapping from term id to the word it stands for.

    Returns:
        ``{'topics': [[word, ...], ...]}`` with one inner list per topic,
        in topic-id order.
    """
    topic_words = [
        [dictionary[term_id] for term_id, _weight in lda.get_topic_terms(topic_id)]
        for topic_id in range(lda.num_topics)
    ]
    return {'topics': topic_words}

In [14]:
# Build one dataset bundle per text column, in the same order as the names on the left.
dataset_short_raw, dataset_long_raw, dataset_short, dataset_long = [
    build_dataset(df, column)
    for column in ('short_raw', 'long_raw', 'short', 'long')
]

In [15]:
from gensim.models.ldamulticore import LdaMulticore
from octis.evaluation_metrics.coherence_metrics import Coherence
import optuna

def create_lda(d):
    """Return a function that trains an LDA model from a trial's suggestions.

    Args:
        d: Dataset dict providing ``'corpus'`` and ``'dictionary'``.

    Returns:
        A callable ``lda(trial)``. It asks ``trial`` for ``num_topics``
        (integer in 5..100), ``alpha`` and ``eta`` (floats in 0.001..1), in
        that order, and returns an ``LdaMulticore`` trained on the dataset
        with those values and ``random_state=99``.
    """
    def lda(trial):
        # Dict literals evaluate top to bottom, so the suggestions are
        # requested in the order num_topics, alpha, eta.
        hyperparams = {
            'num_topics': trial.suggest_int('num_topics', 5, 100),
            'alpha': trial.suggest_float('alpha', 0.001, 1),
            'eta': trial.suggest_float('eta', 0.001, 1),
        }
        return LdaMulticore(
            corpus=d['corpus'],
            id2word=d['dictionary'],
            random_state=99,
            **hyperparams,
        )
    return lda

def create_objective(d):
    """Return an Optuna objective that scores one LDA fit by ``c_v`` coherence.

    Args:
        d: Dataset dict providing ``'tokenized_texts'`` and ``'dictionary'``
            (plus whatever ``create_lda`` reads from it).

    Returns:
        A callable ``objective(trial)`` that trains a model for the trial,
        extracts its topics and returns their coherence score.
    """
    def objective(trial):
        model = create_lda(d)(trial)
        scorer = Coherence(texts=d['tokenized_texts'], measure='c_v')
        topic_output = get_topics(model, d['dictionary'])
        return scorer.score(topic_output)
    return objective


In [16]:
def get_best_trial(dataset, n_trials=2, seed=None):
    """Run a hyperparameter search for ``dataset`` and return its best trial.

    Args:
        dataset: Dataset dict accepted by ``create_objective``.
        n_trials: Number of trials to run. Defaults to 2, the budget this
            notebook used before the value became configurable.
        seed: Seed for the TPE sampler. ``None`` (the default) leaves the
            sampler unseeded, as before; pass an integer to make the
            sequence of suggested hyperparameters repeatable.

    Returns:
        The study's ``best_trial`` (the study maximizes the objective).
    """
    study = optuna.create_study(
        direction='maximize',
        # TPESampler is what create_study falls back to for a single
        # objective; it is built explicitly only so the seed can be passed.
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(create_objective(dataset), n_trials=n_trials)
    return study.best_trial

def get_best_lda(dataset):
    """Train an LDA model using the best trial found for ``dataset``.

    The best trial is handed to the same model factory used during the
    search, so the model is fitted again rather than reused from the search.
    NOTE(review): this relies on the returned trial answering ``suggest_*``
    calls with its recorded parameters -- confirm for the installed Optuna.

    Args:
        dataset: Dataset dict accepted by ``get_best_trial`` and ``create_lda``.

    Returns:
        The model built by ``create_lda(dataset)`` for the best trial.
    """
    winning_trial = get_best_trial(dataset)
    model_factory = create_lda(dataset)
    return model_factory(winning_trial)

In [17]:
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time

def evaluate_lda(dataset):
    """Tune an LDA model on ``dataset`` and report its quality and cost.

    Args:
        dataset: Dataset dict providing ``'name'``, ``'dictionary'`` and
            ``'tokenized_texts'`` (plus whatever ``get_best_lda`` reads).

    Returns:
        dict with the keys
        ``'dataset'`` (the dataset's name),
        ``'elapse_time'`` (seconds spent up to, but not including, scoring),
        ``'coherence_score'`` (``c_v`` coherence of the best model's topics) and
        ``'diversity_score'`` (``TopicDiversity`` score of the same topics).
    """
    # perf_counter is a monotonic clock meant for measuring intervals;
    # time.time() follows the wall clock, which can be adjusted mid-run.
    start = time.perf_counter()
    lda = get_best_lda(dataset)
    topics = get_topics(lda, dataset['dictionary'])
    tc = Coherence(texts=dataset['tokenized_texts'], measure='c_v')
    td = TopicDiversity()
    # Stop the clock here, before the two score() calls, so the reported time
    # covers the same work as before (search, refit, topic extraction).
    elapsed = time.perf_counter() - start
    return {
        'dataset': dataset['name'],
        'elapse_time': elapsed,
        'coherence_score': tc.score(topics),
        'diversity_score': td.score(topics),
    }

In [18]:
# Evaluate every dataset variant in turn; each entry is one result row.
lda_evaluation = [
    evaluate_lda(dataset)
    for dataset in (dataset_short_raw, dataset_long_raw, dataset_short, dataset_long)
]

[I 2023-08-07 10:06:09,297] A new study created in memory with name: no-name-43347d20-42aa-4b93-be3a-5b54e00d5404


[I 2023-08-07 10:06:10,138] Trial 0 finished with value: 0.325855386775041 and parameters: {'num_topics': 78, 'alpha': 0.5546556288289214, 'eta': 0.22830578967829662}. Best is trial 0 with value: 0.325855386775041.
[I 2023-08-07 10:06:10,899] Trial 1 finished with value: 0.3275929614296654 and parameters: {'num_topics': 78, 'alpha': 0.5252980289357414, 'eta': 0.20763160928906513}. Best is trial 1 with value: 0.3275929614296654.
[I 2023-08-07 10:06:11,668] A new study created in memory with name: no-name-9119dc7d-141e-42f5-8f92-3c65c370c4d9
[I 2023-08-07 10:06:15,123] Trial 0 finished with value: 0.3389515593334336 and parameters: {'num_topics': 95, 'alpha': 0.35002645409204075, 'eta': 0.1265329151340699}. Best is trial 0 with value: 0.3389515593334336.
[I 2023-08-07 10:06:17,935] Trial 1 finished with value: 0.34221352353166207 and parameters: {'num_topics': 98, 'alpha': 0.9562680424055888, 'eta': 0.7792445781500905}. Best is trial 1 with value: 0.34221352353166207.
[I 2023-08-07 10:06

In [19]:
# Collect the per-dataset result dicts into one table (one row per dataset).
lda_evaluation_df = pd.DataFrame(data=lda_evaluation)
# Writing the table to disk is currently switched off:
# lda_evaluation_df.to_csv('./results/lda_evaluation.csv', index=False)

In [20]:
# Last expression of the cell: renders the evaluation summary table.
lda_evaluation_df

Unnamed: 0,dataset,elapse_time,coherence_score,diversity_score
0,short_raw,1.724713,0.327593,0.023077
1,long_raw,8.320421,0.342214,0.011224
2,short,1.785719,0.5455,0.055357
3,long,4.044739,0.375282,0.022353
