# LDA

In [10]:
import pandas as pd

ds = pd.read_parquet('./results/ds.parquet')
elapse_time = pd.read_csv('./results/elapse_time.csv')

In [11]:
from time import time

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from utils import get_diversity, get_topics_lda
import optuna

def lda(docs):
    docs = docs.dropna()
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    def objective(trial, get_lda=False):
        num_topics = trial.suggest_int('num_topics', 5, 100)
        alpha = trial.suggest_categorical('alpha_categorical', ['symmetric', 'asymmetric', 'scalar'])
        eta = trial.suggest_categorical('eta_categorical', ['symmetric', 'auto', 'scalar'])
        if alpha == 'scalar':
            alpha = trial.suggest_float('alpha', 0.01, 1)
        if eta == 'scalar':
            eta = trial.suggest_float('eta', 0.01, 1)
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            alpha=alpha,
            eta=eta,
        )
        if get_lda:
            return model
        c = CoherenceModel(model, texts=docs, dictionary=dictionary, coherence='c_v')
        cs = c.get_coherence()
        ts = get_diversity(get_topics_lda(model, dictionary))
        return cs * ts

    t_start = time()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=2) # FIXME: change n to 20

    best_trial = study.best_trial
    best_lda = objective(best_trial, get_lda=True)
    t_end = time()

    return {
        'model': best_lda,
        'time': t_end - t_start,
    }

In [12]:
def save_models_lda():
    r_time = []
    for v in ds:
        model = lda(ds[v])
        model['model'].save(f'./results/models_lda/{v}')
        r_time.append(model['time'])
    elapse_time['lda_training'] = pd.Series(r_time)

In [13]:
save_models_lda()

[I 2023-08-21 17:05:54,509] A new study created in memory with name: no-name-25a1a112-59cb-4c54-ba31-c94a431a6c74


[I 2023-08-21 17:06:09,236] Trial 0 finished with value: 0.06607801894203869 and parameters: {'num_topics': 42, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'auto'}. Best is trial 0 with value: 0.06607801894203869.
[I 2023-08-21 17:06:22,558] Trial 1 finished with value: 0.0686203668838986 and parameters: {'num_topics': 39, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'auto'}. Best is trial 1 with value: 0.0686203668838986.
[I 2023-08-21 17:06:29,161] A new study created in memory with name: no-name-73a3e0f9-85f0-4503-b337-076568fec71f
[I 2023-08-21 17:06:44,487] Trial 0 finished with value: 0.04065546315271756 and parameters: {'num_topics': 81, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'scalar', 'eta': 0.17103317918635216}. Best is trial 0 with value: 0.04065546315271756.
[I 2023-08-21 17:06:55,373] Trial 1 finished with value: 0.024160334646465786 and parameters: {'num_topics': 19, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'scalar', 'eta': 0.

In [14]:
elapse_time

Unnamed: 0,variant,nlp,preprocessing,lda_training
0,dCL,34.678537,0.311998,34.611548
1,dLWN,34.678537,0.311998,31.916765
2,DCL,34.678537,0.311998,30.490422
3,DLWN,34.678537,0.311998,34.266039


# BERTopic

In [15]:
from time import time

from bertopic import BERTopic
import pandas as pd

embedding_model = "paraphrase-multilingual-MiniLM-L12-v2"

def save_models_bertopic():
    path_prefix = './results/models_bertopic/'
    r_time = []
    for v in ds:
        t_start = time()
        docs = ds[v].dropna()
        docs = [' '.join(doc) for doc in docs]
        bertopic = BERTopic(language='multilingual', embedding_model=embedding_model)
        model = bertopic.fit(docs)
        t_end = time()
        model.save(
            f"{path_prefix}{v}",
            serialization="safetensors",
            save_embedding_model=embedding_model,
            save_ctfidf=True,
        )
        r_time.append(t_end - t_start)
    elapse_time['bertopic_training'] = pd.Series(r_time)

In [16]:
save_models_bertopic()

In [17]:
elapse_time

Unnamed: 0,variant,nlp,preprocessing,lda_training,bertopic_training
0,dCL,34.678537,0.311998,34.611548,74.961021
1,dLWN,34.678537,0.311998,31.916765,15.938241
2,DCL,34.678537,0.311998,30.490422,37.874624
3,DLWN,34.678537,0.311998,34.266039,11.792914


In [18]:
elapse_time.to_csv('./results/elapse_time.csv', index=False)