In [1]:
# Preprocessed dataset, loaded below with pd.read_parquet.
PATH_FILE_PREPROCESSED = './results/ds.parquet'
# Timing table; loaded at the top of the notebook and written back at the end.
PATH_FILE_ELAPSE_TIME = './results/elapse_time.csv'
# Path prefixes for saved models; the dataset column name is appended to each.
PATH_PREFIX_MODEL_LDA = './results/models_lda/'
PATH_PREFIX_MODEL_BERTOPIC = './results/models_bertopic/'
# Number of Optuna trials run per LDA hyperparameter search.
LDA_N_TRIAL = 3 # FIXME: should be 100; 3 is a temporary value
# True: BERTopic uses cuML's UMAP/HDBSCAN. False: it uses a TF-IDF + TruncatedSVD pipeline as embedding model.
USE_GPU = True

In [2]:
import pandas as pd

# Preprocessed documents; the training loops below iterate over this frame's columns.
ds = pd.read_parquet(PATH_FILE_PREPROCESSED)
# Timing table that the training functions extend with one training-time column each.
# NOTE(review): the displayed table carries an 'Unnamed: 0' column, which suggests the CSV
# was at some point written with its index - confirm whether that column is wanted.
elapse_time = pd.read_csv(PATH_FILE_ELAPSE_TIME)

# LDA

In [3]:
from time import time

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from utils import get_diversity, get_topics_lda
import optuna

# optuna.logging.set_verbosity(optuna.logging.WARNING)

def lda(docs):
    """Tune an LDA model on ``docs`` with Optuna and return the best model found.

    Each trial trains an ``LdaMulticore`` model with a sampled ``num_topics``
    (5-100), ``alpha`` and ``eta`` (a named prior, or a scalar in [0.01, 1])
    and is scored by its c_v coherence multiplied by the value that
    ``get_diversity`` reports for the model's topics. ``LDA_N_TRIAL`` trials
    are run and the score is maximised.

    The returned model is the very instance that achieved the best score.
    LDA training is stochastic and no seed is set here, so re-training with the
    winning parameters would produce a different model whose score was never
    measured (and would cost one extra training run).

    Args:
        docs: pandas Series of documents; missing entries are dropped.
            Assumes each entry is a sequence of string tokens - TODO confirm
            against the preprocessing step.

    Returns:
        dict with ``'model'`` (the best-scoring ``LdaMulticore``) and
        ``'time'`` (seconds spent on the hyperparameter search).

    Raises:
        ValueError: if no trial completed successfully.
    """
    docs = docs.dropna()
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Small mutable holders shared between the objective and the callback:
    # only the most recent model and the best model so far are kept alive.
    last_trained = {}
    best = {}

    def objective(trial):
        num_topics = trial.suggest_int('num_topics', 5, 100)
        alpha = trial.suggest_categorical('alpha_categorical', ['symmetric', 'asymmetric', 'scalar'])
        eta = trial.suggest_categorical('eta_categorical', ['symmetric', 'auto', 'scalar'])
        # 'scalar' is a placeholder: replace it with a sampled numeric prior.
        if alpha == 'scalar':
            alpha = trial.suggest_float('alpha', 0.01, 1)
        if eta == 'scalar':
            eta = trial.suggest_float('eta', 0.01, 1)
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            alpha=alpha,
            eta=eta,
        )
        last_trained['model'] = model
        c = CoherenceModel(model, texts=docs, dictionary=dictionary, coherence='c_v')
        cs = c.get_coherence()
        ts = get_diversity(get_topics_lda(model, dictionary))
        return cs * ts

    def keep_best(study, trial):
        # Let Optuna decide what "best" means (it ignores failed trials);
        # keep the model only when the trial that just finished is the best one.
        is_complete = trial.state == optuna.trial.TrialState.COMPLETE
        if is_complete and study.best_trial.number == trial.number:
            best['model'] = last_trained['model']

    t_start = time()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=LDA_N_TRIAL, callbacks=[keep_best])
    t_end = time()

    if 'model' not in best:
        raise ValueError('No LDA trial completed successfully; no model to return.')

    return {
        'model': best['model'],
        'time': t_end - t_start,
    }

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [4]:
from tqdm import tqdm

def save_models_lda():
    """Train and save an LDA model for each eligible column of ``ds``.

    Columns whose name contains 'B' or 'G' are skipped and recorded with a
    training time of 0. For every other column the model returned by ``lda``
    is saved under ``PATH_PREFIX_MODEL_LDA`` followed by the column name.
    The collected times are stored in ``elapse_time['lda_training']``.
    """
    durations = []
    progress = tqdm(ds.columns)
    for variant in progress:
        progress.set_description(f'Training {variant}')
        skipped = any(flag in variant for flag in ('B', 'G'))
        if skipped:
            durations.append(0)
        else:
            result = lda(ds[variant])
            result['model'].save(f'{PATH_PREFIX_MODEL_LDA}{variant}')
            durations.append(result['time'])
    elapse_time['lda_training'] = pd.Series(durations)

In [5]:
# Run the LDA search for every eligible column; fills elapse_time['lda_training'].
save_models_lda()

Training dC:   0%|          | 0/110 [00:00<?, ?it/s][I 2023-09-05 14:03:59,973] A new study created in memory with name: no-name-cf13343d-df66-4419-a990-d55d6e976da6
[I 2023-09-05 14:04:15,221] Trial 0 finished with value: 0.021435612105767315 and parameters: {'num_topics': 70, 'alpha_categorical': 'symmetric', 'eta_categorical': 'auto'}. Best is trial 0 with value: 0.021435612105767315.
[I 2023-09-05 14:04:22,009] Trial 1 finished with value: 0.08314195842097628 and parameters: {'num_topics': 10, 'alpha_categorical': 'symmetric', 'eta_categorical': 'auto'}. Best is trial 1 with value: 0.08314195842097628.
[I 2023-09-05 14:04:33,336] Trial 2 finished with value: 0.014679326428669622 and parameters: {'num_topics': 60, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'scalar', 'eta': 0.3657739122398628}. Best is trial 1 with value: 0.08314195842097628.
Training DC:   3%|▎         | 3/110 [00:40<24:11, 13.56s/it][I 2023-09-05 14:04:39,438] A new study created in memory with name: no-

In [None]:
# Display the timing table after the LDA run.
elapse_time

Unnamed: 0,variant,tokenizing,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dLG,2.061464,0.076667,0.0,9.829835,0.522905,5.086215
1,DLG,2.061464,0.076667,0.0,2.951725,0.600327,4.920509
2,dSP,2.061464,2.339595,1.55544,5.861379,0.70851,1.559066
3,DSP,2.061464,2.743002,6.910429,1.80424,0.636789,1.543857
4,dLN,2.061464,0.080067,1.499174,5.048638,0.488897,2.60563
5,DLN,2.061464,0.080067,9.134171,1.850412,0.789871,1.398466
6,dLW,2.061464,0.155874,1.580441,5.598723,0.499317,1.545107
7,DLW,2.061464,0.155875,6.300422,1.933671,0.439239,1.526137
8,dB,2.061464,0.006221,0.0,5.644242,0.0,1.675596
9,DB,2.061464,0.006221,0.0,1.863791,0.0,1.451739


# BERTopic

In [None]:
from time import time

import pandas as pd
from bertopic import BERTopic
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

if USE_GPU:
    from cuml.cluster import HDBSCAN
    from cuml.manifold import UMAP


def save_models_bertopic():
    """Fit and save a BERTopic model for each eligible column of ``ds``.

    Columns whose name contains 'T' are skipped and recorded with a training
    time of 0. For every other column the documents are fitted, the model is
    saved under ``PATH_PREFIX_MODEL_BERTOPIC`` followed by the column name
    (safetensors serialization, c-TF-IDF included), and the fitting time in
    seconds is recorded. Saving is not part of the measured time. The
    collected times are stored in ``elapse_time['bertopic_training']``.
    """
    r_time = []
    for v in (tds := tqdm(ds.columns)):
        tds.set_description(f'Training {v}')
        if 'T' in v:
            r_time.append(0)
            continue
        t_start = time()

        docs = ds[v].dropna()
        # BERTopic.fit is documented to take a list of strings. 'B' columns are
        # used as they are, other columns are joined with spaces. Either way a
        # plain list is handed over, not a Series whose index has gaps after dropna().
        # Assumes 'B' entries are already strings - TODO confirm.
        docs = list(docs) if 'B' in v else [' '.join(doc) for doc in docs]

        # Columns with 'G' in their name additionally get n_gram_range=(1, 3).
        extra_kwargs = {'n_gram_range': (1, 3)} if 'G' in v else {}

        if USE_GPU:
            # UMAP and HDBSCAN here are the cuML versions imported when USE_GPU is set.
            umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
            hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
            bertopic = BERTopic(
                language='multilingual',
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                **extra_kwargs,
            )
        else:
            # Without a GPU, use a TF-IDF + TruncatedSVD pipeline as the embedding model.
            pipe = make_pipeline(
                TfidfVectorizer(),
                TruncatedSVD(100)
            )
            bertopic = BERTopic(embedding_model=pipe, **extra_kwargs)
        model = bertopic.fit(docs)

        t_end = time()
        model.save(
            f"{PATH_PREFIX_MODEL_BERTOPIC}{v}",
            serialization="safetensors",
            save_ctfidf=True,
        )
        r_time.append(t_end - t_start)
    elapse_time['bertopic_training'] = pd.Series(r_time)

In [None]:
# Fit BERTopic for every eligible column; fills elapse_time['bertopic_training'].
save_models_bertopic()

Training DT: 100%|██████████| 12/12 [00:44<00:00,  3.72s/it]


In [None]:
# Display the timing table after the BERTopic run.
elapse_time

Unnamed: 0,variant,tokenizing,preprocessing,lda_training,bertopic_training,lda_evaluation,bertopic_evaluation
0,dLG,2.061464,0.076667,0.0,10.91504,0.522905,5.086215
1,DLG,2.061464,0.076667,0.0,3.236732,0.600327,4.920509
2,dSP,2.061464,2.339595,1.55544,5.950782,0.70851,1.559066
3,DSP,2.061464,2.743002,6.910429,1.89604,0.636789,1.543857
4,dLN,2.061464,0.080067,1.499174,5.1091,0.488897,2.60563
5,DLN,2.061464,0.080067,9.134171,1.716769,0.789871,1.398466
6,dLW,2.061464,0.155874,1.580441,5.535383,0.499317,1.545107
7,DLW,2.061464,0.155875,6.300422,1.939281,0.439239,1.526137
8,dB,2.061464,0.006221,0.0,5.962377,0.0,1.675596
9,DB,2.061464,0.006221,0.0,1.91178,0.0,1.451739


In [None]:
# Write the updated timing table back to the CSV it was loaded from, without the row index.
elapse_time.to_csv(PATH_FILE_ELAPSE_TIME, index=False)