# LDA

In [1]:
import pickle
import pandas as pd

with open('./results/HAS_e.pickle', 'rb') as f:
    HAS_e = pickle.load(f)
HAS_t = pd.read_parquet('./results/HAS_t.parquet')

In [2]:
from time import time

from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from utils import get_diversity, get_topics_lda
import optuna

def lda(embedding):
    def objective(trial, get_lda=False):
        num_topics = trial.suggest_int('num_topics', 5, 100)
        alpha = trial.suggest_categorical('alpha_categorical', ['symmetric', 'asymmetric', 'scalar'])
        eta = trial.suggest_categorical('eta_categorical', ['symmetric', 'auto', 'scalar'])
        if alpha == 'scalar':
            alpha = trial.suggest_float('alpha', 0.01, 1)
        if eta == 'scalar':
            eta = trial.suggest_float('eta', 0.01, 1)
        model = LdaMulticore(
            corpus=embedding['corpus'],
            id2word=embedding['id2word'],
            num_topics=num_topics,
            alpha=alpha,
            eta=eta,
        )
        if get_lda:
            return model
        c = CoherenceModel(model, texts=embedding['T'], dictionary=embedding['id2word'], coherence='c_v')
        cs = c.get_coherence()
        ts = get_diversity(get_topics_lda(model, embedding['id2word']))
        return cs * ts

    t_start = time()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=2) # FIXME: change n to 20

    best_trial = study.best_trial
    best_lda = objective(best_trial, get_lda=True)
    t_end = time()

    return {
        'model': best_lda,
        'time': t_end - t_start,
    }

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [3]:
def save_models_lda():
    r_time = []
    for v in HAS_e:
        model = lda(HAS_e[v])
        model['model'].save(f'./results/models_lda/{v}')
        r_time.append(model['time'])
    HAS_t['lda_training'] = pd.Series(r_time)

In [4]:
save_models_lda()

[I 2023-08-19 12:57:28,396] A new study created in memory with name: no-name-4e023c5d-1008-445f-a28f-e65df3e59df2
[I 2023-08-19 12:57:37,000] Trial 0 finished with value: 0.25667358882915076 and parameters: {'num_topics': 22, 'alpha_categorical': 'symmetric', 'eta_categorical': 'symmetric'}. Best is trial 0 with value: 0.25667358882915076.
[I 2023-08-19 12:57:47,196] Trial 1 finished with value: 0.18602927535236405 and parameters: {'num_topics': 74, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'scalar', 'eta': 0.040344165527485557}. Best is trial 0 with value: 0.25667358882915076.
[I 2023-08-19 12:57:52,047] A new study created in memory with name: no-name-8922a887-c1dc-4238-8dbd-8b8a407a2039
[I 2023-08-19 12:58:00,794] Trial 0 finished with value: 0.009972078218902133 and parameters: {'num_topics': 85, 'alpha_categorical': 'scalar', 'eta_categorical': 'symmetric', 'alpha': 0.8496347936839019}. Best is trial 0 with value: 0.009972078218902133.
[I 2023-08-19 12:58:09,260] Trial

In [5]:
HAS_t

Unnamed: 0,variant,segmentation,preprocess,lda_embedding,lda_training
0,HT,0.005998611,422.800828,0.006001,23.647413
1,HCL,0.005998611,422.799833,0.003976,22.379721
2,HCLW,0.005998611,422.800833,0.004001,25.818747
3,HCLWN,0.005998611,422.812833,0.003005,22.219262
4,AT,1692424000.0,422.819834,0.091998,33.566865
5,ACL,1692424000.0,422.831834,0.081002,33.911662
6,ACLW,1692424000.0,422.838855,0.039994,28.696265
7,ACLWN,1692424000.0,423.086812,0.048004,32.202545
8,ST,42.04551,422.824833,0.110003,36.66292
9,SCL,42.04551,422.838843,0.107988,37.488737


# BERTopic

In [6]:
from time import time

from bertopic import BERTopic
import pandas as pd

embedding_model = "paraphrase-multilingual-MiniLM-L12-v2"

def save_models_bertopic():
    path_prefix = './results/models_bertopic/'
    r_time = []
    for v in HAS_e:
        t_start = time()
        bertopic = BERTopic(language='multilingual', embedding_model=embedding_model)
        model = bertopic.fit(HAS_e[v]['text'])
        t_end = time()
        model.save(
            f"{path_prefix}{v}",
            serialization="safetensors",
            save_embedding_model=embedding_model,
            save_ctfidf=True,
        )
        r_time.append(t_end - t_start)
    HAS_t['bertopic_training'] = pd.Series(r_time)

In [7]:
save_models_bertopic()

In [8]:
HAS_t

Unnamed: 0,variant,segmentation,preprocess,lda_embedding,lda_training,bertopic_training
0,HT,0.005998611,422.800828,0.006001,23.647413,21.27378
1,HCL,0.005998611,422.799833,0.003976,22.379721,9.136819
2,HCLW,0.005998611,422.800833,0.004001,25.818747,9.151057
3,HCLWN,0.005998611,422.812833,0.003005,22.219262,8.910228
4,AT,1692424000.0,422.819834,0.091998,33.566865,20.32113
5,ACL,1692424000.0,422.831834,0.081002,33.911662,20.947009
6,ACLW,1692424000.0,422.838855,0.039994,28.696265,19.44947
7,ACLWN,1692424000.0,423.086812,0.048004,32.202545,18.919627
8,ST,42.04551,422.824833,0.110003,36.66292,94.256775
9,SCL,42.04551,422.838843,0.107988,37.488737,90.736179


In [9]:
HAS_t.to_parquet('./results/HAS_t.parquet')