# LDA

In [96]:
import pickle

# Load the three pickled embedding collections into notebook globals.
# Later cells index these objects by variant name (e.g. e_H['HT']).
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('./datasets/small/embeddings/e_H.pickle', 'rb') as f:
    e_H = pickle.load(f)
with open('./datasets/small/embeddings/e_A.pickle', 'rb') as f:
    e_A = pickle.load(f)
with open('./datasets/small/embeddings/e_S.pickle', 'rb') as f:
    e_S = pickle.load(f)

In [97]:
from time import time

from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from utils import get_diversity, get_topics_lda
import optuna

def lda(embedding, n_trials=2):
    """Tune a gensim ``LdaMulticore`` model with Optuna and return the best one.

    Every trial samples ``num_topics`` (5-100) and the ``alpha`` / ``eta``
    priors (a named strategy or a scalar in [0.01, 1]), trains a model, and
    scores it as the mean of its ``c_v`` coherence and the value returned by
    ``get_diversity`` for the model's topics. The study maximizes that score.

    The best-scoring model is kept while the search runs and is returned
    directly. Training is not seeded (no ``random_state`` is passed), so
    retraining with the best parameters afterwards would not be guaranteed to
    reproduce the model whose score is reported in ``best_trial``.

    Args:
        embedding: Mapping that provides ``'corpus'`` and ``'id2word'`` (fed to
            ``LdaMulticore``) and ``'T'`` (passed as ``texts`` to
            ``CoherenceModel``).
        n_trials: Number of Optuna trials to run. Defaults to 2, the value
            that was previously hard-coded; the original FIXME suggests 20
            for real runs.

    Returns:
        dict with the keys:
            ``'best_params'``: ``study.best_params``.
            ``'best_trial'``: ``study.best_trial``.
            ``'time'``: wall-clock seconds spent in this function up to the
                point the best model is available.
            ``'model'``: the trained model that belongs to ``best_trial``.
    """
    # Only the best model seen so far is retained, so at most one extra
    # model is kept alive during the search.
    best_seen = {'number': None, 'value': None, 'model': None}

    def train(trial):
        # Sample the hyper-parameters from ``trial`` and fit a model.
        num_topics = trial.suggest_int('num_topics', 5, 100)
        alpha = trial.suggest_categorical('alpha_categorical', ['symmetric', 'asymmetric', 'scalar'])
        eta = trial.suggest_categorical('eta_categorical', ['symmetric', 'auto', 'scalar'])
        # 'scalar' is a placeholder choice: swap it for a sampled float prior.
        if alpha == 'scalar':
            alpha = trial.suggest_float('alpha', 0.01, 1)
        if eta == 'scalar':
            eta = trial.suggest_float('eta', 0.01, 1)
        return LdaMulticore(
            corpus=embedding['corpus'],
            id2word=embedding['id2word'],
            num_topics=num_topics,
            alpha=alpha,
            eta=eta,
        )

    def objective(trial):
        # Score one trial and remember its model if it is the best so far.
        model = train(trial)
        c = CoherenceModel(model, texts=embedding['T'], dictionary=embedding['id2word'], coherence='c_v')
        cs = c.get_coherence()
        ts = get_diversity(get_topics_lda(model, embedding['id2word']))
        score = (cs + ts) / 2
        # Strict '>' keeps the earliest trial on ties.
        if best_seen['value'] is None or score > best_seen['value']:
            best_seen.update(number=trial.number, value=score, model=model)
        return score

    t_start = time()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    best_trial = study.best_trial
    if best_seen['number'] == best_trial.number:
        best_lda = best_seen['model']
    else:
        # Fallback (e.g. NaN scores broke the local bookkeeping): retrain
        # from the stored parameters, as the original code always did. This
        # relies on the finished trial replaying its stored values through
        # the suggest_* calls.
        best_lda = train(best_trial)
    t_end = time()
    return {
        'best_params': study.best_params,
        'best_trial': best_trial,
        'time': t_end - t_start,
        'model': best_lda,
    }

In [98]:
def save_models_lda(embeddings, variant):
    """Run ``lda`` for each named embedding and pickle the result to disk.

    Args:
        embeddings: Mapping from embedding name to the embedding handed to
            ``lda``.
        variant: Iterable of names to process; each must be a key of
            ``embeddings``.

    Each result is written to ``./datasets/small/models_lda/<name>.pickle``
    using the highest available pickle protocol.
    """
    for name in variant:
        result = lda(embeddings[name])
        target = f'./datasets/small/models_lda/{name}.pickle'
        with open(target, 'wb') as sink:
            pickle.dump(result, sink, protocol=pickle.HIGHEST_PROTOCOL)
        # Drop the reference before the next (potentially large) result.
        del result

In [99]:
# Variant suffixes to process; each one is combined with a dataset prefix
# ('H', 'A', ...) to form an embedding key such as 'HT' or 'ACLW'.
# Full list, kept for reference:
# e_variant = ['T', 'C', 'CL', 'CLW', 'CW', 'L', 'LW', 'W']
# Reduced subset currently in use:
e_variant = ['T', 'CLW']

In [100]:
# NOTE: the original author reports this cell takes about 1.5 hours
# (presumably with the full variant list and more trials - TODO confirm).
# Tune and pickle one LDA model per selected variant of e_H and e_A.
save_models_lda(e_H, [f'H{t}' for t in e_variant])
save_models_lda(e_A, [f'A{t}' for t in e_variant])
# The e_S run is currently disabled:
# e_names = [f'S{t}' for t in e_variant]
# save_models_lda(e_S, e_names)

[I 2023-08-16 11:53:28,438] A new study created in memory with name: no-name-f837bd8c-59de-4da5-b5c1-e7690c6d14df


[I 2023-08-16 11:53:34,869] Trial 0 finished with value: 0.4671821787943199 and parameters: {'num_topics': 7, 'alpha_categorical': 'scalar', 'eta_categorical': 'scalar', 'alpha': 0.23117293650814605, 'eta': 0.3378406823802936}. Best is trial 0 with value: 0.4671821787943199.
[I 2023-08-16 11:53:44,202] Trial 1 finished with value: 0.524287415877537 and parameters: {'num_topics': 40, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'scalar', 'eta': 0.9485152772613219}. Best is trial 1 with value: 0.524287415877537.
[I 2023-08-16 11:53:47,925] A new study created in memory with name: no-name-9530d558-f8fa-4e16-af51-cc915af3d33f
[I 2023-08-16 11:53:54,778] Trial 0 finished with value: 0.6451638591828325 and parameters: {'num_topics': 12, 'alpha_categorical': 'asymmetric', 'eta_categorical': 'auto'}. Best is trial 0 with value: 0.6451638591828325.
[I 2023-08-16 11:54:07,015] Trial 1 finished with value: 0.5198677229628182 and parameters: {'num_topics': 82, 'alpha_categorical': 'symmet

# BERTopic

In [101]:
import pickle

# Reload the three pickled embedding collections for the BERTopic section,
# so it can run independently of the LDA cells.
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('./datasets/small/embeddings/e_H.pickle', 'rb') as f:
    e_H = pickle.load(f)
with open('./datasets/small/embeddings/e_A.pickle', 'rb') as f:
    e_A = pickle.load(f)
with open('./datasets/small/embeddings/e_S.pickle', 'rb') as f:
    e_S = pickle.load(f)

In [102]:
from time import time

from bertopic import BERTopic
from utils import get_topics_bertopic
import pandas as pd

# Name of the embedding model; it is passed to BERTopic here and again to
# model.save() in save_models_bertopic.
embedding_model = "paraphrase-multilingual-MiniLM-L12-v2"
# A single shared BERTopic instance; save_models_bertopic calls fit() on it
# once per variant.
bertopic = BERTopic(embedding_model=embedding_model)

def save_models_bertopic(embeddings, variant):
    """Fit the shared ``bertopic`` model on each variant and save it to disk.

    Args:
        embeddings: Mapping from variant name to an entry whose ``'text'``
            value is handed to ``bertopic.fit``.
        variant: Iterable of variant names to process.

    For every name ``v`` the fitted model is saved under
    ``datasets/small/models_bertopic/<v>`` (safetensors serialization, with
    c-TF-IDF), and the fit duration in seconds is written to ``time.csv``
    at that same location.
    """
    path_prefix = 'datasets/small/models_bertopic/'
    for v in variant:
        started = time()
        fitted = bertopic.fit(embeddings[v]['text'])
        # Measure the fit only; saving is excluded from the timing.
        elapsed = time() - started

        fitted.save(
            f"{path_prefix}{v}",
            serialization="safetensors",
            save_embedding_model=embedding_model,
            save_ctfidf=True,
        )

        timing = pd.DataFrame([{'time': elapsed}])
        timing.to_csv(f'{path_prefix}{v}/time.csv', index=False)
        # Debug aid: print(get_topics_bertopic(fitted))
        # Only the local name is dropped; the module-level ``bertopic``
        # binding is untouched.
        del fitted

In [104]:
# Variant suffixes to process for BERTopic. Full list, kept for reference:
# e_variant = ['T', 'C', 'CL', 'CLW', 'CW', 'L', 'LW', 'W']
# Reduced subset currently in use:
e_variant = ['T', 'CLW']
# Fit and save one BERTopic model per selected variant of e_H and e_A.
save_models_bertopic(e_H, [f'H{v}' for v in e_variant])
save_models_bertopic(e_A, [f'A{v}' for v in e_variant])
# The e_S run is currently disabled:
# save_models_bertopic(e_S, [f'S{v}' for v in e_variant])