In [1]:
import sys
import tomotopy as tp
import numpy as np
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Maps a model-kind label to the tomotopy model class instantiated for it.
models = dict(
    LDA=tp.LDAModel,
    MGLDA=tp.MGLDAModel,
    CT=tp.CTModel,
)
# Hyperparameter search space for each model kind.
#
# The table below lists (parameter, low, high) per kind; it is expanded into
# model_params[kind][parameter] == [parameter, low, high].  Each list is
# unpacked positionally into trial.suggest_int / trial.suggest_float by
# to_trial(), and the outer key becomes the keyword argument handed to the
# model constructor.  Two int bounds select an integer suggestion; anything
# else selects a float suggestion.
model_params = {
    kind: {name: [name, low, high] for name, low, high in bounds}
    for kind, bounds in (
        ('LDA', (
            ('tw', 0, 2),
            ('k', 1, 10),
            ('alpha', 0.001, 0.05),
            ('eta', 0.001, 0.05),
        )),
        ('CT', (
            ('tw', 0, 2),
            ('k', 1, 10),
            ('smoothing_alpha', 0.001, 0.05),
            ('eta', 0.001, 0.05),
        )),
        ('MGLDA', (
            ('eta_g', 0.001, 0.05),
            ('eta_l', 0.001, 0.05),
            ('alpha_g', 0.001, 0.05),
            ('alpha_l', 0.001, 0.05),
            ('alpha_mg', 0.001, 0.05),
            ('alpha_ml', 0.001, 0.05),
            ('gamma', 0.001, 0.05),
            ('k_g', 3, 10),
            ('k_l', 3, 10),
            ('t', 1, 10),
        )),
    )
}

In [3]:
# Model kind used as the key into `models` and `model_params`.
KIND = 'LDA'

In [4]:
def to_trial(trial, p):
    """Ask ``trial`` for a value described by the spec ``p``.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``.
        p: Sequence whose items are unpacked as the positional arguments of
            the chosen suggest method; ``p[1]`` and ``p[2]`` are the bounds.

    Returns:
        Whatever the selected suggest method returns.
    """
    # Exact type check on purpose: only two plain ints select suggest_int.
    both_int = type(p[1]) is int and type(p[2]) is int
    suggest = trial.suggest_int if both_int else trial.suggest_float
    return suggest(*p)

class Objective:
    """Callable objective that trains a topic model and returns its perplexity.

    For every trial, one value per entry of ``model_params[kind]`` is drawn
    via ``to_trial``, the model class ``models[kind]`` is built with those
    values as keyword arguments, trained on the stored corpus, and the
    model's ``perplexity`` attribute is returned.

    Args:
        corpus: Corpus passed to the model's ``add_corpus`` on every trial.
        kind: Key into ``models`` / ``model_params``. ``None`` (the default)
            means "use the module-level ``KIND`` at call time", which is the
            behaviour this class had before the parameter existed.
        iterations: Value passed to the model's ``train`` call
            (default 1000, the previously hard-coded value).
    """

    def __init__(self, corpus, kind=None, iterations=1000):
        self.corpus = corpus
        self.kind = kind
        self.iterations = iterations

    def __call__(self, trial):
        """Run one trial and return the trained model's perplexity.

        Raises:
            KeyError: If the resolved kind is missing from ``models`` or
                ``model_params``.
        """
        # Resolve lazily so an unset kind keeps following the global KIND.
        kind = KIND if self.kind is None else self.kind

        # Draw one suggested value per hyperparameter of this model kind.
        params = {
            key: to_trial(trial, spec)
            for key, spec in model_params[kind].items()
        }

        mdl = models[kind](**params)
        mdl.add_corpus(self.corpus)
        mdl.train(self.iterations)

        # The score is perplexity (not accuracy).
        return mdl.perplexity

In [5]:
# Read the whole training file and split it into lines (one document each).
with open('C:/Users/yuuta/Documents/fashion/model_learning/topic_model/train.txt', 'r', encoding="utf-8") as f:
    input_lines = f.read().splitlines()

print('Running LDA')

# Each line is a comma-separated list of tokens; surrounding whitespace is
# stripped from every token before the document is added to the corpus.
corpus = tp.utils.Corpus()
for line in input_lines:
    corpus.add_doc([token.strip() for token in line.split(',')])

Running LDA


In [6]:
# Search for the hyperparameters that give the lowest objective value
# (the study direction is 'minimize').
objective = Objective(corpus)
study = optuna.create_study(direction='minimize')

# Time budget: 4 hours, expressed in seconds.
study.optimize(objective, timeout=4 * 60 * 60)

# Show the best hyperparameters found.
print('params:', study.best_params)

[I 2024-01-15 15:06:49,034] A new study created in memory with name: no-name-20b0737d-89be-4522-9a77-96e82568f31c
[I 2024-01-15 15:06:58,441] Trial 0 finished with value: 125.61248742979967 and parameters: {'tw': 2, 'k': 10, 'alpha': 0.03321625155360891, 'eta': 0.008955990866143973}. Best is trial 0 with value: 125.61248742979967.
[I 2024-01-15 15:07:07,062] Trial 1 finished with value: 196.56786067814286 and parameters: {'tw': 1, 'k': 2, 'alpha': 0.04610867651394655, 'eta': 0.026622701763265394}. Best is trial 0 with value: 125.61248742979967.
[I 2024-01-15 15:07:16,289] Trial 2 finished with value: 87.24878012846158 and parameters: {'tw': 0, 'k': 5, 'alpha': 0.01492650922065938, 'eta': 0.006181174981674643}. Best is trial 2 with value: 87.24878012846158.
[I 2024-01-15 15:07:26,067] Trial 3 finished with value: 177.02890824047935 and parameters: {'tw': 1, 'k': 4, 'alpha': 0.04416051220571385, 'eta': 0.011870685164316174}. Best is trial 2 with value: 87.24878012846158.
[I 2024-01-15 15

In [None]:
# Retrain a model with the best hyperparameters from the study and save it.
# print(KIND, study.best_params)
KIND = 'LDA'

# Alternative kept for reference: reuse a previously found configuration
# instead of `study.best_params`.
# params = {'tw': 0, 'k': 7, 'alpha': 0.03729793398684788, 'eta': 0.01874366413777168}
# mdl = models[KIND](**params)
mdl = models[KIND](**study.best_params)
mdl.add_corpus(corpus)
mdl.train(1000)

# MGLDAModel
# The output file is named after the model kind; True is passed through as
# save()'s second positional argument.
mdl.save(f'C:/Users/yuuta/Documents/fashion/model_learning/topic_model/models/{KIND}.bin', True)

In [None]:
# Print the ten highest-ranked words for every topic index below mdl.k.
for topic_id in range(mdl.k):
    print(f'Top 10 words of topic #{topic_id}')
    print(mdl.get_topic_words(topic_id, top_n=10))