In [7]:
import sys
import tomotopy as tp
import numpy as np
import optuna

In [14]:
class Objective:
    def __init__(self, corpus):
        # 変数X,yの初期化
        self.corpus = corpus

    def __call__(self, trial):
        # ハイパーパラメータの設定
        params = {
            'eta_g': trial.suggest_uniform('eta_g', 0.001, 0.05),
            'eta_l': trial.suggest_uniform('eta_l', 0.001, 0.05),

            'alpha_g': trial.suggest_uniform('alpha_g', 0.001, 0.05),
            'alpha_l': trial.suggest_uniform('alpha_l', 0.001, 0.05),
            'alpha_mg': trial.suggest_uniform('alpha_mg', 0.001, 0.05),
            'alpha_ml': trial.suggest_uniform('alpha_ml', 0.001, 0.05),
            'gamma': trial.suggest_uniform('alpha_ml', 0.001, 0.05),

            'k_g': trial.suggest_int('k_g', 3, 10),
            'k_l': trial.suggest_int('k_l', 3, 10),
            't': trial.suggest_int('t', 1, 10),
            # 'smoothing_alpha': trial.suggest_float('smoothing_alpha', 0.001, 0.1),
        }

        mdl = tp.MGLDAModel(**params)
        mdl.add_corpus(self.corpus)
        mdl.train(1000)
        # 評価指標として正解率の最大化を目指す
        
        return mdl.perplexity

In [15]:
# C:/Users/yuuta/Documents/fashion/data/images
with open('C:/Users/yuuta/Documents/fashion/model_learning/topic_model/train.txt', 'r', encoding="utf-8") as f:
    input_lines = f.read().splitlines()

print('Running LDA')

corpus = tp.utils.Corpus()
for line in input_lines:
    line = list(map(lambda x: x.strip(), line.split(',')))
    corpus.add_doc(line)

Running LDA


In [16]:
print(len(input_lines))
objective = Objective(corpus)
study = optuna.create_study(direction='minimize') # 最大化
study.optimize(objective, timeout=7200 * 2)

# ベストパラメータを出力
print('params:', study.best_params)

[I 2023-12-28 14:05:44,020] A new study created in memory with name: no-name-0c5fa35f-3fce-482a-841c-1af9fca21349


54206


  'eta_g': trial.suggest_uniform('eta', 0.001, 0.05),
  'eta_l': trial.suggest_uniform('eta', 0.001, 0.05),
  'alpha_g': trial.suggest_uniform('alpha_g', 0.001, 0.05),
  'alpha_l': trial.suggest_uniform('alpha_l', 0.001, 0.05),
  'alpha_mg': trial.suggest_uniform('alpha_mg', 0.001, 0.05),
  'alpha_ml': trial.suggest_uniform('alpha_ml', 0.001, 0.05),
  'gamma': trial.suggest_uniform('alpha_ml', 0.001, 0.05),
[I 2023-12-28 14:06:13,701] Trial 0 finished with value: 181.00497507001663 and parameters: {'eta': 0.012469822914552137, 'alpha_g': 0.0031652228911695644, 'alpha_l': 0.013998915734675672, 'alpha_mg': 0.01507483366485737, 'alpha_ml': 0.017448750969182435, 'k_g': 10, 'k_l': 7, 't': 9}. Best is trial 0 with value: 181.00497507001663.
[I 2023-12-28 14:06:35,224] Trial 1 finished with value: 145.22157242021916 and parameters: {'eta': 0.01000181656858454, 'alpha_g': 0.03904577998816067, 'alpha_l': 0.0022182775831299647, 'alpha_mg': 0.018522518104228235, 'alpha_ml': 0.0068982762767909244,

params: {'eta': 0.04904295572460257, 'alpha_g': 0.005906699085893546, 'alpha_l': 0.044259914451423986, 'alpha_mg': 0.04611549341559861, 'alpha_ml': 0.0010281711694367847, 'k_g': 4, 'k_l': 3, 't': 1}


In [22]:
# params = {'eta': 0.08913102044757398, 'k': 5, 'tw': 1, 'smoothing_alpha': 0.053073099665460755}
params = {'eta_g': 0.04904295572460257, 'alpha_g': 0.005906699085893546, 'alpha_l': 0.044259914451423986, 'alpha_mg': 0.04611549341559861, 'alpha_ml': 0.0010281711694367847, 'k_g': 4, 'k_l': 3, 't': 1}
# print(study.best_params)
mdl = tp.MGLDAModel(**params)
# mdl = tp.CTModel(**params)
mdl.add_corpus(corpus)
mdl.train(1000)
# MGLDAModel
mdl.save('C:/Users/yuuta/Documents/fashion/model_learning/topic_model/models/' + f'MGLDA.bin', True)

{'eta': 0.04904295572460257, 'alpha_g': 0.005906699085893546, 'alpha_l': 0.044259914451423986, 'alpha_mg': 0.04611549341559861, 'alpha_ml': 0.0010281711694367847, 'k_g': 4, 'k_l': 3, 't': 1}


In [None]:
# mdl = tp.LDAModel(**study.best_params)
# mdl.add_corpus(corpus)
# mdl.train(1000)
# mdl.save('C:/Users/yuuta/Documents/fashion/model_learning/topic_model/models/' + f'ctm.bin', True)

In [24]:
for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))

Top 10 words of topic #0
[('tops_Silk', 0.053122829645872116), ('bottoms_Blue', 0.0451885461807251), ('tops_White', 0.0429576113820076), ('shoes_geo', 0.038999803364276886), ('tops_Denim', 0.03167225793004036), ('shoes_plain', 0.030953506007790565), ('shoes_Black', 0.03044944442808628), ('tops_plainbottoms_Silk', 0.023476608097553253), ('tops_plainbottoms_Denim', 0.021777737885713577), ('tops_Black', 0.019621478393673897)]
Top 10 words of topic #1
[('tops_Silk', 0.07015150040388107), ('tops_White', 0.06482736766338348), ('tops_Blue', 0.05684599280357361), ('tops_Black', 0.04528943449258804), ('tops_Denim', 0.04362745210528374), ('shoes_geo', 0.04144369065761566), ('shoes_Black', 0.035201601684093475), ('shoes_plain', 0.030070722103118896), ('tops_plaintops_Silk', 0.028688959777355194), ('tops_Navy', 0.02839907817542553)]
Top 10 words of topic #2
[('bottoms_Denim', 0.06662385910749435), ('bottoms_Silk', 0.06299417465925217), ('shoes_geo', 0.05615757033228874), ('bottoms_Blue', 0.0498468