In [26]:
import sys
import tomotopy as tp
import numpy as np
import optuna

In [27]:
# Registry: short model name -> tomotopy class that implements it.
models = dict(
    LDA=tp.LDAModel,
    MGLDA=tp.MGLDAModel,
    CT=tp.CTModel,
)
# Hyperparameter search space for each model kind.
# Every entry expands to ``[name, low, high]`` -- the positional arguments that
# `to_trial` splats into an Optuna ``suggest_*`` call. Bounds written as ints
# mark integer parameters; bounds written as floats mark continuous ones.
model_params = {
    kind: {name: [name, low, high] for name, (low, high) in bounds.items()}
    for kind, bounds in {
        'LDA': {
            'tw': (0, 2),
            'k': (1, 10),
            'alpha': (0.001, 0.05),
            'eta': (0.001, 0.05),
        },
        'CT': {
            'tw': (0, 2),
            'k': (1, 10),
            'smoothing_alpha': (0.001, 0.05),
            'eta': (0.001, 0.05),
        },
        'MGLDA': {
            # word-distribution priors
            'eta_g': (0.001, 0.05),
            'eta_l': (0.001, 0.05),
            # remaining continuous priors
            'alpha_g': (0.001, 0.05),
            'alpha_l': (0.001, 0.05),
            'alpha_mg': (0.001, 0.05),
            'alpha_ml': (0.001, 0.05),
            'gamma': (0.001, 0.05),
            # integer-valued settings
            'k_g': (3, 10),
            'k_l': (3, 10),
            't': (1, 10),
        },
    }.items()
}

In [28]:
# Selects which entry of `models` / `model_params` the following cells use.
KIND = "LDA"

In [29]:
def to_trial(trial, p):
    """Ask ``trial`` for one hyperparameter value described by ``p``.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``.
        p: Sequence whose items are passed positionally to the chosen
            ``suggest_*`` method; ``p[1]`` and ``p[2]`` are the bounds.

    Returns:
        Whatever the selected ``suggest_*`` method returns.
    """
    # Anything other than two exact ``int`` bounds is sampled as a float.
    if type(p[1]) is not int or type(p[2]) is not int:
        return trial.suggest_float(*p)
    return trial.suggest_int(*p)

class Objective:
    """Optuna objective that trains a topic model and scores it by perplexity.

    Each call samples one value per entry of ``model_params[kind]``, builds
    ``models[kind]`` with those values, trains it on the stored corpus and
    returns the model's perplexity. Lower perplexity is better, so the study
    that drives this objective should be created with ``direction='minimize'``.

    Args:
        corpus: Corpus handed to ``add_corpus`` of every trained model.
        kind: Key into ``models`` / ``model_params``. ``None`` (the default)
            means "use the module-level ``KIND`` at call time", which is the
            original behaviour.
        iterations: Number of training iterations per trial (default 1000,
            the value that used to be hard-coded).
    """

    def __init__(self, corpus, kind=None, iterations=1000):
        # Keep the training data and the per-trial settings.
        self.corpus = corpus
        self.kind = kind
        self.iterations = iterations

    def __call__(self, trial):
        """Run one trial and return the trained model's perplexity."""
        # Resolve lazily so that changing the global KIND between runs still
        # takes effect when no explicit kind was given.
        kind = KIND if self.kind is None else self.kind

        # Sample the hyperparameters for this trial.
        params = {
            name: to_trial(trial, spec)
            for name, spec in model_params[kind].items()
        }

        mdl = models[kind](**params)
        mdl.add_corpus(self.corpus)
        mdl.train(self.iterations)

        # Evaluation metric: perplexity (to be minimised).
        return mdl.perplexity

In [30]:
# Smoke test: confirm the class selected by KIND can be constructed.
# Library defaults are used because a hard-coded `k=` is not a valid argument
# for every entry in `models` (MGLDA is configured through `k_g` / `k_l`),
# which made this cell raise as soon as KIND was switched to 'MGLDA'.
models[KIND]()

<tomotopy.LDAModel at 0x2134e374030>

In [31]:
# Training file: one document per line, tokens separated by commas.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
TRAIN_PATH = 'C:/Users/yuuta/Documents/fashion/model_learning/topic_model/train.txt'

with open(TRAIN_PATH, 'r', encoding="utf-8") as f:
    input_lines = f.read().splitlines()

# Report the model kind that is actually selected rather than a fixed label.
print(f'Running {KIND}')

corpus = tp.utils.Corpus()
for raw_line in input_lines:
    # Trim whitespace around each token and drop empty ones, which blank
    # lines and trailing or doubled commas would otherwise add to the
    # vocabulary as '' words.
    tokens = [tok for tok in (part.strip() for part in raw_line.split(',')) if tok]
    if tokens:
        corpus.add_doc(tokens)

Running LDA


In [32]:
# Hyperparameter search: the objective returns perplexity, so the study
# minimises it.
objective = Objective(corpus)
study = optuna.create_study(direction='minimize')

# Time budget for the whole search, in seconds (4 hours).
study.optimize(objective, timeout=4 * 60 * 60)

# Show the best hyperparameters found.
print('params:', study.best_params)

[I 2024-01-13 15:34:26,657] A new study created in memory with name: no-name-1f5c643b-9a00-4c0b-a050-822a71b90b62
[I 2024-01-13 15:34:48,524] Trial 0 finished with value: 135.68720308495466 and parameters: {'tw': 2, 'k': 8, 'alpha': 0.036391676771514704, 'eta': 0.00875386903257592}. Best is trial 0 with value: 135.68720308495466.
[I 2024-01-13 15:35:11,308] Trial 1 finished with value: 170.24823082992492 and parameters: {'tw': 1, 'k': 6, 'alpha': 0.02172933683389644, 'eta': 0.032200784227534654}. Best is trial 0 with value: 135.68720308495466.
[I 2024-01-13 15:35:35,309] Trial 2 finished with value: 103.30985772931699 and parameters: {'tw': 0, 'k': 2, 'alpha': 0.013547907369925297, 'eta': 0.012007744184023793}. Best is trial 2 with value: 103.30985772931699.
[I 2024-01-13 15:35:58,102] Trial 3 finished with value: 101.5056231316964 and parameters: {'tw': 0, 'k': 7, 'alpha': 0.04674808995822662, 'eta': 0.013386335975764548}. Best is trial 3 with value: 101.5056231316964.
[I 2024-01-13 1

In [None]:
# Retrain a single model with the best hyperparameters found by the study,
# then write it to disk under a file name derived from KIND.
print(KIND, study.best_params)

mdl = models[KIND](**study.best_params)
mdl.add_corpus(corpus)
mdl.train(1000)

model_path = 'C:/Users/yuuta/Documents/fashion/model_learning/topic_model/models/' + f'{KIND}.bin'
# Second positional argument is passed through to tomotopy's `save`
# (presumably its `full` flag -- confirm against the tomotopy docs).
mdl.save(model_path, True)

{'eta': 0.04904295572460257, 'alpha_g': 0.005906699085893546, 'alpha_l': 0.044259914451423986, 'alpha_mg': 0.04611549341559861, 'alpha_ml': 0.0010281711694367847, 'k_g': 4, 'k_l': 3, 't': 1}


In [None]:
# List the ten highest-weighted words of every topic of the trained model.
for topic_id in range(mdl.k):
    header = 'Top 10 words of topic #{}'.format(topic_id)
    print(header)
    top_words = mdl.get_topic_words(topic_id, top_n=10)
    print(top_words)

Top 10 words of topic #0
[('tops_Silk', 0.053122829645872116), ('bottoms_Blue', 0.0451885461807251), ('tops_White', 0.0429576113820076), ('shoes_geo', 0.038999803364276886), ('tops_Denim', 0.03167225793004036), ('shoes_plain', 0.030953506007790565), ('shoes_Black', 0.03044944442808628), ('tops_plainbottoms_Silk', 0.023476608097553253), ('tops_plainbottoms_Denim', 0.021777737885713577), ('tops_Black', 0.019621478393673897)]
Top 10 words of topic #1
[('tops_Silk', 0.07015150040388107), ('tops_White', 0.06482736766338348), ('tops_Blue', 0.05684599280357361), ('tops_Black', 0.04528943449258804), ('tops_Denim', 0.04362745210528374), ('shoes_geo', 0.04144369065761566), ('shoes_Black', 0.035201601684093475), ('shoes_plain', 0.030070722103118896), ('tops_plaintops_Silk', 0.028688959777355194), ('tops_Navy', 0.02839907817542553)]
Top 10 words of topic #2
[('bottoms_Denim', 0.06662385910749435), ('bottoms_Silk', 0.06299417465925217), ('shoes_geo', 0.05615757033228874), ('bottoms_Blue', 0.0498468