In [10]:
import sys
import tomotopy as tp
import numpy as np
import optuna

In [11]:
# Registry mapping a short model-kind label to the tomotopy class that
# implements it; the labels are the same keys used in `model_params`.
models = dict(
    LDA=tp.LDAModel,
    MGLDA=tp.MGLDAModel,
    CT=tp.CTModel,
)
# Hyperparameter search space for each model kind.
# Every entry has the form [name, low, high]; the list is unpacked as the
# positional arguments of an Optuna `suggest_*` call (see `to_trial`), and
# two plain-int bounds select an integer search instead of a float one.
model_params = {
    'LDA': {
        'tw': ['tw', 0, 2],
        'k': ['k', 1, 10],
        **{name: [name, 0.001, 0.05] for name in ('alpha', 'eta')},
    },
    'CT': {
        'tw': ['tw', 0, 2],
        'k': ['k', 1, 10],
        **{name: [name, 0.001, 0.05] for name in ('smoothing_alpha', 'eta')},
    },
    'MGLDA': {
        # Float-valued hyperparameters all share the same search interval.
        **{
            name: [name, 0.001, 0.05]
            for name in (
                'eta_g', 'eta_l',
                'alpha_g', 'alpha_l', 'alpha_mg', 'alpha_ml', 'gamma',
            )
        },
        # Integer-valued hyperparameters.
        **{name: [name, 3, 10] for name in ('k_g', 'k_l')},
        't': ['t', 1, 10],
    },
}

In [12]:
# Model kind used for the hyperparameter search. It is looked up in both
# `models` and `model_params`, so it must be one of their keys
# ('LDA', 'MGLDA' or 'CT').
KIND = 'MGLDA'

In [13]:
def to_trial(trial, p):
    """Ask `trial` to suggest a value for one search-space entry.

    Args:
        trial: Object exposing `suggest_int` and `suggest_float`
            (an Optuna trial in this notebook).
        p: Sequence `[name, low, high]`; it is unpacked as the positional
            arguments of the chosen `suggest_*` method.

    Returns:
        Whatever the chosen `suggest_*` call returns. `suggest_int` is used
        only when both bounds are exactly of type `int`; in every other
        case `suggest_float` is used.
    """
    # Any bound that is not exactly `int` (float, bool, ...) means a float search.
    if type(p[1]) is not int or type(p[2]) is not int:
        return trial.suggest_float(*p)
    return trial.suggest_int(*p)

class Objective:
    """Optuna objective: train one topic model per trial and score it by perplexity.

    For every trial the hyperparameters listed in `model_params[kind]` are
    sampled through `to_trial`, a fresh `models[kind]` instance is built
    from them, the stored corpus is added, the model is trained, and its
    `perplexity` attribute is returned as the objective value.
    """

    def __init__(self, corpus, kind=None, iterations=1000):
        """Store the corpus and the per-trial training settings.

        Args:
            corpus: Corpus passed to `add_corpus` on every trial's model.
            kind: Key into `models` / `model_params`. When None (default),
                the notebook-level `KIND` is read each time the objective
                is called, which is the original behavior.
            iterations: Value passed to `train` for every trial
                (default 1000, the previously hard-coded value).
        """
        self.corpus = corpus
        self.kind = kind
        self.iterations = iterations

    def __call__(self, trial):
        """Run one trial and return the trained model's perplexity.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            The `perplexity` attribute of the model trained for this trial.
        """
        # Resolve the kind at call time so that, when none was pinned,
        # the current value of the global KIND is used exactly as before.
        kind = KIND if self.kind is None else self.kind

        # Sample every hyperparameter declared for this model kind.
        params = {
            name: to_trial(trial, spec)
            for name, spec in model_params[kind].items()
        }

        mdl = models[kind](**params)
        mdl.add_corpus(self.corpus)
        mdl.train(self.iterations)

        # Perplexity is the objective value; the study is expected to minimize it.
        return mdl.perplexity

In [14]:
# Read the training data: each line of the file is one document whose
# tokens are separated by commas.
with open('C:/Users/yuuta/Documents/fashion/model_learning/topic_model/train_new.txt', 'r', encoding="utf-8") as f:
    input_lines = f.read().splitlines()

print('Running LDA')

corpus = tp.utils.Corpus()
for line in input_lines:
    # Trim whitespace around every token, then drop empty tokens: a blank
    # line or a trailing/doubled comma would otherwise add '' as a word.
    tokens = [token.strip() for token in line.split(',')]
    tokens = [token for token in tokens if token]
    # Lines without any token are skipped, so corpus indices can differ
    # from line numbers when the file contains blank lines.
    if tokens:
        corpus.add_doc(tokens)

Running LDA


In [15]:
# The objective returns model perplexity, so the study minimizes it.
study = optuna.create_study(direction='minimize')
objective = Objective(corpus)

# Keep running trials until the time budget of 4 hours (14400 s) is used up.
study.optimize(objective, timeout=4 * 60 * 60)

# Report the best hyperparameters found.
print('params:', study.best_params)

[I 2024-01-17 13:25:28,188] A new study created in memory with name: no-name-0a68f0c3-f98c-4d82-a577-d3dd0eaf2fde


In [None]:
# Train one model with a fixed hyperparameter set and save it to disk.
# The values below are hard-coded (presumably copied from an earlier search
# run - confirm); to use the current search result instead, build the model
# from `study.best_params`.
# NOTE: rebinding KIND here also changes which model `Objective` builds if
# the search cell is re-run afterwards, because it reads the global KIND.
KIND = 'LDA'
params = dict(
    tw=0,
    k=4,
    alpha=0.02566824950078614,
    eta=0.04823495632461025,
)

mdl = models[KIND](**params)
mdl.add_corpus(corpus)
mdl.train(1000)

# The file name is derived from KIND; the trailing True is passed to save()
# positionally (presumably tomotopy's `full` flag - confirm).
mdl.save('C:/Users/yuuta/Documents/fashion/model_learning/topic_model/models/' + f'{KIND}.bin', True)

In [None]:
# Show the ten highest-weighted words of every topic in the trained model.
for topic_id in range(mdl.k):
    print('Top 10 words of topic #{}'.format(topic_id))
    # Printed as a list of (word, weight) pairs.
    top_words = mdl.get_topic_words(topic_id, top_n=10)
    print(top_words)

Top 10 words of topic #0
[('tops_Silk', 0.08674614131450653), ('shoes_geo', 0.04992065578699112), ('tops_White', 0.04979587346315384), ('tops_plaintops_Silk', 0.049421526491642), ('shoes_plain', 0.04735894873738289), ('shoes_Black', 0.04284476116299629), ('tops_Beige', 0.03963711857795715), ('tops_Black', 0.03942425549030304), ('tops_plainshoes_Silk', 0.03416871279478073), ('tops_Gray', 0.026072537526488304)]
Top 10 words of topic #1
[('shoes_geo', 0.05843142047524452), ('tops_Silk', 0.05638058856129646), ('shoes_plain', 0.05298145115375519), ('shoes_Black', 0.04472843185067177), ('bottoms_Blue', 0.04367108270525932), ('tops_Denim', 0.03834174573421478), ('tops_plainbottoms_Silk', 0.02993970736861229), ('tops_Wool', 0.02986874431371689), ('tops_plainbottoms_Denim', 0.0291307270526886), ('tops_White', 0.0255825687199831)]
Top 10 words of topic #2
[('bottoms_Denim', 0.060762979090213776), ('bottoms_Silk', 0.05448056012392044), ('shoes_geo', 0.05190655216574669), ('bottoms_Blue', 0.048319