In [1]:
import multiprocessing


# my lib
import textlib as tl
import Word2VecModel as wv

In [3]:
# Number of worker threads handed to the trainer: leave two cores free for the
# rest of the system, but never go below one. On a 1- or 2-core machine the
# plain `cpu_count() - 2` would be 0 or negative, which is presumably not a
# valid worker count for Word2VecModel.create -- TODO confirm.
WORKERS = max(1, multiprocessing.cpu_count() - 2)

In [4]:
# Input / output file names.
# Both paths share one dataset directory, so it is spelled out once here;
# change _DATASET_DIR to relocate the corpus and the saved models together.
_DATASET_DIR = 'D:/ml/TCL2021_telco_embedding/dataset'
# Corpus file passed to Word2VecModel.create as its input.
corpora_file_name = _DATASET_DIR + '/telco_corpora.dat'
# Prefix for saved models; the model name from the parameter table is appended.
w2v_model_file_name_prefix = _DATASET_DIR + '/w2v/telco_w2v_'

In [5]:
# Parameter table for building several w2v models in one run.
# Every key maps to a list; position k across all lists configures model k.
MODEL_COUNT = 9
W2V_TRAIN_PARAMS = {
    'MODEL_NAME': [
        'V10000_E100_W3_E50', 'V10000_E100_W5_E50', 'V10000_E100_W7_E50',
        'V10000_E200_W3_E50', 'V10000_E200_W5_E50', 'V10000_E200_W7_E50',
        'V10000_E300_W3_E50', 'V10000_E300_W5_E50', 'V10000_E300_W7_E50',
    ],
    'MAX_VOCAB_SIZE': [10000 for _ in range(MODEL_COUNT)],
    # Each embedding size is paired with all three window sizes, in order.
    'EMBEDDING_SIZE': [size for size in (100, 200, 300) for _ in range(3)],
    'WINDOW_SIZE': [3, 5, 7] * 3,
    'EPOCHS': [50 for _ in range(MODEL_COUNT)],
}

# Guard against typos in the table: every column must hold MODEL_COUNT entries.
assert all(len(column) == MODEL_COUNT for column in W2V_TRAIN_PARAMS.values())

In [6]:
def create_multi_w2v_model(model_count, params):
    """Build up to ``model_count`` models from the parameter table ``params``.

    Args:
        model_count: Maximum number of models to build, taken from the start
            of the table. Zero or a negative value builds nothing; a value
            larger than the table builds every entry.
        params: Dict of parallel lists under the keys 'MODEL_NAME',
            'MAX_VOCAB_SIZE', 'EMBEDDING_SIZE', 'WINDOW_SIZE' and 'EPOCHS';
            position k of each list configures model k.

    Returns:
        None. Each entry is handed to ``wv.Word2VecModel().create`` together
        with the corpus path and ``w2v_model_file_name_prefix + model_name``
        (presumably the location the model is saved to -- confirm in
        Word2VecModel).

    Note:
        Reads the notebook-level globals ``corpora_file_name``,
        ``w2v_model_file_name_prefix`` and ``WORKERS``.
    """
    # range(model_count) is the first zip input, so the limit is enforced
    # before any model is built. The previous check ran only after a build,
    # which meant model_count <= 0 still trained one model.
    limited_rows = zip(range(model_count),
                       params['MODEL_NAME'],
                       params['MAX_VOCAB_SIZE'],
                       params['EMBEDDING_SIZE'],
                       params['WINDOW_SIZE'],
                       params['EPOCHS'])

    for i, model_name, max_vocab_size, embedding_size, window_size, epochs in limited_rows:
        print(f'---- {i} 시작!! ----')
        w2v_model = wv.Word2VecModel()
        w2v_model.create(corpora_file_name,
                         w2v_model_file_name_prefix + model_name,
                         max_vocab_size=max_vocab_size,
                         embedding_size=embedding_size,
                         epochs=epochs,
                         window=window_size,
                         workers=WORKERS)

In [8]:
# Smoke test: build only the first model in the table.
model_create_count = 1
create_multi_w2v_model(model_count=model_create_count,
                       params=W2V_TRAIN_PARAMS)

---- 0 시작!! ----
10000개의 단어 내에서 최소 빈도수는 43입니다.
Epoch: 1	Loss after epoch 1: 12431698.0
Epoch: 2	Loss after epoch 2: 9763266.0
Epoch: 3	Loss after epoch 3: 9019944.0
Epoch: 4	Loss after epoch 4: 6721448.0
Epoch: 5	Loss after epoch 5: 5902464.0
Epoch: 6	Loss after epoch 6: 5894952.0
Epoch: 7	Loss after epoch 7: 5884408.0
Epoch: 8	Loss after epoch 8: 5865208.0
Epoch: 9	Loss after epoch 9: 5667524.0
Epoch: 10	Loss after epoch 10: 1087136.0
Epoch: 11	Loss after epoch 11: 1092072.0
Epoch: 12	Loss after epoch 12: 1091512.0
Epoch: 13	Loss after epoch 13: 1077552.0
Epoch: 14	Loss after epoch 14: 1076736.0
Epoch: 15	Loss after epoch 15: 1060232.0
Epoch: 16	Loss after epoch 16: 1067376.0
Epoch: 17	Loss after epoch 17: 1053688.0
Epoch: 18	Loss after epoch 18: 1060096.0
Epoch: 19	Loss after epoch 19: 1056888.0
Epoch: 20	Loss after epoch 20: 1057376.0
Epoch: 21	Loss after epoch 21: 1023216.0
Epoch: 22	Loss after epoch 22: 1037632.0
Epoch: 23	Loss after epoch 23: 1022240.0
Epoch: 24	Loss after epoch 

In [9]:
# Sanity check: load model 0 of the parameter table back and inspect it.
i = 0
w2v_model = wv.Word2VecModel()
# Same prefix + model-name string that was passed to create() for this entry.
w2v_model.load(w2v_model_file_name_prefix + W2V_TRAIN_PARAMS['MODEL_NAME'][i])

# The three containers should all report the same length.
# NOTE(review): the recorded output shows 10086 although MAX_VOCAB_SIZE is 10000 --
# presumably ties at the minimum frequency and/or special tokens; confirm in Word2VecModel.
print( len(w2v_model.index2word) )
print( len(w2v_model.word2index) )
print( len(w2v_model.weight) )

# Spot checks; indices 200 and 2583 look arbitrary -- TODO confirm.
print( w2v_model.index2word[200] )
print( w2v_model.word2index['약정'] )
print( w2v_model.weight[2583] )
# norm_weight is presumably the normalised counterpart of weight -- verify in Word2VecModel.
print( w2v_model.norm_weight[2583] )

10086
10086
10086
채널
83
[ 0.33492824 -0.14252254  0.30957681 -0.10566582  0.25233808 -0.3331652
  0.05035937  0.23592699  0.58263183 -0.29077598  0.28332579 -0.21423478
  0.41137639  0.22636791 -0.38939717  0.49806896 -0.38530859 -0.03138449
  0.28518784  0.44729728  0.27533096 -0.51609653  0.43539822  0.17607941
  0.57308578 -0.12231185  0.5415228  -0.03349705 -0.16551206  0.36280257
  0.44494292  0.27118194  0.82575399 -0.14198819  0.65345013 -0.12884913
 -0.51451284  0.239757   -0.1925742   0.43418971  0.03470463  0.50680977
 -0.24023718 -0.23700158  0.02458469  0.14715017  0.35727835 -0.38482237
  0.03579685  0.00836114  0.2008189  -0.05320696 -0.27793506 -0.24184163
  1.11740804  0.74650443 -0.51975143  0.157313   -0.02495085  0.26140296
  0.61399764  0.05733346 -0.23632856 -0.167316   -0.29453295 -0.47333759
  0.31321493 -0.41636497  0.10765453 -0.05395645  0.32566011 -0.10642837
  0.05548593 -0.74542451 -0.38708597  0.1759312  -0.49640876 -0.00934392
  0.58048266  0.66551787  0.