In [1]:
import multiprocessing


# my lib
import textlib as tl
import Word2VecModel as wv



In [2]:
# Number of workers passed as `workers=` when the Word2Vec models are created.
# One core is left free for the notebook/OS, but the value is clamped to at
# least 1: on a single-core host `cpu_count() - 1` would otherwise be 0.
WORKERS = max(1, multiprocessing.cpu_count() - 1)

In [None]:
# Diagnostic cell: ask TensorFlow which local devices it can see and display
# the result as the cell output.
# NOTE(review): this cell has never been executed (`In [None]`) and no other
# visible cell uses TensorFlow, so it can probably be dropped or moved into the
# import cell. `tensorflow.python.*` is a private namespace; the public
# equivalent is `tf.config.list_physical_devices()` — confirm before relying
# on this import.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [3]:
# Input corpus file and output prefix for the trained Word2Vec model files.
# Both live under the same dataset root, relative to this notebook.
#
# Previous absolute (D: drive) locations, kept for reference:
#corpora_file_name = 'D:/ml/TCL2021_telco_embedding/dataset/telco_corpora.dat'
#w2v_model_file_name_prefix = 'D:/ml/TCL2021_telco_embedding/dataset/w2v/telco_w2v_'

corpora_file_name, w2v_model_file_name_prefix = (
    '../../dataset/telco_embedding/' + tail
    for tail in ('corpora/telco_corpora.dat', 'w2v/telco_w2v_')
)

In [8]:
# Parameter table used to build several Word2Vec models in one go.
# Each list is a column and position k across all columns describes model k.
# The grid is 3 embedding sizes x 3 window sizes, with the embedding size
# varying slowest.
#
# NOTE(review): every MODEL_NAME ends in `_E50` while EPOCHS is 10. If that
# suffix is meant to encode the epoch count, one of the two is stale — confirm
# before launching a full run, because the name becomes part of the output
# file name.
MODEL_COUNT = 9
W2V_TRAIN_PARAMS = {
    'MODEL_NAME': [f'V10000_E{dim}_W{win}_E50'
                   for dim in (100, 200, 300)
                   for win in (3, 5, 7)],
    'MAX_VOCAB_SIZE': [10000 for _ in range(MODEL_COUNT)],
    'EMBEDDING_SIZE': [dim for dim in (100, 200, 300) for _ in range(3)],
    'WINDOW_SIZE': [3, 5, 7] * 3,
    'EPOCHS': [10 for _ in range(MODEL_COUNT)],
}

# Sanity check against typos in the table: every column must have exactly
# MODEL_COUNT entries.
assert all(len(column) == MODEL_COUNT for column in W2V_TRAIN_PARAMS.values())

In [5]:
def create_multi_w2v_model(model_count, params):
    """Create the first ``model_count`` Word2Vec models described by ``params``.

    For each selected row a fresh ``wv.Word2VecModel`` is built and its
    ``create`` method is called with the corpus path, an output name of
    ``w2v_model_file_name_prefix + MODEL_NAME`` and that row's settings.

    Reads the notebook-level globals ``corpora_file_name``,
    ``w2v_model_file_name_prefix`` and ``WORKERS``.

    Args:
        model_count: Maximum number of rows to process, taken in table order.
            Zero or a negative value processes nothing. A value larger than
            the table processes every row.
        params: Mapping with the list-valued keys ``'MODEL_NAME'``,
            ``'MAX_VOCAB_SIZE'``, ``'EMBEDDING_SIZE'``, ``'WINDOW_SIZE'`` and
            ``'EPOCHS'``. Position k of each list describes model k. The lists
            are zipped, so iteration stops at the shortest one.

    Returns:
        None.
    """
    rows = zip(params['MODEL_NAME'],
               params['MAX_VOCAB_SIZE'],
               params['EMBEDDING_SIZE'],
               params['WINDOW_SIZE'],
               params['EPOCHS'])

    for i, (model_name, max_vocab_size, embedding_size, window_size, epochs) in enumerate(rows):
        # Check the limit *before* training. The check used to run after the
        # first model had already been built, so model_count <= 0 still
        # trained one model.
        if i >= model_count:
            break

        print(f'---- {i} 시작!! ----')
        w2v_model = wv.Word2VecModel()
        w2v_model.create(corpora_file_name,
                         w2v_model_file_name_prefix + model_name,
                         max_vocab_size=max_vocab_size,
                         embedding_size=embedding_size,
                         epochs=epochs,
                         window=window_size,
                         workers=WORKERS)

In [9]:
# Quick test run: build only one model, i.e. the first row of
# W2V_TRAIN_PARAMS. Raise model_create_count (up to MODEL_COUNT) to build
# more of the table.
model_create_count = 1
create_multi_w2v_model(model_create_count, W2V_TRAIN_PARAMS)

---- 0 시작!! ----
10000개의 단어 내에서 최소 빈도수는 45입니다.
Epoch: 1	Loss after epoch 1: 22115754.0
Epoch: 2	Loss after epoch 2: 15975354.0
Epoch: 3	Loss after epoch 3: 11487016.0
Epoch: 4	Loss after epoch 4: 11470444.0
Epoch: 5	Loss after epoch 5: 7052928.0
Epoch: 6	Loss after epoch 6: 1959896.0
Epoch: 7	Loss after epoch 7: 1940496.0
Epoch: 8	Loss after epoch 8: 1902128.0
Epoch: 9	Loss after epoch 9: 1801008.0
Epoch: 10	Loss after epoch 10: 1789888.0


In [10]:
# Smoke test: load model number 0 back, using the same prefix + MODEL_NAME
# path that was passed to `create`.
i = 0
w2v_model = wv.Word2VecModel()
w2v_model.load(w2v_model_file_name_prefix + W2V_TRAIN_PARAMS['MODEL_NAME'][i])

# The three tables should all have the same number of entries.
# NOTE(review): the saved output shows 10031 entries although MAX_VOCAB_SIZE
# is 10000 — presumably extra tokens or ties at the frequency cutoff; confirm
# in Word2VecModel.
print( len(w2v_model.index2word) )
print( len(w2v_model.word2index) )
print( len(w2v_model.weight) )

# Spot checks: index -> word, word -> index, and the raw and normalised
# vectors of one entry. 200 and 2583 are arbitrary sample indices.
# NOTE(review): these print whole vectors; slicing would keep the output short.
print( w2v_model.index2word[200] )
print( w2v_model.word2index['약정'] )
print( w2v_model.weight[2583] )
print( w2v_model.norm_weight[2583] )

10031
10031
10031
채널
87
[-1.28704607  0.60899276  1.52758682  0.14216805 -0.03208986 -0.35162911
 -0.23014656 -0.54449201 -0.13295713  0.05977995  0.17421292 -0.23925877
 -0.02696096 -0.27940512 -0.1087674  -0.1742404   0.71315104 -0.15916385
  0.44866425 -0.93803859  0.28822842  0.04694553  0.49181566 -0.73227102
 -0.58628809 -0.53829026 -0.9319514   0.00437926 -0.07131036  0.97412002
 -0.44056585  0.72468114  0.1171393  -0.00634099 -0.24705975  1.00534046
 -0.93818158  0.72065139 -0.1647618   0.42863211  0.34628803 -0.70248997
  0.08718582 -0.41112477  0.36475015  0.17013627 -0.0905133  -0.45467418
  0.05980843 -0.40604788  0.37260205 -0.90084326 -0.51738024 -0.08958094
 -0.16105573  0.55400401  0.58894467  0.00735143  0.2127317   1.16864502
  0.02616288  1.17424631 -0.21616504 -0.17483246 -0.95434606 -0.55734736
 -0.5193339  -0.21042204 -0.57819569 -0.49153396 -0.16887598  0.92017525
  1.06740928 -0.67761111  0.32668903  0.08866875 -0.02258844 -0.58247453
  0.61903286 -0.10279644  0