In [2]:
import multiprocessing


# my lib
import textlib as tl
import Word2VecModel as wv

In [3]:
# Number of worker threads handed to the w2v trainer: leave one core free for
# the rest of the system, but never drop below 1 (on a single-core machine
# `cpu_count() - 1` would otherwise be 0).
WORKERS = max(1, multiprocessing.cpu_count() - 1)

In [4]:
# Input / output file names.
# NOTE(review): these are machine-specific absolute paths; consider deriving
# them from a configurable data directory.
corpora_file_name = 'D:/dataset/telco_corpora.dat'
# Prefix only: the model name from the parameter table is appended to it.
w2v_model_file_name_prefix = 'D:/dataset/w2v/telco_w2v_'

In [5]:
# Table used to build several w2v models.
# Position k of every list describes the k-th model.
MODEL_COUNT = 9
W2V_TRAIN_PARAMS = {
    'MODEL_NAME': [],  # placeholder to keep the key first; filled in below
    'MAX_VOCAB_SIZE': [10000] * MODEL_COUNT,
    'EMBEDDING_SIZE': [100, 100, 100, 200, 200, 200, 300, 300, 300],
    'WINDOW_SIZE': [3, 5, 7, 3, 5, 7, 3, 5, 7],
    'EPOCHS': [50] * MODEL_COUNT,
}

# Derive each model name from its own settings so the name can never disagree
# with the values actually used for training.
# Format: V<max vocab>_E<embedding size>_W<window size>_E<epochs>
W2V_TRAIN_PARAMS['MODEL_NAME'] = [
    f'V{vocab}_E{embedding}_W{window}_E{epochs}'
    for vocab, embedding, window, epochs in zip(
        W2V_TRAIN_PARAMS['MAX_VOCAB_SIZE'],
        W2V_TRAIN_PARAMS['EMBEDDING_SIZE'],
        W2V_TRAIN_PARAMS['WINDOW_SIZE'],
        W2V_TRAIN_PARAMS['EPOCHS'],
    )
]

# Sanity check for mistyped parameters: every column must have exactly
# MODEL_COUNT entries (zip above truncates silently, so a short column also
# shows up as a short MODEL_NAME list here).
_length_mismatches = {
    key: len(values)
    for key, values in W2V_TRAIN_PARAMS.items()
    if len(values) != MODEL_COUNT
}
assert not _length_mismatches, (
    f'W2V_TRAIN_PARAMS columns must each have {MODEL_COUNT} entries, '
    f'got {_length_mismatches}'
)

In [6]:
def create_multi_w2v_model(model_count, params):
    """Build at most ``model_count`` w2v models from a parameter table.

    For each selected row a fresh ``wv.Word2VecModel`` is instantiated and its
    ``create`` method is called with the module-level ``corpora_file_name`` as
    input and ``w2v_model_file_name_prefix + <model name>`` as output name,
    plus that row's hyper-parameters and the module-level ``WORKERS``.

    Args:
        model_count: Maximum number of rows to process, starting at row 0.
            A value of 0 or less processes nothing.
        params: Mapping with the keys 'MODEL_NAME', 'MAX_VOCAB_SIZE',
            'EMBEDDING_SIZE', 'WINDOW_SIZE' and 'EPOCHS', each holding a
            sequence; position k across the sequences describes model k.
            Rows beyond the shortest sequence are ignored (``zip`` semantics).

    Raises:
        KeyError: If ``params`` lacks one of the keys listed above.
    """
    rows = zip(params['MODEL_NAME'],
               params['MAX_VOCAB_SIZE'],
               params['EMBEDDING_SIZE'],
               params['WINDOW_SIZE'],
               params['EPOCHS'])

    for i, (model_name, max_vocab_size, embedding_size, window_size, epochs) in enumerate(rows):
        # Check the limit BEFORE doing any work, so that model_count <= 0
        # builds no model at all (checking afterwards always built one).
        if i >= model_count:
            break

        print(f'---- {i} 시작!! ----')
        w2v_model = wv.Word2VecModel()
        w2v_model.create(corpora_file_name,
                         w2v_model_file_name_prefix + model_name,
                         max_vocab_size=max_vocab_size,
                         embedding_size=embedding_size,
                         epochs=epochs,
                         window=window_size,
                         workers=WORKERS)

In [7]:
# Trial run: build just one model (the first row of the table).
model_create_count = 1
create_multi_w2v_model(model_count=model_create_count,
                       params=W2V_TRAIN_PARAMS)

---- 0 시작!! ----
10000개의 단어 내에서 최소 빈도수는 43입니다.


In [8]:
# Trial load: bring back the model at index 0 of the table and inspect it.
i = 0
model_path = w2v_model_file_name_prefix + W2V_TRAIN_PARAMS['MODEL_NAME'][i]
w2v_model = wv.Word2VecModel()
w2v_model.load(model_path)

# Sizes of the three lookup structures.
print(len(w2v_model.index2word))
print(len(w2v_model.word2index))
print(len(w2v_model.weight))

# Spot checks: one index -> word lookup, one word -> index lookup, and the
# raw / normalised weight rows for an arbitrary index.
print(w2v_model.index2word[200])
print(w2v_model.word2index['약정'])
print(w2v_model.weight[2583])
print(w2v_model.norm_weight[2583])

10086
10086
10086
채널
83
[-0.86657643 -0.98315769 -0.7253235   0.00111035  0.38903692 -0.36790097
 -0.0894493  -0.24307919  0.62462342 -0.64359999 -0.60949004 -0.28189212
  0.10141974  0.04723486  0.10617806 -0.38435671 -0.06178945  0.34719139
 -0.0787686  -0.37400493  0.30089563  0.07211375 -0.30796221 -0.83016247
  0.12074362  0.31220281 -0.30259672 -0.24050443 -0.07572179  0.11171935
  0.110025   -0.46797606  0.43022606 -0.76450604 -0.72874886  0.18232857
  0.30982938 -0.68525016 -0.17264126 -0.35886112  0.07418388 -0.62172979
 -0.14062409 -0.26347929  0.36110711 -0.32419255 -0.16927409 -0.49756539
 -1.01490712 -0.14805599  0.03516015  0.31928185 -0.32309985  0.27387601
 -0.20204893 -0.28384915  0.09221364  0.10028678  0.16869949  0.48547807
  0.24732609  0.18489729 -0.3441678   0.14106931 -0.00362062 -0.52048945
 -0.16203591 -0.27741349 -0.14308837  0.13238361 -0.17349912 -0.72280294
 -0.27882928  0.42570707  0.90160781  0.1893689   0.46965584  0.63614595
 -0.38009492 -0.30668283 -0