In [1]:
import multiprocessing

%load_ext autoreload
%autoreload 2

# my lib
import textlib as tl
import Word2VecModel as wv
import FastTextModel as ft



## 병렬 수행을 위한 core 개수 가져오기 (모든 코어를 사용하면 다른 작업이 불가하여 -1)

In [2]:
# Number of worker processes for parallel training: leave one core free so the
# machine stays usable, but never go below one worker (a single-core host
# would otherwise end up with WORKERS == 0).
WORKERS = max(1, multiprocessing.cpu_count() - 1)

## GPU 사용 가능 여부 확인

In [None]:
# Show the devices TensorFlow reports on this machine; the list is the cell's
# displayed value and is used to check whether a GPU is available.
# NOTE(review): `tensorflow.python.client` looks like an internal module path —
# confirm it is still importable with the installed TensorFlow version.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

## 파일 경로 설정

In [3]:
# Corpus file paths (part-of-speech-filtered corpus and all-tag corpus)
corpora_prttag_file_name = '../TCL2021_Telco_Embedding_Dataset/corpora/telco_corpora.dat'
corpora_alltag_file_name = '../TCL2021_Telco_Embedding_Dataset/corpora/telco_corpora_all_tag.dat'

# Path prefix for embedding vectors built from only some tags (noun / adjective families)
w2v_model_prttag_file_name_prefix = '../TCL2021_Telco_Embedding_Dataset/embedding_w2v/telco_w2v_'

# Path prefix for embedding vectors built from all tags
w2v_model_alltag_file_name_prefix = '../TCL2021_Telco_Embedding_Dataset/embedding_w2v_alltag/telco_w2v_'

# Path prefix for fasttext embedding vectors built from all tags
ft_model_file_name_prefix = '../TCL2021_Telco_Embedding_Dataset/embedding_fasttext/telco_ft_'


## 일부 형태소 사용할지, 전체 형태소 사용할지 여부에 따라 아래 코드 변경하여 사용

In [4]:
# Select exactly ONE of the configurations below by (un)commenting it.
# It decides which corpus is read and where the trained model files go.

# w2v: use all morphemes
w2v_model_file_name_prefix = w2v_model_alltag_file_name_prefix
corpora_file_name = corpora_alltag_file_name

# w2v: use only a subset of morphemes
#w2v_model_file_name_prefix = w2v_model_prttag_file_name_prefix
#corpora_file_name = corpora_prttag_file_name

# fasttext
#w2v_model_file_name_prefix = ft_model_file_name_prefix
#corpora_file_name = corpora_alltag_file_name

## 말뭉치 통계 확인

In [None]:
# Load the selected corpus as a list of token lists: each line of the file is
# one sentence, stripped of surrounding whitespace and split on single spaces.
# The context manager guarantees the file handle is closed, and iterating the
# file directly avoids the extra in-memory copy that readlines() creates.
with open(corpora_file_name, 'r', encoding='utf-8') as corpus_file:
    corpus = [sentence.strip().split(' ') for sentence in corpus_file]


In [None]:
# Build a frequency distribution over every token of the corpus.
# chain.from_iterable flattens the list of token lists into one token stream
# without materialising an intermediate list.
import nltk
from itertools import chain
fdist = nltk.FreqDist(chain.from_iterable(corpus))


In [None]:
# Print the entry found at every 1000th frequency rank (1000, 2000, ... 39000)
# to see how quickly token frequency falls off and to help pick a vocabulary size.
# The vocabulary is ranked once up front instead of once per loop iteration.
ranked_tokens = fdist.most_common()
for i in range(1000, 40000, 1000):
    # Clamp to the last entry when the vocabulary has fewer than i tokens,
    # which is what most_common(i)[-1] returned in that case.
    print( str(i) + '------>' + str(ranked_tokens[min(i, len(ranked_tokens)) - 1]) )

## 단어수는 10,000개로 고정. embedding 차원수 [128,256,384], window size [3,4,5]의 조합으로 총 9개 embedding 생성

In [5]:
# Parameter table used to train several embedding models in one go.
# The vocabulary size and epoch count are fixed; one model is trained for each
# (embedding size, window size) combination.  The parallel lists are derived
# from the two axes below so that model names and parameter values cannot
# drift apart when the grid is edited.
VOCAB_SIZE = 10000
EMBEDDING_SIZES = [128, 256, 384]
WINDOW_SIZES = [3, 4, 5]
EPOCHS_PER_MODEL = 50  # earlier experiments varied this per model (20 / 30 / 40)

# Embedding size is the outer (slow) axis, window size the inner (fast) axis.
MODEL_GRID = [(embedding_size, window_size)
              for embedding_size in EMBEDDING_SIZES
              for window_size in WINDOW_SIZES]

MODEL_COUNT = len(MODEL_GRID)

W2V_TRAIN_PARAMS = {
    'MODEL_NAME': [f'V{VOCAB_SIZE}_E{embedding_size}_W{window_size}'
                   for embedding_size, window_size in MODEL_GRID],
    'MAX_VOCAB_SIZE': [VOCAB_SIZE] * MODEL_COUNT,
    'EMBEDDING_SIZE': [embedding_size for embedding_size, window_size in MODEL_GRID],
    'WINDOW_SIZE': [window_size for embedding_size, window_size in MODEL_GRID],
    'EPOCHS': [EPOCHS_PER_MODEL] * MODEL_COUNT,
}

# Guard against a malformed table: every column must have one entry per model.
for param_name, param_values in W2V_TRAIN_PARAMS.items():
    assert len(param_values) == MODEL_COUNT, \
        f'{param_name} has {len(param_values)} entries, expected {MODEL_COUNT}'

## model_count만큼 loop돌며 n개의 embedding training.

## picked_model_index를 지정하면 해당 index의 parameter만 읽어 하나의 embedding 생성(테스트용). -1을 지정하면 모든 parameter에 대해 embedding 생성

In [7]:
def create_multi_w2v_model(picked_model_index, params):
    """Train Word2Vec embeddings for the parameter sets listed in ``params``.

    Args:
        picked_model_index: Index of the single parameter set to train, or
            -1 to train every parameter set in order. Any other index that
            matches no entry results in nothing being trained.
        params: Dict of parallel lists keyed by 'MODEL_NAME', 'MAX_VOCAB_SIZE',
            'EMBEDDING_SIZE', 'WINDOW_SIZE' and 'EPOCHS'; position ``k`` of
            each list describes model ``k``.

    The corpus path, output path prefix and worker count are read from the
    notebook-level ``corpora_file_name``, ``w2v_model_file_name_prefix`` and
    ``WORKERS``.
    """
    train_all = picked_model_index == -1
    settings = zip(params['MODEL_NAME'],
                   params['MAX_VOCAB_SIZE'],
                   params['EMBEDDING_SIZE'],
                   params['WINDOW_SIZE'],
                   params['EPOCHS'])

    for position, (name, vocab_limit, dimension, window, n_epochs) in enumerate(settings):
        # Skip every entry except the requested one, unless all were requested.
        if not train_all and position != picked_model_index:
            continue

        print(f'---- {position} 시작!! ----')
        trainer = wv.Word2VecModel()
        trainer.create(
            corpora_file_name,
            w2v_model_file_name_prefix + name,
            max_vocab_size=vocab_limit,
            embedding_size=dimension,
            epochs=n_epochs,
            window=window,
            workers=WORKERS,
        )


In [8]:
# Build just one model as a quick test: only the parameter set at index `ndx`
# is trained.  `ndx` is also used below as a list index to load the model back,
# so it must be a real index here (a value of -1 would train every model and
# then index the LAST entry of the table).
ndx = 0
create_multi_w2v_model(ndx, W2V_TRAIN_PARAMS)

# To train every parameter set, pass -1 as picked_model_index instead:
#create_multi_w2v_model(-1, W2V_TRAIN_PARAMS)

---- 0 시작!! ----
10000개의 단어 내에서 최소 빈도수는 49입니다.
Epoch: 1	Loss after epoch 1: current loss : 17138032.0, previous loss : 0, diff : 17138032.0 
Epoch: 2	Loss after epoch 2: current loss : 30356626.0, previous loss : 17138032.0, diff : 13218594.0 
Epoch: 3	Loss after epoch 3: current loss : 40177552.0, previous loss : 30356626.0, diff : 9820926.0 
Epoch: 4	Loss after epoch 4: current loss : 48855448.0, previous loss : 40177552.0, diff : 8677896.0 
Epoch: 5	Loss after epoch 5: current loss : 57476980.0, previous loss : 48855448.0, diff : 8621532.0 
Epoch: 6	Loss after epoch 6: current loss : 66050640.0, previous loss : 57476980.0, diff : 8573660.0 
Epoch: 7	Loss after epoch 7: current loss : 68428624.0, previous loss : 66050640.0, diff : 2377984.0 
Epoch: 8	Loss after epoch 8: current loss : 69950336.0, previous loss : 68428624.0, diff : 1521712.0 
Epoch: 9	Loss after epoch 9: current loss : 71434176.0, previous loss : 69950336.0, diff : 1483840.0 
Epoch: 10	Loss after epoch 10: current los

KeyboardInterrupt: 

In [None]:
# Sanity check: load back the model selected by `ndx`, using the same
# prefix + model name that was passed to create() during training.
w2v_model = wv.Word2VecModel()
w2v_model.load(w2v_model_file_name_prefix + W2V_TRAIN_PARAMS['MODEL_NAME'][ndx])

# Sizes of the two vocabulary mappings and of the weight table
# (presumably all equal to the vocabulary size — TODO confirm).
print( len(w2v_model.index2word) )
print( len(w2v_model.word2index) )
print( len(w2v_model.weight) )

# Spot checks of individual entries.
# NOTE(review): 200 and 2583 are arbitrary hard-coded positions, and '약정'
# has to be present in the loaded vocabulary for the lookup to succeed.
print( w2v_model.index2word[200] )
print( w2v_model.word2index['약정'] )
print( w2v_model.weight[2583] )
print( w2v_model.norm_weight[2583] )

# 이 밑은 fasttext

In [12]:
def create_multi_fasttext_model(picked_model_index, params):
    """Train FastText embeddings for the parameter sets listed in ``params``.

    Mirrors ``create_multi_w2v_model``, including the ``-1`` convention,
    which previously made this function silently train nothing.

    Args:
        picked_model_index: Index of the single parameter set to train, or
            -1 to train every parameter set in order. Any other index that
            matches no entry results in nothing being trained.
        params: Dict of parallel lists keyed by 'MODEL_NAME', 'MAX_VOCAB_SIZE',
            'EMBEDDING_SIZE', 'WINDOW_SIZE' and 'EPOCHS'; position ``k`` of
            each list describes model ``k``.

    The corpus path, output path prefix and worker count are read from the
    notebook-level ``corpora_file_name``, ``ft_model_file_name_prefix`` and
    ``WORKERS``.
    """
    train_all = picked_model_index == -1

    for i, (model_name, max_vocab_size, embedding_size, window_size, epochs) in \
            enumerate(zip(params['MODEL_NAME'],
                          params['MAX_VOCAB_SIZE'],
                          params['EMBEDDING_SIZE'],
                          params['WINDOW_SIZE'],
                          params['EPOCHS'])):
        # Skip every entry except the requested one, unless all were requested.
        if not train_all and picked_model_index != i:
            continue

        print(f'---- {i} 시작!! ----')
        ft_model = ft.FastTextModel()
        ft_model.create(corpora_file_name,
                        ft_model_file_name_prefix + model_name,
                        max_vocab_size=max_vocab_size,
                        embedding_size=embedding_size,
                        epochs=epochs,
                        window=window_size,
                        workers=WORKERS)

In [15]:
# Build just one FastText model as a quick test: only the parameter set at
# index `ndx` of the training table is used.
ndx = 8
create_multi_fasttext_model(ndx, W2V_TRAIN_PARAMS)

---- 8 시작!! ----
10000개의 단어 내에서 최소 빈도수는 49입니다.
Epoch: 1	Loss after epoch 1: current loss : 0.0, previous loss : 0, diff : 0.0 


In [None]:
# Sanity check: load back the FastText model selected by `ndx`.
# NOTE(review): the variable is still called `w2v_model` although it now holds
# a FastTextModel, shadowing the Word2Vec model loaded earlier in the notebook.
w2v_model = ft.FastTextModel()
w2v_model.load(ft_model_file_name_prefix + W2V_TRAIN_PARAMS['MODEL_NAME'][ndx])

# Sizes of the two vocabulary mappings and of the weight table
# (presumably all equal to the vocabulary size — TODO confirm).
print( len(w2v_model.index2word) )
print( len(w2v_model.word2index) )
print( len(w2v_model.weight) )

# Spot checks of individual entries.
# NOTE(review): 200 and 2583 are arbitrary hard-coded positions, and '약정'
# has to be present in the loaded vocabulary for the lookup to succeed.
print( w2v_model.index2word[200] )
print( w2v_model.word2index['약정'] )
print( w2v_model.weight[2583] )
print( w2v_model.norm_weight[2583] )