In [1]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.preprocessing import normalize
import multiprocessing
from eunjeon import Mecab

In [2]:
raw_sentence_file_name = './dataset/nsmc/processed/processed_ratings.txt'
tokenized_sentence_file_name = './dataset/nsmc/processed/processed_ratings_tokenized.txt'
w2v_model_file_name = './dataset/nsmc/model/w2v/w2v'

MAX_VOCAB_SIZE = 10000
EMBEDDING_SIZE = 100
WORKERS = multiprocessing.cpu_count() - 1
WINDOW_SIZE = 5
EPOCHS = 10

In [16]:
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'
tokenizer = Mecab()

In [4]:
def create_tokenized_sentence_file(raw_file_name, out_file_name, tokenizer):    
    print(f'Tokenizing을 시작합니다. {raw_file_name}')
    with open(raw_file_name, 'r', encoding='utf-8') as fi, \
        open(out_file_name, 'w', encoding='utf-8') as fo :
        for i, sentence in enumerate(fi):
            tokenized_words = tokenizer.morphs(sentence.replace('\n', '').strip())
            fo.writelines(' '.join(tokenized_words) + '\n')
            
            if i % 10000 == 0 and i > 0:
                print(f'{i} 번째 tokenizing이 완료되었습니다.')
    
    print(f'Tokenizing이 완료되었습니다. {out_file_name}')                
    

In [None]:
create_tokenized_sentence_file(
    raw_sentence_file_name, tokenized_sentence_file_name, Mecab())

In [None]:
def make_w2v_model(in_file_name, out_file_name,
                   max_vocab_size=10000, embedding_size=100,
                   epochs=10, window=5, workers=3):
    # 빈도수 상위 vocab_size 내에 존재하는 단어 중 최소 빈도수를 구함
    def get_min_freq_count(sentences, max_freq_rank):
        from itertools import chain
        import nltk

        fdist = nltk.FreqDist(chain.from_iterable(sentences))
        return fdist.most_common(max_freq_rank)[-1][1] # the count of the the top-kth word

    # 단어 모음
    corpus = [sentence.strip().split(' ') 
              for sentence in open(in_file_name, 'r', encoding='utf-8').readlines()]
    # 빈도수 상위 n위의 최소빈도수 구함 (word2vec 훈련 시 그 이하는 버리기 위함)
    min_freq_cnt = get_min_freq_count(corpus, max_vocab_size)
    
    # gensim word2vec call
    w2v_model = Word2Vec(corpus, 
                     size=embedding_size, 
                     workers=workers, 
                     min_count=min_freq_cnt,
                     sg=1, 
                     iter=epochs,
                     window=window)
    # 저장
    w2v_model.save(model_fname)        
    
    return _post_w2v_process(w2v_model)   
    

In [5]:
def _post_w2v_process(w2v_model):    
    # unknown, padding 토큰 추가
    def _append_unk_pad_vectors(embeddings):
        embedding_size = embeddings.shape[1]
        def get_truncated_normal(mean=0, sd=1, low=-1, upp=1):
            from scipy.stats import truncnorm
            return truncnorm(
                    (low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

        return np.append(embeddings, 
                         get_truncated_normal().rvs(embedding_size * 2).reshape(2, embedding_size), axis=0)    
        
    index2word = w2v_model.wv.index2word
    # unk, pad 추가
    index2word.append( UNK_TOKEN )
    index2word.append( PAD_TOKEN )    
    
    w2v = w2v_model.wv.vectors
    # unk, pad에 해당하는 normal 초기화된 벡터 추가
    w2v = _append_unk_pad_vectors(w2v)
    # cosine유사도 체크를 위해 normalize
    unit_w2v = normalize(w2v, norm='l2', axis=1)

    # word를 index로 변환
    word2index = {w:i for i, w in enumerate(index2word)}
    # 사전. word를 vector로 변환
    dictionary = {w:v for w, v in zip(index2word, unit_w2v)}    
    
    return w2v_model, index2word, word2index, dictionary, unit_w2v

In [None]:
# word2vec 모델 만들고 저장. 그 외 필요한 값들 리턴
w2v_model, index2word, word2index, dictionary, unit_w2v = \
    make_w2v_model(tokenized_sentence_file_name, 
                   w2v_model_file_name, 
                   max_vocab_size=MAX_VOCAB_SIZE, 
                   embedding_size=EMBEDDING_SIZE,
                   epochs=EPOCHS,
                   window=WINDOW_SIZE,
                   workers=WORKERS)

In [8]:
print( len(index2word) )
print( len(word2index) )
print( len(dictionary) )

print( index2word[200] )
print( word2index['송강호'] )
print( dictionary['송강호'] )

10103
10103
10103
합니다
2583
[ 0.04488257 -0.08638541 -0.11681433  0.20049859  0.10155382  0.13070424
 -0.01930784  0.01697905 -0.18308764 -0.01801016 -0.04513846 -0.03508925
 -0.03764806  0.05366753 -0.00871233  0.05305303  0.03162769 -0.16336043
 -0.10599274 -0.04344547  0.01514467  0.07553398 -0.05830617 -0.0450295
 -0.05237505 -0.06838317 -0.01582227 -0.13957131 -0.10054326 -0.11457788
 -0.13638287  0.01462429 -0.04491218  0.03531386  0.10436558 -0.0394524
 -0.1022159   0.14179352  0.00759276  0.19943246 -0.12810362 -0.01620154
  0.09475664  0.01782707  0.09488865 -0.22739975 -0.13044281  0.01617008
  0.07780829 -0.06126102 -0.19827182  0.02977186  0.022885   -0.08928509
  0.27417429  0.05611684 -0.10096212  0.04752864 -0.06542809 -0.22387225
 -0.08617626  0.17361759 -0.10983343 -0.08201186 -0.03511096  0.00191257
 -0.06977492 -0.08441662  0.05033234  0.0575619  -0.03810816  0.15659335
  0.13429893  0.0512108   0.04728433  0.150782    0.11611874  0.03913846
  0.04445213  0.15805049  

In [6]:
# 훈련 완료된 모델 있으면 로드해서 쓴다.
def load_w2v_model(model_file_name):
    w2v_model = Word2Vec.load(model_file_name)

    return _post_w2v_process(w2v_model)

In [7]:
# 훈련 완료된 모델 있으면 로드해서 쓴다.
w2v_model, index2word, word2index, dictionary, unit_w2v = \
    load_w2v_model(w2v_model_file_name)

In [43]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Activation
from keras.preprocessing.sequence import pad_sequences

In [None]:
'''
model = Sequential()
model.add(w2v_model.wv.get_keras_embedding(train_embeddings=False))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid')) # 출력층
model.summary()
'''

In [12]:
train_fname = './dataset/nsmc/processed/processed_ratings_train.txt'
test_fname = './dataset/nsmc/processed/processed_ratings_test.txt'

In [17]:
def load_dataset(filename):
    X, y = [], []
    tokenizer = Mecab()
    with open(filename, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # 문장과 label 읽음
            sentence, label = line.strip().split('\u241E')
            # 문장 tokenizing
            tokenized_sentence = tokenizer.morphs(sentence.replace('\n', '').strip())
            # word -> vecs
            sequence = [word2index[t] if t in word2index else word2index[ UNK_TOKEN ]
                                    for t in tokenized_sentence]
            X.append( sequence )
            y.append( int(label) )

    # 최대 길이로 padding
    X = pad_sequences( X, maxlen=20, padding='post', value=word2index[PAD_TOKEN] )
    X = [unit_w2v[x] for x in X]
    return np.array(X), np.array(y)

        

In [18]:
train_X, train_y = load_dataset(train_fname)
test_X, test_y = load_dataset(test_fname)

print(train_X.shape)
print(train_y.shape)
print(test_X.shape)
print(test_y.shape)

(149995, 20, 100)
(149995,)
(49997, 20, 100)
(49997,)


In [19]:
print( train_X[0] )

[[ 0.09529088  0.01429527 -0.03840789 ...  0.0364302   0.16703626
  -0.04981438]
 [ 0.0948733  -0.10464905 -0.07920648 ... -0.20888194  0.02951386
   0.06941504]
 [-0.12498525  0.01683704 -0.02195003 ...  0.21083888 -0.00475559
  -0.05342693]
 ...
 [-0.13996338 -0.17214974  0.1475329  ...  0.17202141  0.00136334
   0.10124371]
 [-0.13996338 -0.17214974  0.1475329  ...  0.17202141  0.00136334
   0.10124371]
 [-0.13996338 -0.17214974  0.1475329  ...  0.17202141  0.00136334
   0.10124371]]


In [29]:
input_dim, output_dim = w2v_model.wv.vectors.shape

In [46]:
train_y.shape

(149995,)

In [44]:
model = Sequential()

model.add(Embedding(
    input_dim=input_dim, 
    output_dim=output_dim, 
    weights=[w2v_model.wv.vectors]))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2))
model.add(Activation('softmax'))
model.summary()


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 100)         1010100   
_________________________________________________________________
dense_9 (Dense)              (None, None, 128)         12928     
_________________________________________________________________
dense_10 (Dense)             (None, None, 64)          8256      
_________________________________________________________________
dense_11 (Dense)             (None, None, 2)           130       
_________________________________________________________________
activation (Activation)      (None, None, 2)           0         
Total params: 1,031,414
Trainable params: 1,031,414
Non-trainable params: 0
_________________________________________________________________


In [45]:

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(train_X, train_y, epochs=5)


Epoch 1/5


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  logits and labels must have the same first dimension, got logits shape [64000,2] and labels shape [32]
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-45-f3542110fdb7>:4) ]]
	 [[gradient_tape/sequential_4/embedding_4/embedding_lookup/Reshape/_60]]
  (1) Invalid argument:  logits and labels must have the same first dimension, got logits shape [64000,2] and labels shape [32]
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-45-f3542110fdb7>:4) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_2545]

Function call stack:
train_function -> train_function


In [None]:
model.evaluate(test_X, test_y, verbose=2)