# 10-8 사전 훈련된 워드 임베딩(Pre-trained Word Embedding)

# GloVe

## 1. 전처리

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
# Toy sentiment corpus: seven short English phrases.
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
# Labels aligned with `sentences`: 1 = positive, 0 = negative.
y = [1, 0, 0, 1, 1, 0, 1]

In [4]:
# Fit a Keras Tokenizer on the corpus to build the word -> integer index.
t = Tokenizer()
t.fit_on_texts(sentences)
# +1 because the word indices start at 1; index 0 is what padding uses below.
vocab_size = len(t.word_index)+1
print(vocab_size)

16


In [7]:
# Integer encoding: replace every word of each sentence with its index
# from the fitted tokenizer.
X_encoded = t.texts_to_sequences(sentences)
X_encoded

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]

In [8]:
# Length of the longest encoded sentence; used as the padded sequence length.
max_len = max(map(len, X_encoded))
max_len

4

In [10]:
# Labels as a NumPy array.
y_train = np.array(y)
# Padding: extend every encoded sentence at the end ('post') up to max_len.
X_train = pad_sequences(
    X_encoded,
    maxlen=max_len,
    padding='post',
)
X_train

array([[ 1,  2,  3,  4],
       [ 5,  6,  0,  0],
       [ 7,  8,  0,  0],
       [ 9, 10,  0,  0],
       [11, 12,  0,  0],
       [13,  0,  0,  0],
       [14, 15,  0,  0]], dtype=int32)

## 2. Model

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Baseline classifier: a 4-dimensional embedding learned from scratch,
# flattened and fed to a single sigmoid unit for binary sentiment.
model = Sequential([
    Embedding(vocab_size, 4, input_length=max_len),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6966 - acc: 0.5714
Epoch 2/100
1/1 - 0s - loss: 0.6950 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.6933 - acc: 0.5714
Epoch 4/100
1/1 - 0s - loss: 0.6916 - acc: 0.5714
Epoch 5/100
1/1 - 0s - loss: 0.6899 - acc: 0.5714
Epoch 6/100
1/1 - 0s - loss: 0.6883 - acc: 0.5714
Epoch 7/100
1/1 - 0s - loss: 0.6866 - acc: 0.5714
Epoch 8/100
1/1 - 0s - loss: 0.6849 - acc: 0.5714
Epoch 9/100
1/1 - 0s - loss: 0.6833 - acc: 0.5714
Epoch 10/100
1/1 - 0s - loss: 0.6816 - acc: 0.5714
Epoch 11/100
1/1 - 0s - loss: 0.6799 - acc: 0.5714
Epoch 12/100
1/1 - 0s - loss: 0.6783 - acc: 0.5714
Epoch 13/100
1/1 - 0s - loss: 0.6766 - acc: 0.5714
Epoch 14/100
1/1 - 0s - loss: 0.6750 - acc: 0.5714
Epoch 15/100
1/1 - 0s - loss: 0.6733 - acc: 0.7143
Epoch 16/100
1/1 - 0s - loss: 0.6717 - acc: 0.7143
Epoch 17/100
1/1 - 0s - loss: 0.6700 - acc: 0.8571
Epoch 18/100
1/1 - 0s - loss: 0.6683 - acc: 0.8571
Epoch 19/100
1/1 - 0s - loss: 0.6667 - acc: 0.8571
Epoch 20/100
1/1 - 0s - loss: 0.6650 - a

<tensorflow.python.keras.callbacks.History at 0x7f173c0b1190>

## 2-2 사전 훈련된 임베딩 레이어 이용

In [15]:
import numpy as np
# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
embedding_dict = dict()

# The context manager closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        word_vector = line.split()
        if not word_vector:
            # Skip blank lines instead of failing on word_vector[0].
            continue
        word = word_vector[0]  # first field is the word itself
        # Remaining fields are the vector components, stored as a float32 array.
        word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
        embedding_dict[word] = word_vector_arr

print('%s개의 Embedding vector가 있습니다.' % len(embedding_dict))

400000개의 Embedding vector가 있습니다.


In [16]:
# One 100-column row per vocabulary index, with every value initialised to 0.
embedding_matrix = np.zeros(shape=(vocab_size, 100))
embedding_matrix.shape

(16, 100)

In [17]:
# Copy the GloVe vector of every vocabulary word into the row of
# embedding_matrix that matches the word's integer index.
for word, i in t.word_index.items():
    temp = embedding_dict.get(word)  # None when GloVe has no entry for the word
    if temp is None:
        # Leave the row as it is (all zeros from the initialisation).
        continue
    embedding_matrix[i] = temp

## Model2

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Embedding layer initialised from the GloVe matrix.
e = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False,  # keep the pre-trained vectors fixed during training
)
model = Sequential([
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.8060 - acc: 0.2857
Epoch 2/100
1/1 - 0s - loss: 0.7804 - acc: 0.2857
Epoch 3/100
1/1 - 0s - loss: 0.7556 - acc: 0.2857
Epoch 4/100
1/1 - 0s - loss: 0.7316 - acc: 0.4286
Epoch 5/100
1/1 - 0s - loss: 0.7084 - acc: 0.5714
Epoch 6/100
1/1 - 0s - loss: 0.6861 - acc: 0.7143
Epoch 7/100
1/1 - 0s - loss: 0.6648 - acc: 0.7143
Epoch 8/100
1/1 - 0s - loss: 0.6443 - acc: 0.7143
Epoch 9/100
1/1 - 0s - loss: 0.6247 - acc: 0.8571
Epoch 10/100
1/1 - 0s - loss: 0.6059 - acc: 0.8571
Epoch 11/100
1/1 - 0s - loss: 0.5881 - acc: 0.8571
Epoch 12/100
1/1 - 0s - loss: 0.5710 - acc: 0.8571
Epoch 13/100
1/1 - 0s - loss: 0.5548 - acc: 0.8571
Epoch 14/100
1/1 - 0s - loss: 0.5393 - acc: 0.8571
Epoch 15/100
1/1 - 0s - loss: 0.5245 - acc: 0.8571
Epoch 16/100
1/1 - 0s - loss: 0.5104 - acc: 0.8571
Epoch 17/100
1/1 - 0s - loss: 0.4969 - acc: 0.8571
Epoch 18/100
1/1 - 0s - loss: 0.4841 - acc: 0.8571
Epoch 19/100
1/1 - 0s - loss: 0.4717 - acc: 0.8571
Epoch 20/100
1/1 - 0s - loss: 0.4599 - a

<tensorflow.python.keras.callbacks.History at 0x7f16061e34c0>

# Word2Vec

In [14]:
import numpy as np
import gensim

In [None]:
# Load Google's pre-trained Word2Vec model from its binary-format archive.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)
print(word2vec_model.vectors.shape)

In [None]:
def get_vector(word):
    """Return the vector stored in `word2vec_model` for `word`.

    Returns None when `word` is not contained in the model.
    """
    return word2vec_model[word] if word in word2vec_model else None
    
# Start from a fresh (vocab_size, 300) zero matrix. The matrix built for the
# GloVe section has only 100 columns, so writing 300-value Word2Vec vectors
# into it fails, and it would not match Embedding(vocab_size, 300, ...).
embedding_matrix = np.zeros((vocab_size, 300))

# Take each word and its integer index from the training vocabulary.
for word, i in t.word_index.items():
    temp = get_vector(word)  # Word2Vec vector for the word, or None if absent
    if temp is not None:
        # Store the vector in the row that belongs to this word's index;
        # words without a vector keep their all-zero row.
        embedding_matrix[i] = temp

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Embedding layer initialised from the Word2Vec matrix (300 values per word).
e = Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False,  # keep the pre-trained vectors fixed during training
)
model = Sequential([
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.fit(X_train, y_train, epochs=100, verbose=2)