<a href="https://colab.research.google.com/github/bob8dod/NLP_SelfStudying/blob/main/%20Sentiment_Analysis_(using_only_Word_Embedding).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 워드 임베딩의 평균만을 이용해서 긍부정 분류

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.datasets import imdb

In [None]:
# Vocabulary size handed to imdb.load_data as num_words; the same value is reused
# later as the input dimension of the Embedding layer.
vocab_size = 20000

# load_data returns a (train, test) pair, each of which is an (inputs, labels) pair.
train_split, test_split = imdb.load_data(num_words=vocab_size)
x_train, y_train = train_split
x_test, y_test = test_split

# Report how many reviews ended up in each split.
for label, reviews in (('훈련용 리뷰 개수 :', x_train), ('테스트용 리뷰 개수 :', x_test)):
    print(label, len(reviews))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])


훈련용 리뷰 개수 : 25000
테스트용 리뷰 개수 : 25000


  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [None]:
# The dataset is already preprocessed up to integer encoding, so each review is a
# sequence of word indices; show the first ten indices of the first training review.
first_review = x_train[0]
print(first_review[:10])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]


In [None]:
# Print the mean review length (in tokens) of each split; dtype=int makes np.mean
# return an integer instead of a float.
# NOTE(review): '평규' in the first label looks like a typo for '평균' — left as-is
# here because it is part of the printed output.
for template, reviews in (
    ('훈련용 리뷰의 평규 길이: {}', x_train),
    ('테스트용 리뷰의 평균 길이: {}', x_test),
):
    review_lengths = [len(review) for review in reviews]
    print(template.format(np.mean(review_lengths, dtype=int)))

훈련용 리뷰의 평규 길이: 238
테스트용 리뷰의 평균 길이: 230


In [None]:
# Bring both review sets to a common length of 400 tokens so that each split
# becomes a rectangular (num_reviews, max_len) array.
max_len = 400

# Apply the same padding to the training and the test reviews.
x_train, x_test = (
    pad_sequences(reviews, maxlen=max_len) for reviews in (x_train, x_test)
)

# Confirm the resulting array shapes.
for label, padded in (
    ('x_train의 크기(shape) :', x_train),
    ('x_test의 크기(shape) :', x_test),
):
    print(label, padded.shape)

x_train의 크기(shape) : (25000, 400)
x_test의 크기(shape) : (25000, 400)


모델 설계

In [None]:
# Embedding-average classifier: look up a 50-dimensional vector for each of the
# max_len token positions, average those vectors into one vector per review, and
# feed that single vector to one sigmoid unit.
# NOTE(review): mask_zero is not set on the Embedding layer, so the padded
# positions presumably take part in the average too — confirm this is intended.
model = Sequential([
    Embedding(vocab_size, 50, input_length=max_len),
    GlobalAveragePooling1D(),  # takes the mean of all the word vectors
    Dense(1, activation='sigmoid'),
])

# Early stopping watches validation loss (lower is better, patience of 4 epochs).
es = EarlyStopping(monitor='val_loss', patience=4, mode='min', verbose=1)

# The checkpoint watches validation accuracy (higher is better) and only
# overwrites the .h5 file when that value improves.
mc = ModelCheckpoint(
    'embedding_average_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Binary cross-entropy pairs with the model's single sigmoid output. The accuracy
# metric is registered under the name 'acc', which is why the callbacks can monitor
# it on the validation split as 'val_acc'.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# Keep the object returned by fit() in a variable: this preserves the per-epoch
# training/validation metrics for later inspection and stops the cell from ending
# on a bare expression, which would display a "<...History at 0x...>" repr whose
# memory address differs on every run.
# validation_split=0.2 holds out 20% of the training data for the val_* metrics
# that the `es` and `mc` callbacks monitor.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    callbacks=[es, mc],
    validation_split=0.2,
)

Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.76660, saving model to embedding_average_model.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.76660 to 0.86540, saving model to embedding_average_model.h5
Epoch 3/10

Epoch 00003: val_acc improved from 0.86540 to 0.87720, saving model to embedding_average_model.h5
Epoch 4/10

Epoch 00004: val_acc improved from 0.87720 to 0.88340, saving model to embedding_average_model.h5
Epoch 5/10

Epoch 00005: val_acc improved from 0.88340 to 0.88840, saving model to embedding_average_model.h5
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.88840
Epoch 7/10

Epoch 00007: val_acc improved from 0.88840 to 0.89300, saving model to embedding_average_model.h5
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.89300
Epoch 9/10

Epoch 00009: val_acc improved from 0.89300 to 0.89440, saving model to embedding_average_model.h5
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.89440


<tensorflow.python.keras.callbacks.History at 0x7fae0f729f10>

In [None]:
# Reload the checkpoint file written by ModelCheckpoint during training (it only
# saved when val_acc improved) and score it on the test set.
loaded_model = load_model('embedding_average_model.h5')

# evaluate() returns the loss first and the compiled 'acc' metric second.
test_scores = loaded_model.evaluate(x_test, y_test)
test_accuracy = test_scores[1]
print("\n 테스트 정확도: %.4f" % test_accuracy)


 테스트 정확도: 0.8876


별다른 신경망을 추가하지 않고, 단어 벡터의 평균만으로도 88.76%라는 높은 테스트 정확도를 얻을 수 있다.