# IMDB 영화 리뷰 감성 분석 - LSTM

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
import warnings
# Silence every Python warning for the rest of the session so cell output
# stays readable. NOTE(review): this also hides deprecation notices raised by
# the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the IMDB reviews without a vocabulary cap so the raw corpus can be
# inspected first. load_data() yields a (train, test) pair of (inputs, labels)
# pairs; flatten it straight into the four arrays.
X_train, y_train, X_test, y_test = (
    array for split in imdb.load_data() for array in split
)
X_train.shape, X_test.shape, y_train.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


((25000,), (25000,), (25000,))

In [3]:
# Basic corpus statistics: vocabulary size plus the longest and mean review
# length (in tokens) of the training split.
review_lengths = [len(review) for review in X_train]

print('등장한 단어 수 : ', len(imdb.get_word_index()))
print('리뷰 최대 길이 : ', max(review_lengths))
print('리뷰 평균 길이 : ', sum(review_lengths) / len(review_lengths))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
등장한 단어 수 :  88584
리뷰 최대 길이 :  2494
리뷰 평균 길이 :  238.71364


### LSTM으로 IMDB 리뷰 감성 분석
- 사용할 단어 수 : 빈도 상위 10,000개 (전체 88,584개)
- 리뷰당 단어 수 : 최대 500 (평균 약 239)
- test data 중 40%(10,000개)는 검증용으로 사용


In [4]:
import tensorflow as tf
# One shared seed for both RNGs in play (NumPy and TensorFlow) so repeated
# runs start from the same random state.
seed = 2022
for seed_rng in (np.random.seed, tf.random.set_seed):
    seed_rng(seed)

# Vocabulary cap (passed to imdb.load_data) and the fixed sequence length
# every review is padded/truncated to (passed to pad_sequences).
num_words, max_len = 10_000, 500

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [6]:
# Reload the dataset, this time with the vocabulary capped at `num_words`.
# load_data() yields a (train, test) pair of (inputs, labels) pairs; flatten
# it straight into the four arrays.
X_train, y_train, X_test, y_test = (
    array
    for split in imdb.load_data(num_words=num_words)
    for array in split
)
X_train.shape, X_test.shape, y_train.shape

((25000,), (25000,), (25000,))

In [7]:
# Bring every review to exactly `max_len` tokens so both splits become
# rectangular 2-D arrays.
X_train, X_test = (
    pad_sequences(reviews, maxlen=max_len) for reviews in (X_train, X_test)
)
X_train.shape, X_test.shape

((25000, 500), (25000, 500))

In [8]:
from sklearn.model_selection import train_test_split

# Carve a validation set out of the original test split: 40% goes to
# validation, stratified on the labels so both parts keep the class balance.
# NOTE: this rebinds X_test / y_test, so re-running the cell splits again.
split_options = dict(stratify=y_test, test_size=0.4, random_state=seed)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, **split_options)
tuple(part.shape for part in (X_test, X_valid, y_test, y_valid))

((15000, 500), (10000, 500), (15000,), (10000,))

- case 1) LSTM
  - embedding dim = 100
  - hidden units = 128

In [9]:
# Case-1 hyperparameters: width of each word vector and number of LSTM units.
embedding_dim, hidden_units = 100, 128

In [10]:
# Case-1 network, assembled layer by layer:
#   token ids -> learned embeddings -> single LSTM -> sigmoid probability.
model1 = Sequential()
model1.add(Embedding(num_words, embedding_dim, input_length=max_len))
model1.add(LSTM(hidden_units))
model1.add(Dense(1, activation='sigmoid'))
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,117,377
Trainable params: 1,117,377
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Use keyword arguments: the third positional parameter of compile() is
# `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so passing
# ['accuracy'] positionally silently changes meaning across versions.
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Checkpoint file that always holds the weights with the lowest validation loss.
model_path = 'best-imdb-lstm.h5'
# `monitor='val_loss'` is the default for both callbacks; it is spelled out so
# the checkpointing and stopping criteria are visible at a glance.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
# Stop training once val_loss has failed to improve for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=5)

In [12]:
# Train for up to 30 epochs; `es` ends the run early and `mc` keeps the best
# checkpoint on disk. `validation_data` is passed as a tuple (x_val, y_val),
# which is the documented form; a list is not part of the documented contract.
hist1 = model1.fit(
    X_train, y_train, epochs=30, batch_size=64,
    validation_data=(X_valid, y_valid),
    callbacks=[mc, es]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.38394, saving model to best-imdb-lstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.38394 to 0.36891, saving model to best-imdb-lstm.h5
Epoch 3/30
Epoch 00003: val_loss improved from 0.36891 to 0.33342, saving model to best-imdb-lstm.h5
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.33342
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.33342
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.33342
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.33342
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.33342


In [13]:
# Score the checkpoint written by ModelCheckpoint (the epoch with the lowest
# val_loss) on the held-out test split; evaluate() returns [loss, accuracy].
best_model = load_model(model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.32989606261253357, 0.8610000014305115]