# Keras - IMDB 영화리뷰 감성분석(LSTM + CNN)
- Embedding
- Conv1D
    - Conv1D
    - Conv1D + Conv1D
    - Conv1D + LSTM + Dense
    - Conv1D + Dense + Dense
- Maxpooling
- LSTM
- Dense

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Install a process-wide "ignore" filter for every warning category,
# presumably to keep library notices out of the cell outputs.
# NOTE(review): this also hides warnings that may matter (e.g. deprecations);
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [5]:
import tensorflow as tf

# Single seed reused by the train/test split and by Keras so that reruns of the
# notebook start from the same random state.
seed = 2022

# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
# The original seeded only NumPy and TensorFlow, leaving Python's own generator
# unseeded.
tf.keras.utils.set_random_seed(seed)

In [6]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [7]:
# Vocabulary cap handed to the loader and reused later as the Embedding input size.
num_words = 10000

# Only the first split returned by the loader is used; the second split is
# discarded and a hold-out set is carved out of the first one further down.
(X_train, y_train), _ = imdb.load_data(num_words=num_words)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [8]:
# Hold out 20% of the loaded reviews as the test set, keeping the label
# proportions identical in both parts (stratified on y_train).
# NOTE(review): X_train / y_train are overwritten here, so re-running this cell
# without re-running the load cell splits the already-reduced training set again.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=seed,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20000,), (5000,), (20000,), (5000,))

### pad_sequences

In [9]:
# Every review is padded/truncated to exactly this many tokens so the splits
# become rectangular 2-D arrays.
max_len = 500

# Apply the same padding to both splits in one pass.
X_train, X_test = (
    pad_sequences(split, maxlen=max_len) for split in (X_train, X_test)
)
X_train.shape, X_test.shape

((20000, 500), (5000, 500))

## Case1. Conv1D + Conv1D
- 단어 빈도 수 : 10,000개 (총 88,584)
- 리뷰 최대 단어 길이 : 500개 (총 2,984)


In [10]:
# Case 1: two stacked Conv1D/MaxPooling1D stages followed by a sigmoid classifier.
model1 = Sequential()
model1.add(Embedding(num_words, 120, input_length=max_len))  # (500, 120) per review
model1.add(Dropout(0.5))
model1.add(Conv1D(64, 7, activation='relu'))                 # 500 -> 494 steps
model1.add(MaxPooling1D(7))                                  # 494 -> 70 steps
model1.add(Conv1D(64, 5, activation='relu'))                 # 70 -> 66 steps
model1.add(MaxPooling1D(5))                                  # 66 -> 13 steps
# Reduce the remaining time axis to one value per filter. This is a max over
# steps, not a flatten: the output has 64 features, not 13 * 64.
model1.add(GlobalMaxPooling1D())
model1.add(Dense(1, activation='sigmoid'))
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 120)          1200000   
                                                                 
 dropout (Dropout)           (None, 500, 120)          0         
                                                                 
 conv1d (Conv1D)             (None, 494, 64)           53824     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 70, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 64)            20544     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 13, 64)           0         
 1D)                                                    

In [11]:
# Binary cross-entropy matches the single sigmoid output unit of model1.
# Arguments are passed by keyword on purpose: the third positional parameter of
# `compile` is `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so the
# positional form would no longer request accuracy after an upgrade.
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
# File that receives the best checkpoint of the Case 1 model; it is reloaded
# for the final evaluation.
model_path1 = 'best-imbd-Conv1d_v1.h5'

# Overwrite the checkpoint only when the monitored value improves, and report
# each decision in the training log.
mc1 = ModelCheckpoint(filepath=model_path1, save_best_only=True, verbose=1)

# Stop training once five consecutive epochs bring no improvement.
es1 = EarlyStopping(patience=5)

In [13]:
# Train for at most 30 epochs; 20% of the training data is held out for
# validation, which drives both the checkpoint and the early-stopping callbacks.
hist1 = model1.fit(
    X_train,
    y_train,
    batch_size=60,
    epochs=30,
    validation_split=0.2,
    callbacks=[mc1, es1],
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.35939, saving model to best-imbd-Conv1d_v1.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.35939 to 0.32018, saving model to best-imbd-Conv1d_v1.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.32018
Epoch 4/30
Epoch 4: val_loss did not improve from 0.32018
Epoch 5/30
Epoch 5: val_loss did not improve from 0.32018
Epoch 6/30
Epoch 6: val_loss did not improve from 0.32018
Epoch 7/30
Epoch 7: val_loss did not improve from 0.32018


### result : 86.60

In [14]:
# Score the checkpointed weights rather than the last-epoch weights left in
# model1; evaluate returns [loss, accuracy] on the held-out test split.
best_model1 = load_model(filepath=model_path1)
best_model1.evaluate(x=X_test, y=y_test)



[0.31354641914367676, 0.8659999966621399]

## Case2. Conv1D + LSTM
- 단어 빈도 수 : 10,000개 (총 88,584)
- 리뷰 최대 단어 길이 : 500개 (총 2,984)


In [15]:
# Case 2: one Conv1D/MaxPooling1D stage shortens the sequence before an LSTM
# summarises it into a single vector for the sigmoid classifier.
model2 = Sequential()
model2.add(Embedding(num_words, 120, input_length=max_len))  # (500, 120) per review
model2.add(Dropout(0.5))
model2.add(Conv1D(64, 5, activation='relu'))                 # 500 -> 496 steps
model2.add(MaxPooling1D(5))                                  # 496 -> 99 steps
model2.add(LSTM(100))                                        # 99 steps -> 100-dim vector
model2.add(Dense(1, activation='sigmoid'))
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 120)          1200000   
                                                                 
 dropout_1 (Dropout)         (None, 500, 120)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, 496, 64)           38464     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 99, 64)           0         
 1D)                                                             
                                                                 
 lstm (LSTM)                 (None, 100)               66000     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                      

In [16]:
# Binary cross-entropy matches the single sigmoid output unit of model2.
# Arguments are passed by keyword on purpose: the third positional parameter of
# `compile` is `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so the
# positional form would no longer request accuracy after an upgrade.
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Keep only the best checkpoint of the Case 2 model on disk and stop once
# five consecutive epochs bring no improvement.
model_path2 = 'best-imbd-Conv1d_v2.h5'
mc2 = ModelCheckpoint(model_path2, verbose=1, save_best_only=True)
es2 = EarlyStopping(patience=5)

# Train for at most 30 epochs with 20% of the training data held out for validation.
hist2 = model2.fit(X_train, y_train, validation_split = 0.2,
                 epochs = 30, batch_size = 60, callbacks = [mc2, es2])

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.32012, saving model to best-imbd-Conv1d_v2.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.32012 to 0.26013, saving model to best-imbd-Conv1d_v2.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.26013
Epoch 4/30
Epoch 4: val_loss did not improve from 0.26013
Epoch 5/30
Epoch 5: val_loss did not improve from 0.26013
Epoch 6/30
Epoch 6: val_loss did not improve from 0.26013
Epoch 7/30
Epoch 7: val_loss did not improve from 0.26013


### result : 88.88

In [17]:
# Score the checkpointed weights rather than the last-epoch weights left in
# model2; evaluate returns [loss, accuracy] on the held-out test split.
best_model2 = load_model(filepath=model_path2)
best_model2.evaluate(x=X_test, y=y_test)



[0.2742072641849518, 0.8888000249862671]

## Case3. Conv1D + Dense
- 단어 빈도 수 : 10,000개 (총 88,584)
- 리뷰 최대 단어 길이 : 500개 (총 2,984)


In [18]:
# Case 3: one Conv1D/MaxPooling1D stage, a global max over time, then a small
# fully connected head ending in the sigmoid classifier.
model3 = Sequential()
model3.add(Embedding(num_words, 120, input_length=max_len))  # (500, 120) per review
model3.add(Dropout(0.5))
model3.add(Conv1D(64, 5, activation='relu'))                 # 500 -> 496 steps
model3.add(MaxPooling1D(5))                                  # 496 -> 99 steps
model3.add(GlobalMaxPooling1D())                             # 99 steps -> 64 features
model3.add(Dense(100, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))
model3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 120)          1200000   
                                                                 
 dropout_2 (Dropout)         (None, 500, 120)          0         
                                                                 
 conv1d_3 (Conv1D)           (None, 496, 64)           38464     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 99, 64)           0         
 1D)                                                             
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 100)              

In [19]:
# Binary cross-entropy matches the single sigmoid output unit of model3.
# Arguments are passed by keyword on purpose: the third positional parameter of
# `compile` is `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so the
# positional form would no longer request accuracy after an upgrade.
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Keep only the best checkpoint of the Case 3 model on disk and stop once
# five consecutive epochs bring no improvement.
model_path3 = 'best-imbd-Conv1d_v3.h5'
mc3 = ModelCheckpoint(model_path3, verbose=1, save_best_only=True)
es3 = EarlyStopping(patience=5)

# Train for at most 30 epochs with 20% of the training data held out for validation.
hist3 = model3.fit(X_train, y_train, validation_split = 0.2,
                 epochs = 30, batch_size = 60, callbacks = [mc3, es3])

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.37521, saving model to best-imbd-Conv1d_v3.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.37521 to 0.28540, saving model to best-imbd-Conv1d_v3.h5
Epoch 3/30
Epoch 3: val_loss improved from 0.28540 to 0.27936, saving model to best-imbd-Conv1d_v3.h5
Epoch 4/30
Epoch 4: val_loss did not improve from 0.27936
Epoch 5/30
Epoch 5: val_loss did not improve from 0.27936
Epoch 6/30
Epoch 6: val_loss did not improve from 0.27936
Epoch 7/30
Epoch 7: val_loss did not improve from 0.27936
Epoch 8/30
Epoch 8: val_loss did not improve from 0.27936


### result : 88.70

In [20]:
# Score the checkpointed weights rather than the last-epoch weights left in
# model3; evaluate returns [loss, accuracy] on the held-out test split.
best_model3 = load_model(filepath=model_path3)
best_model3.evaluate(x=X_test, y=y_test)



[0.286623477935791, 0.8870000243186951]