# 수치별 결과

## 로이터 데이터

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.datasets import reuters, imdb
from keras.preprocessing.text import Tokenizer

In [42]:
from google.colab import files
# Open the Colab upload dialog; the result maps each uploaded file name to its content.
uploaded_file = files.upload()
# Keep the name of the first uploaded file for the later read_csv call.
filename = list(uploaded_file)[0]

Saving spam.csv to spam.csv


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
# One shared seed value for both the TensorFlow and the NumPy global generators.
seed = 2021
tf.random.set_seed(seed)
np.random.seed(seed)

In [4]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, SimpleRNN, GRU
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split

### Reuters News
- 1000, 100
- 5000, 500

In [33]:
# ornn: one of the keys of ornn_dict, selecting the recurrent layer type
def reuters_rnn(word_len, max_len, ornn, n_di=120, output=46):
    """Train one recurrent classifier on the Keras Reuters dataset and print its test accuracy.

    Args:
        word_len: Passed to ``reuters.load_data`` as ``num_words`` and used as
            the input dimension of the Embedding layer.
        max_len: Length every sequence is padded/truncated to by ``pad_sequences``.
        ornn: Key into the module-level ``ornn_dict`` choosing the recurrent
            layer class.
        n_di: Number of units passed to the recurrent layer.
        output: Number of classes; width of the softmax layer and of the
            one-hot encoded labels.

    Returns:
        None. The test accuracy of the best checkpoint is printed.

    Side effects:
        Writes the best checkpoint (HDF5) under the ``model/`` directory.
    """
    import os  # local import: only used to make sure the checkpoint directory exists

    (X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=word_len, test_split=0.2)
    # Bring every article to the same length, max_len
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = pad_sequences(X_test, maxlen=max_len)
    # One-hot encoding; num_classes is fixed to `output` so that both splits
    # always match the width of the final Dense layer.
    Y_train = to_categorical(y_train, num_classes=output)
    Y_test = to_categorical(y_test, num_classes=output)
    # Recurrent model; the layer class is looked up in ornn_dict by key
    model = Sequential([
        Embedding(word_len, 120),
        ornn_dict[ornn](n_di),
        Dense(output, activation='softmax')
    ])
    # NOTE: {word_len,max_len} formats a tuple, so the file name contains e.g. "(1000, 100)".
    checkpoint_path = f'model/reuter_{ornn}_{word_len,max_len}_best_model.h5'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    # Callbacks - best-model checkpoint and early stopping on val_loss (patience 4)
    es = EarlyStopping(monitor='val_loss', verbose=0, patience=4)
    mc = ModelCheckpoint(checkpoint_path, verbose=0, save_best_only=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=100, epochs=50,
              validation_split=0.2, verbose=0,
              callbacks=[mc, es])
    # Evaluate the saved best checkpoint rather than the last-epoch weights
    del model
    model = load_model(checkpoint_path)
    print(f"\n{ornn.upper()} 빈도수{word_len}, {max_len}단어까지 테스트 정확도: {model.evaluate(X_test, Y_test, verbose=0)[1]}")

In [5]:
# Short key -> Keras recurrent layer class; the training helpers look the layer up by key.
ornn_dict = dict(srnn=SimpleRNN, gru=GRU, lstm=LSTM)

In [40]:
# Each inner list is [word_len, max_len]; every recurrent layer type is run for each setting.
for i in [[1000, 100], [5000, 500]]:
    for key in ornn_dict:
        reuters_rnn(*i, key, n_di=120)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])



RNN 빈도수1000, 100단어까지 테스트 정확도: 0.4919857382774353

GRU 빈도수1000, 100단어까지 테스트 정확도: 0.6656277775764465

LSTM 빈도수1000, 100단어까지 테스트 정확도: 0.6821014881134033

RNN 빈도수5000, 500단어까지 테스트 정확도: 0.4728406071662903

GRU 빈도수5000, 500단어까지 테스트 정확도: 0.6179875135421753

LSTM 빈도수5000, 500단어까지 테스트 정확도: 0.6527159214019775


### SMS Spam

In [6]:
from sklearn.model_selection import train_test_split

In [44]:
# Data preprocessing: load the uploaded CSV, encode labels, tokenize, pad and split.
df = pd.read_csv(filename, encoding='latin1')
# Remove the three unnamed columns; only 'v1' (label) and 'v2' (text) are used below.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
# Manual label encoding: ham -> 0, spam -> 1.
# An explicit map + integer cast avoids relying on replace() silently downcasting
# an object column; the cast raises if a label other than ham/spam is present.
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1}).astype('int64')
# Keep only the first occurrence of each message text.
df = df.drop_duplicates('v2', keep='first')
X_data = df['v2'].values
y_data = df['v1'].values
# Fit the tokenizer on the messages and turn each message into a list of word indices.
t = Tokenizer()
t.fit_on_texts(X_data)
sequences = t.texts_to_sequences(X_data)
# +1 on top of the number of indexed words; used as the Embedding input dimension.
vocab_size = len(t.word_index) + 1
word_to_index = t.word_index
X_data = sequences
# Bring every sequence to length 100.
data = pad_sequences(X_data, maxlen=100)
X_train, X_test, y_train, y_test = train_test_split(
    data, y_data, test_size=.2, random_state=2021
)

In [56]:
# ornn: one of the keys of ornn_dict, selecting the recurrent layer type
def sms_spam_rnn(ornn, word_len=vocab_size, max_len=100, n_di=32, output=1):
    """Train one recurrent binary classifier on the SMS spam split and print its test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    produced by the preprocessing cell.

    Args:
        ornn: Key into the module-level ``ornn_dict`` choosing the recurrent
            layer class.
        word_len: Input dimension of the Embedding layer. The default is the
            value ``vocab_size`` had when this function was defined.
        max_len: Not used inside the function body; kept so existing calls
            stay valid.
        n_di: Number of units passed to the recurrent layer.
        output: Width of the final sigmoid layer.

    Returns:
        None. The test accuracy of the best checkpoint is printed.

    Side effects:
        Writes the best checkpoint (HDF5) under the ``model/`` directory.
    """
    import os  # local import: only used to make sure the checkpoint directory exists

    # Recurrent model; the layer class is looked up in ornn_dict by key
    model = Sequential([
        Embedding(word_len, 32),
        ornn_dict[ornn](n_di),
        Dense(output, activation='sigmoid')
    ])
    checkpoint_path = f'model/sms_spam_{ornn}_best_model.h5'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    # Callbacks - best-model checkpoint and early stopping on val_loss (patience 4)
    es = EarlyStopping(monitor='val_loss', verbose=0, patience=4)
    mc = ModelCheckpoint(checkpoint_path, verbose=0, save_best_only=True)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, y_train, batch_size=60, epochs=60,
              validation_split=0.2, verbose=0,
              callbacks=[mc, es])
    # Evaluate the saved best checkpoint rather than the last-epoch weights
    del model
    model = load_model(checkpoint_path)
    print(f"\n{ornn.upper()} 테스트 정확도: {model.evaluate(X_test, y_test, verbose=0)[1]}")

In [57]:
# Train and report the SMS spam test accuracy once per recurrent layer type.
for key in ornn_dict:
    sms_spam_rnn(key)


RNN 테스트 정확도: 0.9816247820854187

GRU 테스트 정확도: 0.9845260977745056

LSTM 테스트 정확도: 0.988394558429718


### IMDB Review
- 5000,500
- 10000,1000

In [16]:
# ornn: one of the keys of ornn_dict, selecting the recurrent layer type
def imdb_rnn(word_len, max_len, ornn, n_di=120, output=1):
    """Train one recurrent binary classifier on the Keras IMDB dataset and print its test accuracy.

    The dataset's test portion is split (stratified, seeded with the
    module-level ``seed``) into a validation part used during training and a
    held-out part (``test_size=.6``) used for the final evaluation.

    Args:
        word_len: Passed to ``imdb.load_data`` as ``num_words`` and used as
            the input dimension of the Embedding layer.
        max_len: Length every review is padded/truncated to by ``pad_sequences``.
        ornn: Key into the module-level ``ornn_dict`` choosing the recurrent
            layer class.
        n_di: Number of units passed to the recurrent layer.
        output: Width of the final sigmoid layer.

    Returns:
        None. The test accuracy of the best checkpoint is printed.

    Side effects:
        Writes the best checkpoint (HDF5) under the ``model/`` directory and
        logs one line per epoch (``verbose=2``).
    """
    import os  # local import: only used to make sure the checkpoint directory exists

    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=word_len)
    # Bring every review to the same length, max_len
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = pad_sequences(X_test, maxlen=max_len)

    # Carve a validation set out of the test portion; the remainder stays held out
    X_val, X_test, y_val, y_test = train_test_split(
        X_test, y_test, stratify=y_test, test_size=.6, random_state=seed
    )
    # Recurrent model; the layer class is looked up in ornn_dict by key
    model = Sequential([
        Embedding(word_len, 120),
        ornn_dict[ornn](n_di),
        Dense(output, activation='sigmoid')
    ])
    # NOTE: {word_len,max_len} formats a tuple, so the file name contains e.g. "(5000, 500)".
    checkpoint_path = f'model/imdb{ornn}_{word_len,max_len}_best_model.h5'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    # Callbacks - best-model checkpoint and early stopping on val_loss (patience 10)
    es = EarlyStopping(monitor='val_loss', verbose=0, patience=10)
    mc = ModelCheckpoint(checkpoint_path, verbose=0, save_best_only=True)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, y_train, batch_size=100, epochs=50,
              validation_data=(X_val, y_val), verbose=2,
              callbacks=[mc, es])
    # Evaluate the saved best checkpoint on the held-out part of the split
    del model
    model = load_model(checkpoint_path)
    print(f"\n{ornn.upper()} 빈도수{word_len}, {max_len}단어까지 테스트 정확도: {model.evaluate(X_test, y_test, verbose=0)[1]}")

In [None]:
# Each inner list is [word_len, max_len]; every recurrent layer type is run for each setting.
for i in [[5000, 500], [10000, 1000]]:
    for key in ornn_dict:
        imdb_rnn(*i, key)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Epoch 1/50
250/250 - 65s - loss: 0.6459 - accuracy: 0.5970 - val_loss: 0.5788 - val_accuracy: 0.6921
Epoch 2/50
250/250 - 67s - loss: 0.5100 - accuracy: 0.7521 - val_loss: 1.5981 - val_accuracy: 0.5580
Epoch 3/50
250/250 - 68s - loss: 0.5235 - accuracy: 0.7481 - val_loss: 0.5347 - val_accuracy: 0.7357
Epoch 4/50
250/250 - 67s - loss: 0.5021 - accuracy: 0.7577 - val_loss: 0.5068 - val_accuracy: 0.7433
Epoch 5/50
250/250 - 67s - loss: 0.4663 - accuracy: 0.7860 - val_loss: 0.5784 - val_accuracy: 0.7064
Epoch 6/50
250/250 - 67s - loss: 0.4182 - accuracy: 0.8118 - val_loss: 0.4725 - val_accuracy: 0.7920
Epoch 7/50
250/250 - 64s - loss: 0.4114 - accuracy: 0.8110 - val_loss: 0.5984 - val_accuracy: 0.6655
Epoch 8/50
250/250 - 66s - loss: 0.4222 - accuracy: 0.8103 - val_loss: 0.6217 - val_accuracy: 0.7162
Epoch 9/50
250/250 - 64s - loss: 0.4075 - accuracy: 0.8115 - val_loss: 0.5066 - val_accuracy: 0.7791
Epoch 10/50
250/250 - 65s - loss: 0.3178 - accuracy: 0.8657 - val_loss: 0.4199 - val_accura