# RNN

## 학습 목표

- RNN

## 데이터

In [1]:
!wget -c https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip

--2019-03-29 12:33:43--  https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84188 (82K) [application/zip]
Saving to: ‘sentiment labelled sentences.zip’


2019-03-29 12:33:43 (531 KB/s) - ‘sentiment labelled sentences.zip’ saved [84188/84188]



In [0]:
!unzip -q sentiment\ labelled\ sentences.zip

## 읽기

In [0]:
import pandas as pd

In [0]:
df = pd.read_csv('sentiment labelled sentences/imdb_labelled.txt',
                 sep='\t',
                 header=None)

In [0]:
df.columns = ['review', 'score']

In [9]:
df.head()

Unnamed: 0,review,score
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


## 텐서플로

In [0]:
import tensorflow as tf

In [0]:
tf.enable_eager_execution()

## 토큰화

In [0]:
tok = tf.keras.preprocessing.text.Tokenizer()

In [0]:
tok.fit_on_texts(df['review'])

## 단어 번호

In [14]:
tok.word_index['fun']

354

In [16]:
tok.index_word[1]

'the'

## 텍스트를 단어 번호열로 변환

In [0]:
seq = tok.texts_to_sequences(df['review'])

In [19]:
df.loc[0, 'review']

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [18]:
seq[0]

[3, 28, 28, 28, 287, 407, 1216, 12, 37, 3, 1217, 1218, 408, 143]

## 패딩

In [0]:
MAXLEN = max(len(s) for s in seq)

In [0]:
pad = tf.keras.preprocessing.sequence.pad_sequences(seq, MAXLEN)

In [23]:
pad.shape

(748, 1400)

## 데이터 분할

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(
    pad, df['score'], test_size=.2, random_state=1234)

## 모형

In [0]:
NUM_WORDS = len(tok.index_word) + 1

In [28]:
rnn = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=NUM_WORDS,
                              output_dim=8,
                              input_length=MAXLEN,
                              mask_zero=True),
    tf.keras.layers.LSTM(16, return_sequences=False),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

Instructions for updating:
Colocations handled automatically by placer.


## 컴파일

In [0]:
rnn.compile(optimizer=tf.train.AdamOptimizer(0.001),
            loss='binary_crossentropy',
            metrics=['accuracy'])

## 훈련

In [30]:
rnn.fit(X_train, y_train, epochs=10)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f31c6545b00>

## 예측

In [0]:
y_rnn = rnn.predict_classes(X_test)

## 정확도

In [0]:
from sklearn.metrics import accuracy_score

In [34]:
accuracy_score(y_test, y_rnn)

0.7066666666666667

## 저장

In [35]:
rnn.save('rnn.h5')

