# 用RNN做情意分析

In [0]:
%env KERAS_BACKEND = tensorflow

In [0]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

### 讀入IMDB電影數據庫

In [0]:
from keras.datasets import imdb

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

在colab讀入資料時會發生錯誤，需要將numpy版本指定為1.16.2。 [Error in Keras Imdb load_data](https://github.com/tensorflow/tensorflow/issues/28102)

`!pip install numpy==1.16.2`

In [0]:
# Load the IMDB review dataset as (inputs, labels) pairs for training and testing.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=10000,  # vocabulary size; matches the Embedding layers' input dimension
)

In [0]:
# Report how many reviews each split contains.
n_train, n_test = len(x_train), len(x_test)
print('訓練總筆數:', n_train)
print('測試總筆數:', n_test)

### 輸入的資料
每個數字代表一個英文單字，出現頻率越高數字越小。

In [0]:
# Uncomment the line below to display one encoded review
# (a sequence of integers, one per word) before padding is applied:
# x_train[24999]

### 輸出資料的部分
0為負評 

1為正評

In [0]:
# Show the first ten training labels (0 = negative review, 1 = positive review).
y_train[:10]

### 將輸入放進神經網路中
* 設定輸入文字長度的上限
* 將每段文字設定為一樣長，太短者補0。

In [0]:
from keras.preprocessing import sequence

In [0]:
# Bring every review to a fixed length of 150 tokens: longer reviews are cut,
# shorter ones are zero-padded, so both splits become rectangular arrays.
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=150)
    for reviews in (x_train, x_test)
)

In [0]:
# Check the padded training array's shape; the second dimension should be
# the maxlen of 150 used when padding.
x_train.shape

### 打造RNN (課堂範例 model)
* 先將 10000 維的文字壓到 N 維
* 然後用 K 個 LSTM 神經元做隱藏層
* 最後一個 output, 直接用 sigmoid 送出

In [0]:
# Hyperparameters of the classroom example model:
#   N - dimension each word is embedded into
#   K - number of LSTM units in the hidden layer
N, K = 3, 4

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

In [0]:
# Classroom model: 10000-word vocabulary -> N-dim embedding -> K LSTM cells
# -> a single sigmoid output.
model = Sequential([
    Embedding(10000, N),
    LSTM(K),  # K LSTM cells
    Dense(1, activation='sigmoid'),
])

In [0]:
# Binary classification setup: cross-entropy loss, Adam optimizer, report accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Print the layer-by-layer summary of the model; the LSTM parameter count
# is worked out by hand in the text that follows this cell.
model.summary()

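每個LSTM單元在每個時間點有 N = 3 個input，加上前一次 K = 4 個 h，等同 7 個input。

每個LSTM單元內部有 4 組權重（3 個 gate 加上 1 個候選狀態），每組各有 7 個權重與 1 個 bias，共 4 個 bias。

因此一個LSTM單元有 4*7 + 4 = 32 個參數。

共有 K 個LSTM單元 => (4*7 + 4) * K = 32 * 4 = 128 個參數，應與上方 `model.summary()` 中 LSTM 層的參數個數相符。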

### 訓練

In [0]:
# Train the classroom model: 5 epochs, mini-batches of 32 reviews.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

In [0]:
# Evaluate on the held-out test split; score[0] is the loss and score[1] the accuracy.
score = model.evaluate(x_test, y_test)
print(
    f'測試資料的 loss = {score[0]}',
    f'測試資料正確率 = {score[1]}',
    sep='\n',
)

### 修改文字維度以及LSTM的神經元個數

### model1

In [0]:
# model_1: 100-dim word embedding feeding a 5-unit LSTM.
N1, K1 = 100, 5

model_1 = Sequential([
    Embedding(10000, N1),
    LSTM(K1),
    Dense(1, activation='sigmoid'),
])
model_1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_1.summary()

In [0]:
# Hand-computed LSTM parameter count for model_1, to compare with its summary:
# 4 weight sets per unit, each with N1 input weights, K1 recurrent weights and 1 bias.
4 * K1 * (N1 + K1 + 1)

In [0]:
# Train model_1: 5 epochs, mini-batches of 32 reviews.
model_1.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

In [0]:
# Evaluate model_1 on the test split; score_1[0] is the loss and score_1[1] the accuracy.
score_1 = model_1.evaluate(x_test, y_test)
print(
    f'測試資料的 loss = {score_1[0]}',
    f'測試資料正確率 = {score_1[1]}',
    sep='\n',
)

### model2

In [0]:
# model_2: 100-dim word embedding feeding a 10-unit LSTM.
N2, K2 = 100, 10

model_2 = Sequential([
    Embedding(10000, N2),
    LSTM(K2),
    Dense(1, activation='sigmoid'),
])
model_2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_2.summary()

In [0]:
# Hand-computed LSTM parameter count for model_2, to compare with its summary:
# 4 weight sets per unit, each with N2 input weights, K2 recurrent weights and 1 bias.
4 * K2 * (N2 + K2 + 1)

In [0]:
# Train model_2: 5 epochs, mini-batches of 32 reviews.
model_2.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

In [0]:
# Evaluate model_2 on the test split; score_2[0] is the loss and score_2[1] the accuracy.
score_2 = model_2.evaluate(x_test, y_test)
print(
    f'測試資料的 loss = {score_2[0]}',
    f'測試資料正確率 = {score_2[1]}',
    sep='\n',
)

### model3

In [0]:
# model_3: 500-dim word embedding feeding a 50-unit LSTM.
N3, K3 = 500, 50

model_3 = Sequential([
    Embedding(10000, N3),
    LSTM(K3),
    Dense(1, activation='sigmoid'),
])
model_3.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_3.summary()

In [0]:
# Hand-computed LSTM parameter count for model_3, to compare with its summary:
# 4 weight sets per unit, each with N3 input weights, K3 recurrent weights and 1 bias.
4 * K3 * (N3 + K3 + 1)

In [0]:
# Train model_3: 5 epochs, mini-batches of 32 reviews.
model_3.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

In [0]:
# Evaluate model_3 on the test split; score_3[0] is the loss and score_3[1] the accuracy.
score_3 = model_3.evaluate(x_test, y_test)
print(
    f'測試資料的 loss = {score_3[0]}',
    f'測試資料正確率 = {score_3[1]}',
    sep='\n',
)

### model4

In [0]:
# model_4: 500-dim word embedding feeding a 100-unit LSTM.
N4, K4 = 500, 100

model_4 = Sequential([
    Embedding(10000, N4),
    LSTM(K4),
    Dense(1, activation='sigmoid'),
])
model_4.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_4.summary()

In [0]:
# Hand-computed LSTM parameter count for model_4, to compare with its summary:
# 4 weight sets per unit, each with N4 input weights, K4 recurrent weights and 1 bias.
4 * K4 * (N4 + K4 + 1)

In [0]:
# Train model_4: 5 epochs, mini-batches of 32 reviews.
model_4.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

In [0]:
# Evaluate model_4 on the test split; score_4[0] is the loss and score_4[1] the accuracy.
score_4 = model_4.evaluate(x_test, y_test)
print(
    f'測試資料的 loss = {score_4[0]}',
    f'測試資料正確率 = {score_4[1]}',
    sep='\n',
)

### 四組模型對於訓練資料的正確率達到96\~97%左右，但對於測試資料僅有84\~85%的正確率，顯示模型過度擬合訓練資料。  
### 嘗試使用dropout在訓練時隨機丟棄部分神經元的輸出，防止過度擬合的狀況發生。

### model5

In [0]:
from keras.layers import Dropout

In [0]:
# model_5: same sizes as model_4 (500-dim embedding, 100-unit LSTM), with a
# Dropout(0.5) layer between the LSTM and the output layer to fight overfitting.
N5, K5 = 500, 100

model_5 = Sequential([
    Embedding(10000, N5),
    LSTM(K5),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model_5.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_5.summary()

In [0]:
# Hand-computed LSTM parameter count for model_5, to compare with its summary:
# 4 weight sets per unit, each with N5 input weights, K5 recurrent weights and 1 bias.
4 * K5 * (N5 + K5 + 1)

In [0]:
# Train model_5: 5 epochs, mini-batches of 32 reviews.
model_5.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

In [0]:
# Evaluate model_5 on the test split; score_5[0] is the loss and score_5[1] the accuracy.
score_5 = model_5.evaluate(x_test, y_test)
print(
    f'測試資料的 loss = {score_5[0]}',
    f'測試資料正確率 = {score_5[1]}',
    sep='\n',
)