In [116]:
import keras
from keras.datasets import imdb

### Load IMDB Movie Review Data

In [117]:
# 詞彙量 is the vocabulary size; it is handed to the loader as num_words.
詞彙量 = 4096

# load_data returns the train and test splits as (inputs, labels) pairs.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=詞彙量)

# Report how many reviews ended up in each split.
print(f'Training samples: {len(x_train)}, Testing samples: {len(x_test)}')

Training samples: 25000, Testing samples: 25000


### Inspect a random sample from both x_train and y_train

In [118]:
import random

# Pick a random valid position in x_train.
# random.randint(0, 25000) includes BOTH endpoints, so it could return 25000,
# which is one past the last index of the 25000-sample training set and would
# raise IndexError. randrange(len(x_train)) excludes the upper bound and
# follows the actual size of the data.
random_num = random.randrange(len(x_train))

# The sample is one review, stored as a list of integer word indices.
print(f'x_train at index {random_num}')
print(x_train[random_num])

f_train at index 5522
[1, 14, 365, 16, 93, 11, 4, 655, 3486, 5, 739, 8, 30, 4, 86, 622, 589, 8, 276, 3752, 313, 23, 4065, 57, 22, 310, 7, 4, 667, 16, 126, 3367, 12, 9, 2, 1026, 11, 2, 15, 165, 2895, 17, 2, 8, 257, 85, 38, 15, 12, 62, 28, 77, 55, 254, 8, 2, 12, 180, 4, 667, 16, 4, 64, 589, 34, 2, 8, 97, 6, 1375, 3870, 31, 7, 107, 11, 4, 157, 6, 255, 2, 2, 2, 9, 2841, 34, 41, 2965, 5, 1697, 37, 11, 801, 2, 405, 2, 41, 59, 9, 2, 21, 36, 528, 376, 41, 233, 44, 41, 2, 303, 75, 79, 574, 19, 4, 2, 2688, 2, 2, 5, 27, 322, 761, 2, 2, 2, 2, 2, 9, 260, 35, 3437, 878, 58, 2795, 41, 1955, 113, 5, 4, 2, 3866, 7, 4, 223, 2, 2, 826, 2, 75, 82, 26, 574, 19, 4, 1729, 7, 745, 2, 2, 1432, 11, 269, 8, 1176, 6, 196, 1309, 46, 3519, 2, 420, 2, 1964, 2, 63, 316, 60, 2, 3242, 308, 2, 256, 34, 2, 2, 2, 9, 24, 290, 4, 781, 10, 10, 2, 69, 77, 6, 1164, 2422, 5, 95, 6, 2, 2422, 159, 29, 1040, 1219, 1861, 19, 4, 2, 7, 2339, 420, 11, 2, 2, 2, 610, 3280, 33, 4, 1164, 443, 2, 301, 12, 16, 6, 1995, 11, 2402, 1009, 5, 2,

### Each y_train value labels a whole review: 0 for a review with negative sentiment and 1 for a review with positive sentiment.

In [119]:
# Show the label stored at the same position that was sampled for x_train:
# a heading line first, then the label value on its own line.
print(f'y_train at index {random_num}', y_train[random_num], sep='\n')

y_train at index 5522
1


### Maximum number of words kept per IMDB review

In [120]:
# Fixed review length: used as the maxlen argument of pad_sequences and as the
# input_length of the Embedding layer.
max_words = 512

In [121]:
from keras.preprocessing import sequence

# Bring both splits to the same fixed length (max_words) so they can be fed to
# the network as rectangular arrays. The same call is applied to each split.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (x_train, x_test)
)

### The first layer is an Embedding layer because the input is text encoded as integer word indices.

In [122]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras import Sequential

# Length of the vector learned for each vocabulary entry.
embedding_size = 32

# Start an empty layer stack and open it with the embedding lookup.
model = Sequential()
model.add(
    Embedding(
        input_dim=詞彙量,            # vocabulary size used when loading the data
        output_dim=embedding_size,
        input_length=max_words,      # every review was padded to this length
    )
)

In [123]:
from tensorflow.keras.layers import Dropout

# Dropout with rate 0.5 applied to the embedding output.
model.add(Dropout(rate=0.5))

In [124]:
from tensorflow.keras.layers import LSTM, Dense

# Recurrent layer with 128 units.
model.add(LSTM(units=128))

# Second dropout, same rate as the one after the embedding.
model.add(Dropout(rate=0.5))

# Single sigmoid unit for the binary label.
model.add(Dense(units=1, activation='sigmoid'))

In [125]:
# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the table.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 512, 32)           131072    
_________________________________________________________________
dropout_6 (Dropout)          (None, 512, 32)           0         
_________________________________________________________________
unified_lstm_6 (UnifiedLSTM) (None, 128)               82432     
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 213,633
Trainable params: 213,633
Non-trainable params: 0
_________________________________________________________________
None


### Compile the model

In [126]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [127]:
batch_size = 64

# The first batch_size samples are set aside (passed as validation data when
# the model is fitted).
x_batch = x_train[:batch_size]
y_batch = y_train[:batch_size]

# Everything after that is what the model actually trains on.
x_train2 = x_train[batch_size:]
y_train2 = y_train[batch_size:]

In [128]:
num_epochs = 3

# Train on the reduced training set and validate on the held-out slice after
# each epoch. The History object returned by fit is the cell's output.
model.fit(
    x=x_train2,
    y=y_train2,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(x_batch, y_batch),
)

Train on 24936 samples, validate on 64 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0xb38ec8fd0>

### Test a tweaked version of the above model, with a larger vocabulary, a longer maximum review length, a lower dropout rate, and more epochs

In [129]:
# Second experiment: double the vocabulary size (4096 -> 8192).
詞彙量 = 8192

# Reload the data with the new vocabulary size; this replaces the padded
# arrays created for the first model.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=詞彙量)

In [130]:
# Second experiment: fixed review length doubled from 512 to 1024.
max_words = 1024

In [131]:
# Pad the freshly reloaded splits to the new max_words length; the same call
# is applied to each split.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (x_train, x_test)
)

In [132]:
# Length of the vector learned for each vocabulary entry.
embedding_size = 32

# Same layer stack as the first model, but with both dropout rates lowered
# from 0.5 to 0.2.
model2 = Sequential()
model2.add(Embedding(詞彙量, embedding_size, input_length=max_words))
model2.add(Dropout(0.2))
model2.add(LSTM(128))
model2.add(Dropout(0.2))
model2.add(Dense(1, activation='sigmoid'))

# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the table.
model2.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 1024, 32)          262144    
_________________________________________________________________
dropout_8 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
unified_lstm_7 (UnifiedLSTM) (None, 128)               82432     
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 344,705
Trainable params: 344,705
Non-trainable params: 0
_________________________________________________________________
None


In [133]:
# Same loss, optimizer and metric as the first model.
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [134]:
batch_size = 64
num_epochs = 10

# Hold out the first batch_size samples for validation.
x_batch = x_train[:batch_size]
y_batch = y_train[:batch_size]

# Train on the remainder.
x_train2 = x_train[batch_size:]
y_train2 = y_train[batch_size:]

# Fit the second model. The History object returned by fit is the cell's
# output.
model2.fit(
    x=x_train2,
    y=y_train2,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(x_batch, y_batch),
)

Train on 24936 samples, validate on 64 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0xb2cf48710>

### The goal of this recurrent neural network is to reach an accuracy of 87%. This accuracy appears to be reached after the second epoch. Note that validation uses only 64 held-out samples, so the reported figure is a rough estimate.

#### Some Cool Reference Material
https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e




Coding with Pedro