In [1]:
!pip install keras
import keras
from tensorflow.keras.layers import Dense
from keras.layers import Activation, Dense
from keras.layers import Embedding
from keras.datasets import imdb
from keras_preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Bidirectional, SpatialDropout1D, LSTM, GRU, SimpleRNN
from keras.callbacks import ModelCheckpoint
import os
from sklearn.metrics import roc_auc_score, roc_curve
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext tensorboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Run configuration: every tunable value used by the cells below lives here.

# Output directory (per-epoch checkpoint files are written under it)
output_dir = 'model_output/rnn'

# Training
epochs = 4
batch_size = 128

# Embedding / vocabulary
n_dim = 64                  # embedding vector size
n_unique_words = 5000       # passed to imdb.load_data(num_words=...)
n_words_to_skip = 50        # passed to imdb.load_data(skip_top=...)
max_review_length = 100     # passed to pad_sequences(maxlen=...)
pad_type = trunc_type = 'pre'
drop_embed = 0.2            # SpatialDropout1D rate applied after the embedding

# Dense
# n_dense is read by the parameter-count cells near the end of the notebook
# (n_dim*max_review_length*n_dense + n_dense and n_dense + 1). Leaving it
# commented out made those cells fail with NameError on a fresh kernel.
n_dense = 64
#dropout = 0.5

In [3]:
# Recurrent-layer hyperparameters, grouped per architecture as
# (number of units, dropout rate).

# SimpleRNN
n_rnn, drop_rnn = 256, 0.2

# LSTM and bidirectional LSTM
n_lstm, drop_lstm = 256, 0.2

# Stacked bidirectional LSTM: units of the first and second layer
n_lstm_1, n_lstm_2 = 64, 64
drop2_lstm = 0.2

# GRU
n_gru, drop_gru = 256, 0.2

In [4]:
# Load the IMDB sentiment data set and bring every review to a fixed length.
#
# Each review arrives as a list of integer word indexes, labeled by sentiment
# (positive/negative). Words are indexed by overall frequency, which makes
# frequency-based filtering cheap:
#   * num_words keeps only the n_unique_words most common words
#   * skip_top additionally drops the n_words_to_skip most common ones
# The low indexes are reserved for special tokens rather than words; the
# word-index cell further down maps them as 0 = PAD, 1 = START, 2 = UNK.
(x_train, y_train), (x_valid, y_valid) = imdb.load_data(
    num_words=n_unique_words,
    skip_top=n_words_to_skip,
)

# Pad short reviews with 0 and truncate long ones to max_review_length;
# both right-hand sides are computed before either name is rebound.
x_train, x_valid = (
    pad_sequences(
        reviews,
        maxlen=max_review_length,
        padding=pad_type,
        truncating=trunc_type,
        value=0,
    )
    for reviews in (x_train, x_valid)
)

In [11]:
# SimpleRNN classifier: embedding -> spatial dropout -> SimpleRNN -> sigmoid output
model = Sequential([
    Embedding(n_unique_words, n_dim, input_length=max_review_length),
    SpatialDropout1D(drop_embed),
    SimpleRNN(n_rnn, dropout=drop_rnn),
    Dense(1, activation='sigmoid'),  # single unit for the binary sentiment label
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 64)           320000    
                                                                 
 spatial_dropout1d_5 (Spatia  (None, 100, 64)          0         
 lDropout1D)                                                     
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 256)               82176     
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 402,433
Trainable params: 402,433
Non-trainable params: 0
_________________________________________________________________


In [12]:
# LSTM classifier: embedding -> spatial dropout -> LSTM -> sigmoid output
model = Sequential([
    Embedding(n_unique_words, n_dim, input_length=max_review_length),
    SpatialDropout1D(drop_embed),
    LSTM(n_lstm, dropout=drop_lstm),
    Dense(1, activation='sigmoid'),  # single unit for the binary sentiment label
])
model.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 100, 64)           320000    
                                                                 
 spatial_dropout1d_6 (Spatia  (None, 100, 64)          0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 256)               328704    
                                                                 
 dense_4 (Dense)             (None, 1)                 257       
                                                                 
Total params: 648,961
Trainable params: 648,961
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Bidirectional LSTM classifier: the LSTM is wrapped in Bidirectional, so the
# recurrent output is twice n_lstm wide (see the summary below).
model = Sequential([
    Embedding(n_unique_words, n_dim, input_length=max_review_length),
    SpatialDropout1D(drop_embed),
    Bidirectional(LSTM(n_lstm, dropout=drop_lstm)),
    Dense(1, activation='sigmoid'),  # single unit for the binary sentiment label
])
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 64)           320000    
                                                                 
 spatial_dropout1d_7 (Spatia  (None, 100, 64)          0         
 lDropout1D)                                                     
                                                                 
 bidirectional (Bidirectiona  (None, 512)              657408    
 l)                                                              
                                                                 
 dense_5 (Dense)             (None, 1)                 513       
                                                                 
Total params: 977,921
Trainable params: 977,921
Non-trainable params: 0
_________________________________________________________________


In [15]:
# GRU classifier: embedding -> spatial dropout -> GRU -> sigmoid output
model = Sequential([
    Embedding(n_unique_words, n_dim, input_length=max_review_length),
    SpatialDropout1D(drop_embed),
    GRU(n_gru, dropout=drop_gru),
    Dense(1, activation='sigmoid'),  # single unit for the binary sentiment label
])
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 100, 64)           320000    
                                                                 
 spatial_dropout1d_8 (Spatia  (None, 100, 64)          0         
 lDropout1D)                                                     
                                                                 
 gru (GRU)                   (None, 256)               247296    
                                                                 
 dense_6 (Dense)             (None, 1)                 257       
                                                                 
Total params: 567,553
Trainable params: 567,553
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Stacked bidirectional LSTM classifier.
# Two Bidirectional(LSTM) layers sit on top of the embedding. The first one
# returns the full sequence (return_sequences=True) so the second recurrent
# layer receives one vector per time step instead of only the final state.
model = Sequential()
model.add(Embedding(n_unique_words, n_dim, input_length=max_review_length))
model.add(SpatialDropout1D(drop_embed))
model.add(Bidirectional(LSTM(n_lstm_1, dropout=drop_lstm,
                             return_sequences=True)))
# The second layer takes its dropout from drop2_lstm, the rate declared with
# the other stacked hyperparameters; it was previously defined but never used.
model.add(Bidirectional(LSTM(n_lstm_2, dropout=drop2_lstm)))
model.add(Dense(1, activation='sigmoid'))  # single unit for the binary sentiment label
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 100, 64)           320000    
                                                                 
 spatial_dropout1d_9 (Spatia  (None, 100, 64)          0         
 lDropout1D)                                                     
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 128)         66048     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                      

In [18]:
# Compile, train and evaluate whichever `model` was built last.

# Model compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create the checkpoint directory before anything tries to write into it.
# exist_ok=True replaces the separate exists() check and keeps the cell
# safe to re-run.
os.makedirs(output_dir, exist_ok=True)
# {epoch:02d} in the file name yields one checkpoint file per epoch.
modelcheckpoint = ModelCheckpoint(filepath=output_dir+"/weights.{epoch:02d}.hdf5")

# Model train
model.fit(x_train, y_train,
          batch_size=batch_size, epochs=epochs, verbose=1,
          validation_data = (x_valid, y_valid),
          callbacks=[modelcheckpoint])

# Evaluate
# Restore the checkpoint written after the second epoch (file names are not
# zero-indexed), then score the validation reviews.
model.load_weights(output_dir+"/weights.02.hdf5")
y_hat = model.predict(x_valid)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


To further refine the model, there are several tasks that can be executed below.


In [None]:
# Peek at the first six encoded reviews
x_train[:6]


In [None]:
# Lengths of the first six reviews.
# Note: x_train already went through pad_sequences in the data-loading cell,
# so these are the padded/truncated lengths rather than the raw review lengths.
for review in x_train[:6]:
    print(len(review))

In [None]:
# Sentiment labels of the first six training reviews
y_train[:6]

In [None]:
# Number of reviews in the training and validation splits
tuple(len(split) for split in (x_train, x_valid))

In [None]:
# Build the word -> integer lookup.
# Every index returned by get_word_index() is shifted up by 3, which frees
# 0, 1 and 2 for the special PAD / START / UNK tokens added afterwards.
word_index = {
    word: rank + 3
    for word, rank in keras.datasets.imdb.get_word_index().items()
}
word_index.update({"PAD": 0, "START": 1, "UNK": 2})

In [None]:
# Invert the lookup: integer index -> word, used to decode reviews back to text
index_word = {idx: word for word, idx in word_index.items()}

In [None]:
# First training review as a sequence of integer word indexes
x_train[0]

In [None]:
# First film review in the data set, decoded back to words.
# Join with a single space: an empty separator ran all the words together
# into one unreadable string, unlike the other decoding cells in this notebook.
# The loop variable is named `idx` so the builtin `id` is not shadowed.
' '.join(index_word[idx] for idx in x_train[0])

In [None]:
# Reload the data set with load_data() defaults, i.e. without the num_words /
# skip_top arguments used in the data-loading cell, so the reviews can be
# decoded in full. Only the reviews are kept; the labels go into the
# throwaway name `_`.
(all_x_train,_),(all_x_valid,_) = imdb.load_data()

In [None]:
# Decode the first review from the unrestricted load.
# The ' ' separator puts a single space between consecutive words.
' '.join(index_word[token] for token in all_x_train[0])

In [None]:
# Pad (with 0) and truncate both splits to max_review_length.
# Both right-hand sides are computed before either name is rebound.
x_train, x_valid = (
    pad_sequences(
        reviews,
        maxlen=max_review_length,
        padding=pad_type,
        truncating=trunc_type,
        value=0,
    )
    for reviews in (x_train, x_valid)
)

In [None]:
# The many zeros in the output are the padding value
x_train[:6]

In [None]:
# Lengths of the first six reviews after padding/truncation
for review in x_train[:6]:
    print(len(review))

In [None]:
# Decode the sixth training review (index 5) from its padded encoding
' '.join(index_word[token] for token in x_train[5])

In [None]:
# Embedding layer: 64 * 5000 = 320,000 parameters, the same number the model
# summaries above report for the embedding layer.
(n_dim, n_unique_words, n_unique_words * n_dim)

In [None]:
# Flattened embedding output: 100 time steps * 64 dimensions = 6400 values
(max_review_length, n_dim, max_review_length * n_dim)

In [None]:
# Dense layer parameter count (weights + biases) = 409,664.
# Each of the 64 dense neurons receives the 6400 flattened values:
# 64 * 6400 = 409,600 weights, plus 64 biases.
(n_dense, n_dense * (n_dim * max_review_length) + n_dense)

In [None]:
# Output neuron: one weight per dense unit (64) plus a single bias
1 + n_dense

In [None]:
# Compile the current model for binary classification, tracking accuracy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create the checkpoint directory before the callback needs it.
# exist_ok=True replaces the separate exists() check and keeps the cell
# safe to re-run.
os.makedirs(output_dir, exist_ok=True)
# {epoch:02d} in the file name yields one checkpoint file per epoch.
modelcheckpoint = ModelCheckpoint(filepath=output_dir+"/weights.{epoch:02d}.hdf5")

In [None]:
# Train on the training split, validating against the held-out split;
# the checkpoint callback is passed in so it can save during training.
model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_valid, y_valid),
    callbacks=[modelcheckpoint],
    verbose=1,
)

In [None]:
# Evaluate: restore the checkpoint saved after the second epoch (file names
# are not zero-indexed) and predict on the validation reviews.
epoch_2_checkpoint = output_dir + "/weights.02.hdf5"
model.load_weights(epoch_2_checkpoint)
y_hat = model.predict(x_valid)


In [None]:
# True sentiment label of the first validation review
y_valid[0]

In [None]:
# Histogram of the model's predictions on the validation set.
# The figure now carries a title and axis labels so it can be read on its own.
# The explicit Axes interface also avoids assigning to `_`, which would
# shadow IPython's last-output variable.
fig, ax = plt.subplots()
ax.hist(y_hat)
ax.axvline(x=0.5, color='orange')  # 0.5 is the midpoint of the sigmoid output range
ax.set(title='Validation predictions',
       xlabel='Predicted value (sigmoid output)',
       ylabel='Number of reviews');

In [None]:
# ROC AUC on the validation set, expressed as a percentage with two decimals
pct_auc = 100 * roc_auc_score(y_valid, y_hat)
"{:0.2f}".format(pct_auc)

In [None]:
# Unwrap each one-element prediction row into a flat list of values
float_y_hat = [row[0] for row in y_hat]

In [None]:
# Decode the first validation review from the unrestricted load
' '.join(index_word[token] for token in all_x_valid[0])