In [17]:
# Stage the notebook file, commit it, and push the commit to the remote.
!git add C4_NLP_Sentiment_Analysis_RNN.ipynb
!git commit -m "initial check in"
!git push

The file will have its original line endings in your working directory


[master f923b5c] initial check in
 1 file changed, 249 insertions(+)
 create mode 100644 2_DNN_TF/C4_NLP_Sentiment_Analysis_RNN.ipynb


To https://github.com/auslei/python.git
   e38a8c7..f923b5c  master -> master


In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Load the IMDB reviews dataset as (text, label) pairs, together with its metadata.
dataset, info = tfds.load(
    name="imdb_reviews",
    with_info=True,
    as_supervised=True,
)
# Number of training examples, read from the dataset metadata.
train_size = info.splits["train"].num_examples

In [2]:
# DatasetInfo reference:
# https://www.tensorflow.org/datasets/api_docs/python/tfds/core/DatasetInfo

# Print a single (text, label) example from the test split.
for t in dataset["test"].take(1):
    print(t)

# Show the available splits, the dataset description and the feature spec.
dataset.keys(), info.description, info.features

(<tf.Tensor: shape=(), dtype=string, numpy=b"There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned movies. I haven't laughed this hard since I saw THE FULL MONTY. (And, even then, I don't think I laughed quite this hard... So to speak.) Tukel's talent is considerable: DING-A-LING-LESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a line-by-line examination of it to fully appreciate the, uh, breadth and width of it. Every shot is beautifully composed (a clear sign of a sure-handed director), and the performances all around are solid (there's none of the over-the-top scenery chewing one might've expected from a film like this). DING-A-LING-LESS is a film whose time has come.">, <tf.Tensor: 

(dict_keys(['test', 'train', 'unsupervised']),
 'Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
 FeaturesDict({
     'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
     'text': Text(shape=(), dtype=tf.string),
 }))

In [3]:
# data preprocessing
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Args:
        X_batch: string tensor holding one review per element.
        y_batch: labels of the batch; returned unchanged.

    Returns:
        A tuple ``(tokens, y_batch)`` where ``tokens`` is a dense string tensor
        in which shorter reviews are filled up with ``b"<pad>"``.
    """
    # Keep only the first 300 units of every review (bytes, substr's default unit).
    truncated = tf.strings.substr(X_batch, 0, 300)
    # Replace HTML line-break tags such as <br> or <br /> with a space.
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    # Replace everything that is not a letter or an apostrophe with a space.
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    # Split on whitespace into words; reviews may yield different word counts.
    tokens = tf.strings.split(letters_only)
    # Densify, padding shorter reviews with the "<pad>" token.
    return tokens.to_tensor(default_value=b"<pad>"), y_batch
    

In [4]:
# construct vocabulary
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in dataset["train"].batch(32).map(preprocess):
    # Flatten the (reviews, words) batch row by row and count all tokens in one
    # call; tokens are visited in the same order as looping review by review.
    vocabulary.update(X_batch.numpy().ravel())

In [5]:
# Three most frequent tokens with their counts.
# TODO: stop words should really be removed before counting.
vocabulary.most_common(3)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [6]:
# Build the list of words that receive an id; the position in the list is the
# word id, and the list is capped at `vocab_size` entries.
vocab_size = 10000
# The padding token must have id 0: padded positions show up as 0 in the encoded
# batches and the masking code treats id 0 as padding. Instead of relying on
# "<pad>" happening to be the most frequent token, pin it to index 0 explicitly
# and fill the remaining slots with the most common real words.
trucated_vocabulary = [b"<pad>"] + [
    word for word, _ in vocabulary.most_common(vocab_size) if word != b"<pad>"
][:vocab_size - 1]

In [7]:
# Number of extra hash buckets reserved for out-of-vocabulary words.
num_oov_buckets = 1000

# Keys are the vocabulary words; values are their positions in the vocabulary.
words = tf.constant(trucated_vocabulary)
word_ids = tf.range(len(trucated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)

# Lookup table: known words -> their id, unknown words -> one of the OOV buckets.
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [9]:
# Sanity check: known words map to their vocabulary ids, while a word missing
# from the vocabulary is assigned to one of the OOV buckets (id >= vocab_size).
table.lookup(tf.constant([b"this is thisisnotaword".split()])) # the unknown word lands in an OOV bucket

<tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[    9,     7, 10893]], dtype=int64)>

In [11]:
def encode_words(X_batch, y_batch):
    """Replace every word token in the batch by its integer id.

    Args:
        X_batch: string tensor of word tokens.
        y_batch: labels of the batch; returned unchanged.

    Returns:
        A tuple ``(ids, y_batch)`` where ``ids`` is the result of looking up
        ``X_batch`` in the vocabulary table.
    """
    ids = table.lookup(X_batch)
    return ids, y_batch

# Training pipeline: batch the raw reviews, tokenize them, convert the tokens to
# ids, and prefetch one batch.
train_set = (
    dataset["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [12]:
# Print the first encoded batch: (word ids, labels).
for s in train_set.take(1):
    print(s)

(<tf.Tensor: shape=(32, 60), dtype=int64, numpy=
array([[  22,   11,   28, ...,    0,    0,    0],
       [   6,   21,   70, ...,    0,    0,    0],
       [4099, 6881,    1, ...,    0,    0,    0],
       ...,
       [  22,   12,  118, ...,  331, 1047,    0],
       [1757, 4101,  451, ...,    0,    0,    0],
       [3365, 4392,    6, ...,    0,    0,    0]], dtype=int64)>, <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)>)


In [16]:
from tensorflow import keras

# Dimensionality of the learned word embeddings.
embed_size = 128

model = keras.models.Sequential([
    # Embedding layer: one trainable vector of size `embed_size` for every
    # possible word id (vocabulary words plus the OOV buckets).
    keras.layers.Embedding(
        input_dim=vocab_size + num_oov_buckets,
        output_dim=embed_size,
        input_shape=[None],
    ),
    # First GRU returns the full sequence so the second GRU can consume it.
    keras.layers.GRU(units=128, return_sequences=True),
    # Second GRU returns only its final output.
    keras.layers.GRU(units=128),
    # Single sigmoid unit for the binary sentiment prediction.
    keras.layers.Dense(units=1, activation="sigmoid"),
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         1408000   
_________________________________________________________________
gru_2 (GRU)                  (None, None, 128)         99072     
_________________________________________________________________
gru_3 (GRU)                  (None, 128)               99072     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,606,273
Trainable params: 1,606,273
Non-trainable params: 0
_________________________________________________________________


## Masking

The previous example pads the sequences, and the model has to learn to ignore the padding tokens. This is information we already know. To reduce the computational cost, we can use masking so that all downstream layers ignore the padded words/characters.

In [5]:
import tensorflow as tf
from tensorflow import keras

K = keras.backend  # shorthand for the Keras backend functions

# Two toy vectors that agree at the first and last positions only.
inputs = tf.constant([1, 0, 0, 0])
mask = tf.constant([1, 1, 1, 0])

# Element-wise inequality: True where the entries differ, False where they match.
K.not_equal(inputs, mask)

<tf.Tensor: shape=(4,), dtype=bool, numpy=array([False,  True,  True, False])>

In [None]:
K = keras.backend

# Variable-length sequences of word ids.
inputs = keras.layers.Input(shape=[None])

# Boolean mask: True for real tokens, False wherever the id is 0.
mask = keras.layers.Lambda(lambda token_ids: K.not_equal(token_ids, 0))(inputs)

z = keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
)(inputs)
# Pass the mask explicitly to both recurrent layers.
z = keras.layers.GRU(units=128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(units=128)(z, mask=mask)
outputs = keras.layers.Dense(units=1, activation="sigmoid")(z)

model = keras.Model(inputs=[inputs], outputs=[outputs])