In [27]:
import os
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

from pprint import pprint

# Sentiment Analysis on the IMDB Reviews Dataset

An example of a word-based RNN.

In [3]:
# Getting the data

# Use the first 90% of the official train split for training, the last 10%
# for validation, and the official test split for testing.
# as_supervised=True yields (review, label) pairs.
imdb_splits = tfds.load(
  name="imdb_reviews",
  split=["train[:90%]", "train[90%:]", "test"],
  as_supervised=True
)
raw_train_set, raw_valid_set, raw_test_set = imdb_splits

# Seed TensorFlow's global RNG before building the input pipelines.
tf.random.set_seed(42)

# Only the training data is shuffled; every split is batched and prefetched.
shuffled_train_set = raw_train_set.shuffle(5000, seed=42)
train_set = shuffled_train_set.batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)


2023-11-07 08:37:51.446449: W tensorflow/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/adamscarlat/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Size...: 100%|██████████| 80/80 [00:20<00:00,  3.88 MiB/s]rl]
Dl Completed...: 100%|██████████| 1/1 [00:20<00:00, 20.62s/ url]
                                                                        

[1mDataset imdb_reviews downloaded and prepared to /Users/adamscarlat/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m




In [8]:
# Inspecting the data

# Show the first two raw examples: the review decoded from UTF-8 bytes,
# followed by its label.
for review, label in raw_train_set.take(2):
  review_text = review.numpy().decode("utf-8")
  pprint(review_text)
  print("Label: ", label.numpy())

("This was an absolutely terrible movie. Don't be lured in by Christopher "
 'Walken or Michael Ironside. Both are great actors, but this must simply be '
 'their worst role in history. Even their great acting could not redeem this '
 "movie's ridiculous storyline. This movie is an early nineties US propaganda "
 'piece. The most pathetic scenes were those when the Columbian rebels were '
 'making their cases for revolutions. Maria Conchita Alonso appeared phony, '
 'and her pseudo-love affair with Walken was nothing but a pathetic emotional '
 'plug in a movie that was devoid of any real meaning. I am disappointed that '
 "there are movies like this, ruining actor's like Christopher Walken's good "
 'name. I could barely sit through it.')
Label:  0
('I have been known to fall asleep during films, but this is usually due to a '
 'combination of things including, really tired, being warm and comfortable on '
 'the sette and having just eaten a lot. However on this occasion I fell '
 'as

2023-11-07 08:47:35.332253: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [None]:
# Vectorizing the text into words

vocab_size = 1000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)

# The vocabulary is learned from the review texts alone, so the labels are
# dropped before adapting the vectorizer.
reviews_only = train_set.map(lambda reviews, labels: reviews)
text_vec_layer.adapt(reviews_only)

embed_size = 128

# Build the network layer by layer: raw strings -> token ids -> embeddings -> GRU -> probability
model = tf.keras.Sequential()
model.add(text_vec_layer)
# The embedding matrix has one row per token and 'embed_size' columns, so each
# token is mapped to a vector of size 'embed_size'.
model.add(tf.keras.layers.Embedding(vocab_size, embed_size))
model.add(tf.keras.layers.GRU(128))
# A single sigmoid neuron classifies the sentiment as positive/negative
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(
  loss="binary_crossentropy",
  optimizer="nadam",
  metrics=["accuracy"]
)
history = model.fit(train_set, validation_data=valid_set, epochs=2)


### Masking

Training this model (on Kaggle) yields disappointing results: the accuracy does not increase above 50%. The main reason
is that the reviews have different lengths, and the TextVectorization layer pads every sequence in a batch with zeros 
to match the length of the longest sequence in that batch. This causes the GRU to learn bad patterns, because the 
padding tokens affect the learning a lot.

One way to deal with this is `masking`: making the model ignore the padding tokens.

In [None]:
# Vectorizing the text into words

# The only difference between this setup and the previous one is the `mask_zero=True` parameter to the embedding layer!
# Training this model on Kaggle yields results with validation accuracy over 85%.

vocab_size = 1000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)

# We only want to adapt the vectorizer on the reviews (not the labels)
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))

embed_size = 128
model = tf.keras.Sequential([
  text_vec_layer,
  # mask_zero=True marks token id 0 (the padding token) as masked so the
  # downstream GRU skips the padded time steps.
  tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),
  tf.keras.layers.GRU(128),
  # 1 neuron for classifying positive/negative sentiments
  tf.keras.layers.Dense(1, activation="sigmoid")
])

# Keep only the checkpoint with the best validation accuracy. It is saved under
# "models/" because that is the path the model is loaded from afterwards
# (and it matches the location used for the other saved model in this notebook).
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
  "models/imdb_sentiment_model",
  monitor="val_accuracy",
  save_best_only=True
)
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=10, callbacks=[ model_ckpt ])


Training on Kaggle...

In [12]:
# Restore the masked GRU sentiment model from its saved directory
masked_model_path = "models/imdb_sentiment_model"
loaded_model = tf.keras.models.load_model(masked_model_path)

2023-11-08 08:19:10.786856: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-08 08:19:10.888661: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-08 08:19:10.900962: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond' has 4 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-08 08:19:10.987421: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-08 08:19:11.033969: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _ou

In [26]:
# Score one clearly negative and one clearly positive review; the model
# outputs the sigmoid probability of a positive sentiment.
sample_reviews = [
  "bro that movie sucked real bad. stroy line was awful",
  "bro that movie was amazing. stroy line was superb",
]
for sample_review in sample_reviews:
  print(loaded_model.predict([sample_review]))

[[0.03552284]]
[[0.91320604]]


## Reusing Pretrained LLM Parts in Our Models

Instead of training an embedding layer from scratch or reusing a pretrained embedding layer, we reuse a part of a pretrained language
  model.
  - These pretrained LLM parts take into account context and embeddings.
  - We can further fine-tune them when we add them to our models.

After training this model, we reach an accuracy of over 90% since context is taken into account. This helps with reviews such as 
"this movie was not as great as I hoped" - notice that the word "great" appears here in a review that is actually negative.

In [None]:
# Cache downloaded TF Hub modules in a local directory so the encoder is not
# downloaded again on every run
os.environ["TFHUB_CACHE_DIR"] = "my_tfhub_cache"

# Pretrained Universal Sentence Encoder, taking raw strings as input and
# left trainable so it is fine-tuned together with the classifier head
use_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
sentence_encoder = hub.KerasLayer(
  use_url,
  trainable=True,
  dtype=tf.string,
  input_shape=[]
)

model = tf.keras.Sequential()
model.add(sentence_encoder)
model.add(tf.keras.layers.Dense(64, activation="relu"))
# Single sigmoid output for the positive/negative decision
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
model.compile(
  loss="binary_crossentropy",
  optimizer="nadam",
  metrics=["accuracy"]
)

# Keep only the checkpoint with the best validation accuracy
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
  "models/imdb_sentiment_model_use",
  monitor="val_accuracy",
  save_best_only=True
)
history = model.fit(
  train_set,
  validation_data=valid_set,
  epochs=10,
  callbacks=[ model_ckpt ]
)

In [30]:
# Restore the Universal Sentence Encoder based sentiment model from its saved directory
use_model_path = "models/imdb_sentiment_model_use"
loaded_model = tf.keras.models.load_model(use_model_path)

In [42]:
# The model doesn't seem to catch inverse statements from context
# (e.g. "not good", "not great") as advertised: see the scores printed below.

negation_reviews = [
  "bro this movie is not bad. I actually thought that it was not as boring as other ones",
  "this movie was not as good as I hoped. I thought that it was not an interesting movie",
]
for negation_review in negation_reviews:
  print(loaded_model.predict([negation_review]))

[[0.08436868]]
[[0.73675543]]
