<a href="https://colab.research.google.com/github/amanalok/deep-learning/blob/main/IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

In [4]:
# Fetch the IMDB reviews dataset together with its metadata object.
dataset, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete008KWS/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete008KWS/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete008KWS/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [5]:
# Read the example counts of the two labelled splits from the dataset metadata.
train_size, test_size = (
    info.splits[split_name].num_examples for split_name in ('train', 'test')
)
print(train_size, test_size)

25000 25000


In [6]:
# Peek at one batch of two training examples: the raw review tensor and its
# label, annotated with a human-readable sentiment name.
for X_batch, y_batch in dataset['train'].batch(2).take(1):
  for review, label in zip(X_batch, y_batch.numpy()):
    print('Review: ', review)
    # Label 1 is reported as positive, anything else as negative.
    print('Label: ', label, '= Positive' if label == 1 else '= Negative')

Review:  tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
Label:  0 = Negative
Review:  tf.Tensor(b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However o

In [7]:
def preprocess(X_batch, y_batch):
  """Turn a batch of raw review strings into padded word tokens.

  Each review is truncated with `tf.strings.substr(..., 0, 300)`, HTML line
  breaks and every character that is not an ASCII letter are replaced by a
  space, and the text is split on whitespace. The ragged result is converted
  to a dense tensor, padding shorter reviews with the '<pad>' token.

  Args:
    X_batch: string tensor holding a batch of reviews.
    y_batch: labels for the batch; passed through unchanged.

  Returns:
    A `(tokens, y_batch)` tuple, where `tokens` is the dense, padded string
    tensor of words.
  """
  X_batch = tf.strings.substr(X_batch, 0, 300)
  # Raw bytes literal so `\s` reaches the regex engine without relying on
  # Python keeping an invalid escape sequence. The closing `>` restricts the
  # match to real <br>, <br/> and <br /> tags instead of any text that merely
  # starts with "<br".
  X_batch = tf.strings.regex_replace(X_batch, rb'<br\s*/?>', b' ')
  X_batch = tf.strings.regex_replace(X_batch, b'[^a-zA-Z]', b' ')
  X_batch = tf.strings.split(X_batch)
  return X_batch.to_tensor(default_value='<pad>'), y_batch

In [8]:
def preprocess1(X_batch, y_batch):
  """Tokenize and pad a batch of reviews; alias of `preprocess`.

  This function used to be a copy of `preprocess`. It now delegates to it so
  the two entry points cannot drift apart.

  Args:
    X_batch: string tensor holding a batch of reviews.
    y_batch: labels for the batch; passed through unchanged.

  Returns:
    Whatever `preprocess(X_batch, y_batch)` returns: a tuple of the padded
    token tensor and `y_batch`.
  """
  return preprocess(X_batch, y_batch)

In [9]:
from collections import Counter
# Token -> occurrence count; starts empty and is filled by a later cell.
vocabulary = Counter()

In [10]:
# Sanity check: print the token list produced for each review in the first
# two batches of two training examples.
for X_batch, y_batch in (
    dataset['train'].batch(2).take(2).map(preprocess)
):
  for review in X_batch:
    print(review.numpy().tolist())

[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie', b'Don', b't', b'be', b'lured', b'in', b'by', b'Christopher', b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are', b'great', b'actors', b'but', b'this', b'must', b'simply', b'be', b'their', b'worst', b'role', b'in', b'history', b'Even', b'their', b'great', b'acting', b'could', b'not', b'redeem', b'this', b'movie', b's', b'ridiculous', b'storyline', b'This', b'movie', b'is', b'an', b'early', b'nineties', b'US', b'propaganda', b'pi', b'<pad>']
[b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep', b'during', b'films', b'but', b'this', b'is', b'usually', b'due', b'to', b'a', b'combination', b'of', b'things', b'including', b'really', b'tired', b'being', b'warm', b'and', b'comfortable', b'on', b'the', b'sette', b'and', b'having', b'just', b'eaten', b'a', b'lot', b'However', b'on', b'this', b'occasion', b'I', b'fell', b'asleep', b'because', b'the', b'film', b'was', b'rubbish', b'The', b'plot', b'development', b'was',

In [11]:
# Tally every token (padding tokens included) over the whole training split.
for X_batch, y_batch in dataset['train'].batch(32).map(preprocess):
  for review in X_batch.numpy():
    vocabulary.update(review)

In [12]:
# Ten most frequent tokens with their counts.
vocabulary.most_common(10)

[(b'<pad>', 224647),
 (b'the', 61156),
 (b'a', 38569),
 (b'of', 33984),
 (b'and', 33432),
 (b'I', 30330),
 (b'to', 27707),
 (b'is', 25719),
 (b'it', 20209),
 (b'in', 18973)]

In [13]:
# Keep only the `vocab_size` most frequent tokens, ordered by descending count.
vocab_size = 10000
truncated_vocab = [
    token for token, _count in vocabulary.most_common(vocab_size)
]

In [14]:
# Show the first ten entries of the truncated vocabulary.
truncated_vocab[:10]

[b'<pad>', b'the', b'a', b'of', b'and', b'I', b'to', b'is', b'it', b'in']

In [15]:
# Vocabulary tokens as a tensor; used as the keys of the lookup table.
words = tf.constant(truncated_vocab)

In [16]:
# One int64 id per vocabulary entry: 0 .. len(truncated_vocab) - 1, in the same order as the tokens.
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)

In [17]:
# Display the id tensor.
word_ids

<tf.Tensor: shape=(10000,), dtype=int64, numpy=array([   0,    1,    2, ..., 9997, 9998, 9999])>

In [18]:
# Pair each vocabulary token with its integer id for the lookup table.
vocab_init = tf.lookup.KeyValueTensorInitializer(
    keys=words,
    values=word_ids,
)

In [19]:
# Lookup table from token to id, with 1000 extra buckets reserved for tokens
# that are not in the vocabulary.
num_oov_bucket = 1000
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_bucket,
)

In [20]:
# Try the table on a short sentence split into words.
table.lookup(
    tf.constant('hi there my name is aman'.split())
)

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([ 2369,    72,    43,   403,     7, 10866])>

In [21]:
def encode_words(X_batch, y_batch):
  """Replace each word token in the batch by its id from the lookup table.

  Args:
    X_batch: tensor of word tokens to look up in `table`.
    y_batch: labels for the batch; passed through unchanged.

  Returns:
    A `(token_ids, y_batch)` tuple.
  """
  token_ids = table.lookup(X_batch)
  return token_ids, y_batch

In [22]:
# Training pipeline: batch the raw reviews, tokenize them, map tokens to ids,
# and prefetch one batch ahead.
train_set = (
    dataset['train']
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [24]:
# Output dimension passed to the Embedding layer.
embed_size = 128

In [25]:
# Sentiment classifier: word embeddings -> two stacked GRU layers -> a single
# sigmoid unit.
model = keras.models.Sequential([
    # One embedding row per vocabulary id plus one per OOV bucket.
    # mask_zero=True tells the recurrent layers to skip time steps whose id is
    # 0, so padding is not treated as real input. This relies on '<pad>' being
    # the most frequent token and therefore holding id 0, as shown by
    # truncated_vocab[:10] -- re-check if the vocabulary is built differently.
    keras.layers.Embedding(vocab_size+num_oov_bucket, embed_size,
                           mask_zero=True, input_shape=[None]),
    # First GRU returns the full sequence so the second GRU can consume it.
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation='sigmoid')
])

In [26]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [27]:
# Train for five passes over the encoded training set.
model.fit(x=train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1c22615410>