In [1]:
import os
# ROOT = "/content/drive/MyDrive/001_SUNNY_BHAVEEN_CHANDRA/001_My_Classes/DLCVNLP/Live Lectures Material DLCVNLP April 17th batch/ALL NOTES/Feb 06"

# os.chdir(ROOT)

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

In [3]:
def get_plot(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history[f"val_{metric}"])
  plt.xlabel("Epochs ->>")
  plt.ylabel(f"{metric} ->>")
  plt.legend([metric, f"val_{metric}"])

In [4]:
dataset_name = "imdb_reviews"

dataset, info = tfds.load(dataset_name, with_info=True, as_supervised=True)

train_ds, test_ds = dataset["train"], dataset["test"]

In [5]:
# check the first batch -

for example, label in train_ds.take(1):
  print(f"test_example: \n{example.numpy()} \n")
  print(f"label: {label.numpy()}")

test_example: 
b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 

label: 0


In [6]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='C:\\Users\\sunny\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train':

In [7]:
class Config:
  BUFFER_SIZE = 10000
  BATCH_SIZE = 64
  VOCAB_SIZE = 1000
  OUTPUT_DIM = 64
  EPOCHS = 10
  BASE_LOG_DIR = "base_log_dir"
  TRAINED_MODEL_DIR = os.path.join(BASE_LOG_DIR, "models")
  CKPT_DIR = os.path.join(BASE_LOG_DIR, "checkpoints")
  TB_ROOT_LOG_DIR = os.path.join(BASE_LOG_DIR, "tb_log_dir")

In [8]:
config = Config()

In [9]:
config.BUFFER_SIZE

10000

In [10]:
# shuffling and batching of the train dataset

train_ds = train_ds.shuffle(config.BUFFER_SIZE).batch(config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [11]:
for example, label in train_ds.take(1):
  print(f"test_example: \n{example.numpy()[:3]} \n")
  print(f"label: {label.numpy()[:3]}")

test_example: 
[b"Any Way the Wind Blows is Tom Barmans (who is also know as front man of the rock formation 'dEUS') debut movie. Entirely shot in Antwerp (Belgium), the movie starts on a sunny friday morning and skips rather superficially between the events that fill the day of a dozen of main characters. When the movie ends, you have a lot of stuff to think about, because most of the different story-lines are left wide open.<br /><br />The movie has a (purely instrumental) sound track that will rock your socks off. In most scenes, the music truly enhances the general atmosphere and feel, really making the movie hallucinating to watch at certain points of time. The main scene in the film, the party, is very well shot.<br /><br />The director didn't hesitate to use video clip techniques, having his main characters dancing on one of the best sound tracks I've heard lately.<br /><br />The screenplay is great stuff. Camera angles and colors are very well chosen. The 'costumes' are very ho

In [12]:
# text encoder 

# keep the data as it is without custom standardisation
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=config.VOCAB_SIZE)

# adapt to change only on text data
encoder.adapt(train_ds.map(lambda text, label: text))


## text, label ## <--- train data is arranged

In [13]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [14]:
print("text examples\n", example.numpy()[:3])

text examples
 [b"Any Way the Wind Blows is Tom Barmans (who is also know as front man of the rock formation 'dEUS') debut movie. Entirely shot in Antwerp (Belgium), the movie starts on a sunny friday morning and skips rather superficially between the events that fill the day of a dozen of main characters. When the movie ends, you have a lot of stuff to think about, because most of the different story-lines are left wide open.<br /><br />The movie has a (purely instrumental) sound track that will rock your socks off. In most scenes, the music truly enhances the general atmosphere and feel, really making the movie hallucinating to watch at certain points of time. The main scene in the film, the party, is very well shot.<br /><br />The director didn't hesitate to use video clip techniques, having his main characters dancing on one of the best sound tracks I've heard lately.<br /><br />The screenplay is great stuff. Camera angles and colors are very well chosen. The 'costumes' are very ho

In [15]:
encoder_example = encoder(example)[:3].numpy()
encoder_example

array([[99, 97,  2, ...,  0,  0,  0],
       [49, 34,  1, ...,  0,  0,  0],
       [11,  7, 29, ...,  0,  0,  0]], dtype=int64)

In [16]:
# model for BiRNN

embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=config.OUTPUT_DIM,
    mask_zero=True # use masking to handle the variable seq lengths example <sos>, <pad>, <eod>
)

In [17]:
LAYERS = [
          encoder,
          embedding_layer,
          tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
          tf.keras.layers.Dense(64, activation="relu"),
          tf.keras.layers.Dense(1)
]

In [18]:
model = tf.keras.Sequential(LAYERS)

In [19]:
[layer.supports_masking for layer in model.layers]

[False, True, True, True, True]

In [20]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=["accuracy"])

In [21]:
import time

def callbacks(base_dir="."):
  # tb callbacks

  base_log_dir = config.TB_ROOT_LOG_DIR
  unqiue_log = time.asctime().replace(" ","_").replace(":", "")
  tb_log_dir = os.path.join(base_log_dir, unqiue_log)
  os.makedirs(tb_log_dir, exist_ok=True)

  tb_cb = tf.keras.callbacks.TensorBoard(log_dir=tb_log_dir)

  # ckpt callbacks
  ckpt_file = os.path.join(config.CKPT_DIR, "model")
  os.makedirs(config.CKPT_DIR, exist_ok=True)

  ckpt_cb = tf.keras.callbacks.ModelCheckpoint(
      filepath = ckpt_file, 
      save_best_only = True)
  
  callback_list = [tb_cb, ckpt_cb]

  return callback_list

In [22]:
callback_list = callbacks()

In [23]:
history = model.fit(train_ds, 
                    epochs=config.EPOCHS, 
                    validation_data=test_ds, 
                    validation_steps=30, 
                    callbacks=callback_list)

Epoch 1/10



INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


Epoch 2/10



INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


Epoch 3/10



INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


Epoch 4/10



INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


INFO:tensorflow:Assets written to: base_log_dir\checkpoints\model\assets


Epoch 5/10
 22/391 [>.............................] - ETA: 58s - loss: 0.2981 - accuracy: 0.8679

KeyboardInterrupt: 

In [24]:
test_loss, test_acc = model.evaluate(test_ds)

test_loss, test_acc



(0.3503561019897461, 0.8428000211715698)

In [25]:
get_plot(history=history, metric="accuracy")

NameError: name 'history' is not defined

In [None]:
get_plot(history=history, metric="loss")

In [None]:
# %load_ext tensorboard

# %tensorboard --logdir base_log_dir/tb_log_dir

In [29]:
sample_text = ("The movie was cool. The animation and the graphics were of this world. I would recommend this movie")

In [30]:
def get_score_and_sentiment(model, sample_text):
    pred = model.predict(np.array([sample_text]))
    score = pred[0][0]
    if score >=0:
        print(f"result: positive sentiment with score: {score}")
    else:
        print(f"result: negitive sentiment with score: {score}")

In [31]:
get_score_and_sentiment(model, sample_text)

result: positive sentiment with score: 0.6904364824295044


In [32]:
sample_text = ("The movie was crap. The animation and the graphics were worst. I would never recommend this movie")

In [33]:
get_score_and_sentiment(model, sample_text)

result: negitive sentiment with score: -1.434403896331787


In [35]:
# model.save("birnn.h5")