<a href="https://colab.research.google.com/github/andysingal/transfer-learning/blob/main/NLP_sentimentanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show the NVIDIA GPU(s) attached to this runtime, if any, via a shell escape.
!nvidia-smi




In [None]:
from __future__ import absolute_import,division,print_function,unicode_literals
try:
  !pip uninstall tb-nightly tensorboardX tensorboard
  !pip install tf-nightly
except Exception:
     pass
import tensorflow as tf
import pandas as pd
import numpy as np

import os
import datetime
import tensorflow_datasets as tfds

%load_ext tensorboard


In [None]:
import pkg_resources

# Print the distribution behind every entry point registered in the
# 'tensorflow_plugins' group, to see which installed packages hook into it.
plugin_dists = (ep.dist for ep in pkg_resources.iter_entry_points('tensorflow_plugins'))
for dist in plugin_dists:
  print(dist)

In [None]:
#!ls -alrt /usr/local/lib/python3.7/dist-packages/tensorboard*
!rm -r  /usr/local/lib/python3.7/dist-packages/tensorboardcolab-0.22.dist-info


In [None]:
# Report which TensorFlow build is actually imported in this kernel.
print(tf.__version__)

In [None]:
# Show the physical devices TensorFlow reports for this runtime.
tf.config.experimental.list_physical_devices()

In [None]:
# Load the Personal Care Appliances subset of the Amazon US reviews corpus.
# with_info=True makes tfds.load return the metadata object alongside the splits.
dataset, info = tfds.load(
    'amazon_us_reviews/Personal_Care_Appliances_v1_00',
    with_info=True,
)

# Everything downstream works from the 'train' split.
train_dataset = dataset['train']

In [None]:
# Display the dataset metadata returned by tfds.load(..., with_info=True).
info

In [None]:
# Number of examples held in the shuffle buffer when shuffling datasets.
BUFFER_SIZE = 30_000
# Number of examples per (padded) batch fed to the model.
BATCH_SIZE = 128

In [None]:
# Shuffle once with a fixed order:
# - reshuffle_each_iteration=False keeps the order identical on every pass, so the
#   later skip()/take() split cannot leak held-out examples into the training portion.
# - seed makes that order (and therefore the split) repeatable across kernel restarts.
# - Starting from dataset['train'] rather than train_dataset keeps this cell idempotent:
#   re-running it does not stack another shuffle on top of the previous one.
train_dataset = dataset['train'].shuffle(
    BUFFER_SIZE,
    seed=42,
    reshuffle_each_iteration=False,
)

In [None]:
# Peek at two raw examples to see the structure of each record.
two_examples = train_dataset.take(2)
for reviews in two_examples:
  print(reviews)

In [None]:
# For ten examples, show the review text, its star rating, and the binary
# label derived from it (1 when the rating is above 3, otherwise 0).
for reviews in train_dataset.take(10):
  review_text = reviews['data']
  body = review_text.get('review_body')
  stars = review_text.get('star_rating')
  print(body.numpy())
  print(stars)
  binary_label = tf.where(stars > 3, 1, 0)
  print(binary_label.numpy())

Building a vocabulary, tokenizing and encoding.
First, build a vocabulary by tokenizing the text into a collection of individual unique words. There are a few ways to do this in both TensorFlow and Python. This tutorial does the following:
- Iterate over each example's numpy value.
- Use tfds.deprecated.text.Tokenizer to split it into tokens.
- Collect these tokens into a Python set, to remove duplicates.
- Get the size of the vocabulary for later use.

In [None]:
tokenizer = tfds.deprecated.text.Tokenizer()

# Walk the whole training split once, tokenizing every review body and
# accumulating the distinct tokens in a set.
vocabulary_set = set()
for reviews in train_dataset:
  body = reviews['data'].get('review_body').numpy()
  vocabulary_set.update(tokenizer.tokenize(body))

# Number of distinct tokens seen; displayed as the cell output.
vocab_size = len(vocabulary_set)
vocab_size

Encode examples
Create an encoder by passing the vocabulary_set to tfds.deprecated.text.TokenTextEncoder. The encoder's encode method takes in a string of text and returns a list of integers.

https://alexmoltzau.medium.com/building-a-text-dataset-c5c1481395f4


In [None]:
# Build the text -> integer-id encoder from the collected vocabulary.
# A set has no stable iteration order for strings across Python processes, so the
# vocabulary is sorted first: every token then gets the same id in every session,
# which keeps saved weights usable with a vocabulary rebuilt after a restart.
encoder = tfds.deprecated.text.TokenTextEncoder(sorted(vocabulary_set))

In [None]:
#print(vocabulary_set)

In [None]:
# Show ten reviews next to their integer encoding. `encode_example` deliberately
# stays defined after the loop: the next cell decodes the last one.
for reviews in train_dataset.take(10):
  review_text = reviews['data']
  raw_body = review_text.get('review_body').numpy()
  print(raw_body)
  encode_example = encoder.encode(raw_body)
  print(encode_example)

In [None]:
# Map each id of the last encoded review back to its token.
for index in encode_example:
  decoded = encoder.decode([index])
  print('{} ----> {}'.format(index, decoded))

In [None]:
def encode(text_sensor,label):
  """Encode one review and binarize its rating.

  Args:
    text_sensor: eager tensor holding the review text; its `.numpy()` value is
      passed to the module-level `encoder`.
    label: star-rating tensor.

  Returns:
    A pair `(encoded_text, label)`: the encoder output for the review, and
    a tensor that is 1 where the rating is greater than 3 and 0 otherwise.
  """
  token_ids = encoder.encode(text_sensor.numpy())
  is_positive = label > 3
  return token_ids, tf.where(is_positive, 1, 0)

In [None]:
def encode_map_fn(tensor):
  """Dataset-map wrapper around `encode`.

  Args:
    tensor: one dataset element; its 'data' entry provides the 'review_body'
      and 'star_rating' fields.

  Returns:
    `(encoded_text, label)` with static shapes set: `encoded_text` is an int64
    vector of unknown length and `label` is an int32 scalar.
  """
  record = tensor['data']
  # `encode` needs eager `.numpy()` access, so it is run through tf.py_function.
  encoded_text, label = tf.py_function(
      func=encode,
      inp=[record.get('review_body'), record.get('star_rating')],
      Tout=(tf.int64, tf.int32),
  )

  # py_function leaves the output shapes unset; declare them by hand because
  # tf.data pipelines work best when every component has a known shape.
  encoded_text.set_shape([None])
  label.set_shape([])
  return encoded_text, label

https://notebook.community/tensorflow/docs/site/en/tutorials/load_data/text

In [None]:
# Run every example through encode_map_fn, giving a dataset of (encoded_text, label) pairs.
all_encoded_data = train_dataset.map(encode_map_fn)

In [None]:
# Inspect two encoded examples: the token-id tensor, then its label.
for f0, f1 in all_encoded_data.take(2):
  print(f0, f1, sep='\n')

https://notebook.community/tensorflow/docs/site/en/tutorials/load_data/text

In [None]:
# Size of the held-out set: the first TAKE_SIZE encoded examples.
TAKE_SIZE = 10000

# Held-out examples, batched with per-batch padding.
test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(BATCH_SIZE)

# Everything after the held-out examples is shuffled and batched for training.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

Since we have introduced a new token encoding (the zero used for padding), the vocabulary size has increased by one.

In [None]:
# Size of the id space the Embedding layer must cover. Taking it from the encoder
# (instead of `vocab_size += 1`) has two benefits:
# - the cell is idempotent: re-running it no longer keeps growing vocab_size;
# - the encoder's count includes the padding id 0 and its out-of-vocabulary
#   bucket, so no id the encoder can emit falls outside the embedding table.
vocab_size = encoder.vocab_size

BUILDING THE MODEL

In [None]:
# Binary sentiment classifier: embedding -> two stacked bidirectional LSTMs ->
# two dense ReLU layers -> a single-logit output.
model = tf.keras.Sequential()
# mask_zero=True: batches are zero-padded by padded_batch, so id 0 is padding.
# Masking lets the recurrent layers skip those positions instead of treating
# them as real tokens.
model.add(tf.keras.layers.Embedding(vocab_size, 128, mask_zero=True))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer: one unit with no activation, i.e. a raw logit for the positive
# class. The loss must therefore be configured with from_logits=True.
model.add(tf.keras.layers.Dense(1))


In [None]:
!rm -r /tmp/logs

In [None]:
# One TensorBoard log directory per run, named by start time.
logdir = os.path.join("/tmp/logs",datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir)
# Save the weights (not the full model) at the end of every epoch.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tmp/sentiment_analysis.hdf5',
    verbose=1,
    save_weights_only=True,
    save_freq='epoch')

# The model outputs raw logits, so both loss and metric must interpret them as such:
# - the loss uses from_logits=True;
# - accuracy thresholds at 0.0 (logit 0 == probability 0.5). The plain 'accuracy'
#   string would threshold the logits at 0.5 and misreport accuracy.
# The metric keeps the name 'accuracy' so history/log keys are unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [None]:
# Train for three epochs, validating on the held-out batches after each one;
# the callbacks write TensorBoard logs and per-epoch weight checkpoints.
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=3,
    callbacks=[tensorboard_callback, checkpointer],
)

Epoch 1/3

In [None]:
# Persist the trained model to /tmp (separate from the per-epoch weight checkpoints).
model.save('/tmp/final_sentiment_analysis.hdf5')

In [None]:
# List the .hdf5 files written to /tmp in long format, oldest first.
!ls -lart /tmp/*.hdf5

In [None]:
# Score the trained model on the held-out batches and report loss and accuracy.
eval_loss, eval_acc = model.evaluate(test_data)

summary = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc)
print(summary)