In [1]:
# Mount Google Drive at /content/gdrive; the data-loading cell further down
# reads the review file from under gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import io
import os
import re
import shutil
import string
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import tensorflow.keras as keras

from tensorflow.python.keras.backend import dtype
from tensorflow.python.keras.layers.merge import concatenate

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers, Model,losses
from sklearn.model_selection import train_test_split



In [3]:
# Report the library versions and hardware this run is using.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is NOT AVAILABLE


In [36]:
# Parse the tab-separated review file once. `train_df` keeps every column of
# the raw file; `df` is the three-column working frame used by the rest of
# the notebook. (The file was previously read from Drive twice.)
train_df = pd.read_csv('gdrive/MyDrive/datasets/amazon_reviews.txt', index_col=False, delimiter="\t")

# Columns used downstream: the review text, its star rating and the target label.
features = ["REVIEW_TEXT","RATING","LABEL"]
df = train_df[features]

In [37]:
# Quick sanity check: (rows, columns) followed by the first few records.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

(21000, 3)


Unnamed: 0,REVIEW_TEXT,RATING,LABEL
0,"When least you think so, this product will sav...",4,0
1,Lithium batteries are something new introduced...,4,0
2,I purchased this swing for my baby. She is 6 m...,3,0
3,I was looking for an inexpensive desk calcolat...,4,0
4,I only use it twice a week and the results are...,4,0


In [38]:

# Split the data 80:10:10 into train / validation / test sets.
SEED = 42            # fixed seed so the split is identical on every run
train_val_size = 0.8  # fraction of all rows that goes to the training set
# Validation and test should each be 10% of the overall data, i.e. the
# remaining 20% is split in half.
test_size = 0.5

X = df.copy()

# Step 1: carve off the training set; the rest is shared by validation and test.
X_train, X_rem = train_test_split(X, train_size=train_val_size, random_state=SEED)

# Step 2: split the remainder evenly into validation and test.
X_valid, X_test = train_test_split(X_rem, test_size=test_size, random_state=SEED)

# Every row must land in exactly one split.
assert len(X_train) + len(X_valid) + len(X_test) == len(X)

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

(16800, 3)
(2100, 3)
(2100, 3)


In [9]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Turn a pandas DataFrame into a batched tf.data dataset.

  The input frame is copied, so the caller's DataFrame is not modified.

  Args:
    dataframe: frame containing at least the 'LABEL' and 'RATING' columns.
      Whatever columns remain after those two are removed form the first
      component (for the frames built in this notebook: REVIEW_TEXT only).
    shuffle: when true, shuffle with a buffer as large as the frame.
    batch_size: number of rows per batch.

  Returns:
    A dataset whose elements are (remaining_columns, rating, label) batches.

  Note:
    Keras `fit`/`evaluate` read a 3-tuple element as
    (inputs, targets, sample_weights), so drop or regroup the rating before
    handing this dataset to a model.
  """
  frame = dataframe.copy()
  label_column = frame.pop('LABEL')
  rating_column = frame.pop('RATING')
  n_rows = len(frame)

  components = (
      tf.data.Dataset.from_tensor_slices(frame),
      tf.data.Dataset.from_tensor_slices(rating_column),
      tf.data.Dataset.from_tensor_slices(label_column),
  )
  dataset = tf.data.Dataset.zip(components)
  if shuffle:
    dataset = dataset.shuffle(buffer_size=n_rows)
  return dataset.batch(batch_size)

def custom_standardization(input_data):
  """Normalise raw review text before tokenisation.

  Steps, in order: lower-case the text, replace each '<br />' with a single
  space, then delete every character listed in `string.punctuation`.

  Args:
    input_data: string tensor of raw text.

  Returns:
    String tensor with the cleaned text.
  """
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_class, '')
  return text

In [10]:
# Build one batched dataset per split, all with the same batch size.
batch_size = 32
train_ds, val_ds, test_ds = (
    df_to_dataset(split_frame, batch_size=batch_size)
    for split_frame in (X_train, X_valid, X_test)
)

In [13]:
# Show the first three examples of a single training batch.
for text_batch, rating_batch, label_batch in train_ds.take(1):
  # Convert each tensor to NumPy once instead of once per printed row.
  reviews = text_batch.numpy()
  ratings = rating_batch.numpy()
  labels = label_batch.numpy()
  for i in range(3):
    print("Review", reviews[i])
    print("Rating", ratings[i])
    print("Label", labels[i])

Review [b"Can't find Vue cups anywhere!  My local grocers doesn't have it and there's not much of a choice online.  Keurig, please don't sell something that you can't provide for.  Apparently K cups are the way to go."]
Rating 2
Label 1
Review [b'A perfect gift for everyone who loves their kitchen. A must buy product for every kitchen. A Sturdy enamel-coated cast-iron body Apple peeler which has ABS plastic handles. This solid basic tool is unique and very useful at the time of peeling off the fruits!']
Rating 5
Label 0
Review [b'These earrings would be nice for babies. I accidentally purchased these for my 2 year old daughter and they fit almost snug on her ear. Overall the product does look like the photo and the earrings are reliable and do not fade.']
Rating 4
Label 1


In [14]:
# Vocabulary cap and the fixed number of token ids produced per review.
max_features = 10000
sequence_length = 250

# Text -> integer token ids, using the custom cleaning function defined above.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [15]:
# The vocabulary is learned from the training text alone, so strip the rating
# and label components before calling adapt.
def _text_only(text, rating, labels):
  """Keep only the text component of a (text, rating, labels) element."""
  return text

train_text = train_ds.map(_text_only)
vectorize_layer.adapt(train_text)

In [21]:
def vectorize_text(text, rating ,label):
  """Replace the raw text with its token ids; rating and label pass through.

  Args:
    text: string tensor fed to the module-level `vectorize_layer`.
    rating: returned unchanged.
    label: returned unchanged.

  Returns:
    Tuple (token_ids, rating, label).
  """
  token_ids = vectorize_layer(text)
  return token_ids, rating, label

In [22]:
# Retrieve one batch (reviews, ratings and labels) from the training dataset
# and look at its first example, both raw and vectorized.
text_batch, rating_batch, label_batch = next(iter(train_ds))
first_review, first_rating, first_label = text_batch[0], rating_batch[0], label_batch[0]
print("Review", first_review)
print("Rating", first_rating.numpy())
print("Label", first_label.numpy())
# Pass the first example's own rating (not the whole rating batch) so the
# printed triple describes a single review.
print("Vectorized review", vectorize_text(first_review, first_rating, first_label))

Review tf.Tensor([b'a very useful dispenser, with kids using it-not recommended without some supervision due to it being made primarily of plastic'], shape=(1,), dtype=string)
Rating 5
Label 1
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[   5,   25,  546, 3987,   15,  301,  106,    1,  330,  174,   72,
        9536,  550,    6,    7,  220,  100, 2781,   10,  267,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,


In [23]:
# Fetch the learned vocabulary once and reuse it for every lookup below.
vocabulary = vectorize_layer.get_vocabulary()

print("1287 ---> ", vocabulary[1287])
print(" 313 ---> ", vocabulary[313])
print('Vocabulary size: {}'.format(len(vocabulary)))

1287 --->  manufacturer
 313 --->  wont
Vocabulary size: 10000


In [31]:
# Display the dataset's structure: each element is a (text, rating, label) batch.
train_ds

<BatchDataset shapes: ((None, 1), (None,), (None,)), types: (tf.string, tf.int64, tf.int64)>

In [24]:
def _to_model_inputs(text, rating, label):
  """Vectorize the text and return an (inputs, target) pair for Keras.

  Keras reads a 3-tuple dataset element as (inputs, targets, sample_weights).
  Passing (tokens, rating, label) straight through would therefore train the
  model on RATING and use LABEL as a per-example weight. The model in this
  notebook consumes text only, so the rating is dropped here.
  """
  token_ids, _, label = vectorize_text(text, rating, label)
  return token_ids, label

train_ds_vec = train_ds.map(_to_model_inputs)
val_ds_vec = val_ds.map(_to_model_inputs)
test_ds_vec = test_ds.map(_to_model_inputs)

In [25]:
AUTOTUNE = tf.data.AUTOTUNE

# Upper bound on the number of cached batches held in the shuffle buffer;
# it only needs to be at least the number of training batches.
SHUFFLE_BUFFER_BATCHES = 1024

# cache() replays exactly what it saw on the first pass, so any shuffling done
# before it is frozen after epoch 1. Re-shuffle the cached training batches so
# the batch order still changes from epoch to epoch.
train_ds_vec = (
    train_ds_vec
    .cache()
    .shuffle(buffer_size=SHUFFLE_BUFFER_BATCHES)
    .prefetch(buffer_size=AUTOTUNE)
)
# Evaluation data needs no reshuffling: cache it and overlap loading with compute.
val_ds_vec = val_ds_vec.cache().prefetch(buffer_size=AUTOTUNE)
test_ds_vec = test_ds_vec.cache().prefetch(buffer_size=AUTOTUNE)

In [26]:
# Length of each learned token embedding vector; passed to layers.Embedding
# when the model is built.
embedding_dim = 16

In [32]:
# Bag-of-embeddings classifier: embed each token, average over the sequence,
# then map the pooled vector to a single output. The final Dense layer has no
# activation, so the model emits a raw logit.
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [34]:
# The model outputs logits, hence from_logits=True for the loss and a 0.0
# decision threshold (logit 0 corresponds to probability 0.5) for accuracy.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)],
)

In [35]:
# Train for a fixed number of passes over the training data, scoring the
# validation split after each one; `history` records the per-epoch metrics.
epochs = 10
history = model.fit(train_ds_vec, epochs=epochs, validation_data=val_ds_vec)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the trained model on the vectorized validation split.
evaluation = model.evaluate(val_ds_vec)
loss, accuracy = evaluation

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
# history.history is a dict of per-epoch values; list which series were
# recorded (the plotting cells below read four of them by key).
history_dict = history.history
history_dict.keys()

In [None]:
# Pull the per-epoch series out of the training history. `acc`, `val_acc` and
# `epochs` are reused by the accuracy plot in the next cell.
acc, val_acc = history_dict['binary_accuracy'], history_dict['val_binary_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# x axis: epoch numbers starting at 1
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
# 'bo' draws blue dots, 'b' a solid blue line
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()

In [None]:
# Training vs. validation accuracy per epoch (dots = training, line = validation).
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')

plt.show()

In [None]:
# Wrap the trained model so it accepts raw strings and returns probabilities:
# vectorization first, then the trained network, then a sigmoid on its logit.
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

# The wrapper now outputs probabilities, so the loss uses from_logits=False.
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Evaluate on the raw-string validation split. `val_ds` yields
# (text, rating, label) and Keras would read that as
# (inputs, targets, sample_weights), i.e. score predictions against RATING.
# Keep only (text, label) so accuracy is measured against the real target.
raw_val_ds = val_ds.map(lambda text, rating, label: (text, label))
loss, accuracy = export_model.evaluate(raw_val_ds)
print(accuracy)