# Text classification using Tensorflow: Sentiment on Reviews


This notebook trains a binary sentiment classifier that labels restaurant food reviews as *negative* or *positive* from the review text. The sentiment label of each review was computed in an earlier step and is used here as the training target.


In [None]:
# Mount Google Drive under /content/drive (Colab-only module).
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import random
import numpy as np
import warnings
import pandas as pd
import shutil
import datetime
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Base DPI for saved figures (the plotting cells pass dpi=my_dpi*10 to savefig).
my_dpi = 96
# Report the CUDA compiler and GPU state through shell commands.
print('CUDA and NVIDIA GPU Information')
!/usr/local/cuda/bin/nvcc --version
!nvidia-smi
print('\n')
# Report library versions, eager mode and the number of GPUs TensorFlow can see.
print('TensorFlow version: {}'.format(tf.__version__))
print('Eager execution is: {}'.format(tf.executing_eagerly()))
print('Keras version: {}'.format(tf.keras.__version__))
print('Num GPUs Available: ', len(tf.config.list_physical_devices('GPU')))

CUDA and NVIDIA GPU Information
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
Sat May 21 21:10:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+------------

In [None]:
# Reproducibility in TF 
def init_seeds(seed=42):
    """Seed the Python, NumPy and TensorFlow RNGs and register a single-threaded TF1 session.

    Args:
        seed: Integer seed passed to ``random.seed``, ``np.random.seed`` and
            ``tf.random.set_seed``. Defaults to 42.

    Returns:
        The ``tf.compat.v1.Session`` that was registered as the Keras backend
        session.
    """
    np.random.seed(seed)
    random.seed(seed)

    # Environment flags asking TensorFlow/cuDNN for deterministic op implementations.
    os.environ['TF_CUDNN_DETERMINISTIC'] = 'True'
    os.environ['TF_DETERMINISTIC_OPS'] = 'True'
    tf.random.set_seed(seed)

    # Build the session config once, limiting both op thread pools to one thread.
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,
                                            inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(),
                                config=session_conf)
    tf.compat.v1.keras.backend.set_session(sess)
    return sess

In [None]:
# Seed every RNG once, before any shuffling or model construction further down.
init_seeds(seed=42)

<tensorflow.python.client.session.Session at 0x7f438d013a50>

In [None]:
# Switch to the Drive data folder; the relative parquet path read next resolves against it.
%cd /content/drive/MyDrive/Yelp_Reviews/Data/

/content/drive/MyDrive/Yelp_Reviews/Data


In [None]:
# Read tokenized reviews with sentiment
# The file is resolved relative to the current working directory.
df = pd.read_parquet('YelpReviews_NLP_sentimentNegPos_tokenized.parquet')
print('Number of rows and columns:', df.shape)
# Last expression of the cell: preview the first rows.
df.head()

Number of rows and columns: (829874, 3)


Unnamed: 0_level_0,cleanReview,sentiment,stars_reviews
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[order, chicken, finger, sub, honey, mustard, ...",Negative,3.0
3,"[dedicated, loving, memory, gary, feldman, gre...",Negative,5.0
12,"[absolutely, horrible, thought, would, order, ...",Negative,1.0
15,"[found, better, chicken, finger, know, inside,...",Negative,4.0
20,"[amazing, everything, tried, disappoint, chick...",Negative,5.0


In [None]:
# Examine how star reviews pairs with sentiment polarity
# Counts every (stars_reviews, sentiment) combination, most frequent first.
print(df[['stars_reviews', 'sentiment']].value_counts())

stars_reviews  sentiment
5.0            Positive     278011
1.0            Negative     231245
4.0            Positive     104585
2.0            Negative      90320
3.0            Negative      44097
4.0            Negative      26276
5.0            Negative      22999
3.0            Positive      22096
2.0            Positive       6690
1.0            Positive       3555
dtype: int64


In [None]:
# Join each token list into one space-separated string.
# Any comma inside a token is turned into a space as well, using a literal
# (non-regex) replacement in the same pass over the column.
df['cleanReview'] = df['cleanReview'].apply(
    lambda tokens: ' '.join(map(str, tokens)).replace(',', ' '))
df.head()

Unnamed: 0_level_0,cleanReview,sentiment,stars_reviews
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,order chicken finger sub honey mustard sauce p...,Negative,3.0
3,dedicated loving memory gary feldman greatest ...,Negative,5.0
12,absolutely horrible thought would order place ...,Negative,1.0
15,found better chicken finger know inside crisp ...,Negative,4.0
20,amazing everything tried disappoint chicken ca...,Negative,5.0


In [None]:
# Recode to binary: 'Negative' -> 0, 'Positive' -> 1 (any other value is left untouched).
# The result is assigned back explicitly; an in-place call on the selected
# column does not reliably write through to `df` (pandas copy-on-write).
is_negative = df['sentiment'] == 'Negative'
is_positive = df['sentiment'] == 'Positive'
df['sentiment'] = df['sentiment'].mask(is_negative, 0).mask(is_positive, 1)

print('Number of reviews per group:')
df[['sentiment']].value_counts()

Number of reviews per group:


sentiment
0            414937
1            414937
dtype: int64

In [None]:
# Drop the star rating column, then shuffle the remaining rows.
# No random_state is passed to shuffle(), so the row order is not pinned by this cell alone.
df = shuffle(df.drop(columns=['stars_reviews']))
df.shape

(829874, 2)

In [None]:
# Convert data types: review text to str, label to float32
df = df.astype({'cleanReview': 'str', 'sentiment': 'float32'})

In [None]:
# Define features and target
# features: the review text column; labels: the recoded sentiment column.
features = df['cleanReview']
labels = df['sentiment']

In [None]:
# Load pandas df to tf
# NOTE: `df` is rebound here from a pandas DataFrame to a tf.data.Dataset of
# (review, label) pairs; the DataFrame is no longer reachable under this name.
df = tf.data.Dataset.from_tensor_slices((features, labels))

In [None]:
# Define components for partitioning data
# Number of elements in the dataset.
df_size = df.cardinality().numpy()
# 80/10/10 split; int() truncates, so the three sizes can sum to slightly less than df_size.
train_size = int(0.8 * df_size)
val_size = int(0.10 * df_size)
test_size = int(0.10 * df_size)
# NOTE(review): with a batch size of 1 every example is its own training step — confirm this is intended.
BATCH_SIZE = 1

In [None]:
# Prepare train/test sets
# reshuffle_each_iteration=False keeps the order fixed across iterations, so the
# take/skip calls below always see the same sequence.
df = df.shuffle(800_000, seed=42, reshuffle_each_iteration=False)
# First train_size elements form the training split.
train = df.take(train_size).batch(BATCH_SIZE) 
# Everything after the training split; carved into test and validation below.
test = df.skip(train_size)
# Validation = the remainder left after skipping val_size elements of the hold-out.
val = test.skip(val_size).batch(BATCH_SIZE) 
# Test = the first test_size elements of the hold-out (rebinds `test`, so this must come last).
test= test.take(test_size).batch(BATCH_SIZE) 

# len() of a batched dataset is its number of batches.
print('Train data split:', len(train))
print('Eval data split', len(val))
print('Test data split', len(test))

Train data split: 663899
Eval data split 82988
Test data split 82987


# Sentiment analysis




### Prepare the data for training

Next, the reviews are tokenized and vectorized using the `tf.keras.layers.TextVectorization` layer. Tokenization refers to splitting strings into single words, or tokens. Vectorization converts the tokens into numbers as input for the neural network. Setting `output_mode` to `int` assigns a unique integer index to each token. `max_features` caps the size of the vocabulary, and `sequence_length` fixes the length to which every vectorized review is padded or truncated.

In [None]:
# Vocabulary cap and fixed output length (in tokens) of every vectorized review
max_features, sequence_length = 50000, 300

# Maps each review string to a row of `sequence_length` integer token indices
vectorize_layer = layers.TextVectorization(output_mode='int',
                                           max_tokens=max_features,
                                           output_sequence_length=sequence_length)

The preprocessing layer is fit to the training data by calling `adapt`, which makes the layer build an index that maps strings to integers. Only the training split is used, so the vocabulary is not influenced by the validation or test reviews.

In [None]:
# Make a review-only dataset and then call adapt
# The lambda drops the label (y) and keeps only the review text (x).
train_text = train.map(lambda x, y:x)
vectorize_layer.adapt(train_text)

Now, define a function to examine how the created layer preprocesses the data:



In [None]:
def vectorize_text(cleanReview, sentiment):
  """Vectorize review text with `vectorize_layer` and pass the label through.

  Args:
    cleanReview: String tensor holding the review text.
    sentiment: Label tensor; returned unchanged.

  Returns:
    A `(token_ids, sentiment)` tuple, where `token_ids` is the output of
    `vectorize_layer` applied to the text with a trailing axis added.
  """
  # Add a trailing axis before handing the text to the vectorization layer.
  review_column = tf.expand_dims(cleanReview, axis=-1)
  token_ids = vectorize_layer(review_column)
  return token_ids, sentiment

In [None]:
# Sample a batch of reviews and labels
# next(iter(train)) yields the first batch; index 0 picks its first example.
text_batch, label_batch = next(iter(train))
first_review, first_label = text_batch[0], label_batch[0]
print('Review', first_review)
print('Label', first_label)
# vectorize_text returns a (token ids, label) tuple, so the whole tuple is printed.
print('Vectorized review', vectorize_text(first_review, first_label))

Review tf.Tensor(b'waste time still business yet competition dripping springs area ordered delivery dozen time food usually okay delivery service horrible tried order three different occasion delivery told delivery driver called sick tonight placed order people told hour hour fifteen minute called hour minute later informed delivery driver leave work call replacement two hour later still food called back two people working friendly gm always mia wait another chinese place come town put business delivery fee also outrageous type service', shape=(), dtype=string)
Label tf.Tensor(0.0, shape=(), dtype=float32)
Vectorized review (<tf.Tensor: shape=(1, 300), dtype=int64, numpy=
array([[ 444,    7,   85,  193,  365, 2678, 2372, 2330,  123,   16,  230,
        1280,    7,    2,  288,  406,  230,    6,  122,  101,   11,  260,
         231,  936,  230,   82,  230,  789,  177,  549,  462,  616,   11,
          42,   82,   63,   63, 3115,   29,  177,   63,   29,  270, 1135,
         230,  789,  30

In [None]:
# Look up two example token indices and report the vocabulary size.
# get_vocabulary() returns the whole token list, so fetch it once and reuse it.
vocabulary = vectorize_layer.get_vocabulary()
print('12870 ---> ', vocabulary[12870])
print(' 3130 ---> ', vocabulary[3130])
print('Vocabulary size: {}'.format(len(vocabulary)))

12870 --->  spongey
 3130 --->  unsanitary
Vocabulary size: 50000


Then, apply the `TextVectorization` layer to the train, validation, and test sets.

In [None]:
# Vectorize the text of every split; labels pass through unchanged.
train_ds, val_ds, test_ds = [split.map(vectorize_text)
                             for split in (train, val, test)]

Configure the dataset for performance by using `.cache()` and `.prefetch()`.




In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each vectorized split and prefetch with an automatically tuned buffer size.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

### Define the model architecture, loss function and optimizer

The layers are stacked sequentially to build the classifier:

1. The first layer is an `Embedding` layer. This layer takes the integer-encoded reviews and looks up an embedding vector for each word-index. These vectors are learned as the model trains. The vectors add a dimension to the output array. The resulting dimensions are: `(batch, sequence, embedding)`. 
2. Next, a `GlobalAveragePooling1D` layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible.
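3. A `Dropout` layer with rate 0.2 is applied to the embeddings before pooling for regularization, and the pooled fixed-length vector is passed directly to the output layer; the model defined below has no intermediate fully-connected (`Dense`) hidden layer.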
4. The last layer is densely connected with a single output node.

#### Loss function and optimizer

For the loss function, `losses.BinaryCrossentropy` is used for binary classification and `adam` is used as the optimizer for training. Then, the model is configured to use the defined optimizer and a loss function:

In [None]:
# Define embedding layer size
# Length of the vector learned for each vocabulary index.
embedding_dim = 32

In [None]:
# Model architecture: embedding -> dropout -> mean over the sequence axis -> single logit
# NOTE(review): the saved output of this cell lists an extra `dropout_1` layer after
# pooling that this definition does not create — confirm which version produced the
# reported results.
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1),
])

# Compile model
# Dense(1) has no activation, so the loss consumes logits (from_logits=True) and
# accuracy thresholds the logit at 0.0, which corresponds to a probability of 0.5.
# `metrics` is passed as a list, the form Keras documents for compile().
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          1600032   
                                                                 
 dropout (Dropout)           (None, None, 32)          0         
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1,600,065
Trainable params: 1,600,065
Non-trainable params: 0
______________________________________________

In [None]:
# Set path for results
# Relative paths used afterwards (logs, checkpoints, saved model, figures) resolve against this folder.
%cd /content/drive/MyDrive/Yelp_Reviews/Models/DL/BoW/SentimentPolarity/Models/

In [None]:
# Set up logs
!rm -rf /logs/

%load_ext tensorboard

log_folder = 'logs/fit/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

In [None]:
# Set up callbacks
filepath = 'SP_BoW_weights_only_batch1_10epochs.h5'

# dirname of a bare file name is '', i.e. the current working directory.
checkpoint_dir = os.path.dirname(filepath)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_folder,
                                                      histogram_freq=1)

# EarlyStopping: stop once val_loss has not improved for 3 consecutive epochs.
# ModelCheckpoint: `monitor` only matters together with save_best_only, so the file
# is overwritten only when validation accuracy improves (mode='max'); and
# save_weights_only makes the file content match its "weights_only" name.
callbacks_list = [EarlyStopping(monitor='val_loss', patience=3),
                  ModelCheckpoint(filepath, monitor='val_binary_accuracy',
                                  mode='max', save_best_only=True,
                                  save_weights_only=True,
                                  save_freq='epoch'),
                  tensorboard_callback]

### Train the model


In [None]:
# Train for at most 10 epochs; the callbacks may stop earlier, write checkpoints and log to TensorBoard.
# `history` is kept for the loss/accuracy plots that follow.
history = model.fit(train_ds, validation_data=val_ds, 
                    epochs=10, callbacks=callbacks_list)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
# Save model
# The '.h5' suffix selects the HDF5 format, so no save_format is passed; a
# SavedModel would need a target path without the '.h5' suffix.
model.save('./SP_BoW_batch1_10epochs_tf.h5')

# Load model for more training or later use
#filepath = 'SP_BoW_weights_only_batch1_10epochs.h5'
#model = tf.keras.models.load_model('./SP_BoW_batch1_10epochs_tf.h5')
#model.load_weights(filepath)

# Show the model architecture
#model.summary()

### Plot loss and accuracy over training

In [None]:
# Per-epoch metric values recorded by fit(), keyed by metric name.
history_dict = history.history
history_dict.keys()

dict_keys(['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy'])

In [None]:
# Plot loss over epochs (training first, then validation, matching the legend order)
fig, ax = plt.subplots()
for series in ('loss', 'val_loss'):
    ax.plot(history.history[series])
ax.set_title('Model Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend(['Train', 'Eval'], loc='upper right')
# Save before show() so the written file contains the rendered figure.
fig.savefig('SP_BoW_batch1_10Epochs_Loss.png', dpi=my_dpi*10,
            bbox_inches='tight')
plt.show()

In [None]:
# Plot accuracy over epochs
acc, val_acc = history_dict['binary_accuracy'], history_dict['val_binary_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
# Save before show() so the written file contains the rendered figure.
fig.savefig('SP_BoW_batch1_10Epochs_Accuracy.png', dpi=my_dpi*10,
            bbox_inches='tight')
plt.show()

### Evaluate the model on the test set

In [None]:
# evaluate() returns the loss followed by the compiled metric (binary accuracy).
# This rebinds `loss`, which previously held the per-epoch training losses.
loss, accuracy = model.evaluate(test_ds)

print('Loss:', loss)
print('Accuracy:', accuracy)

Loss:  0.005004620645195246
Accuracy:  0.9986383318901062


This approach yields an accuracy around 99.86%.

## Export the model

To export the model for deployment, a new model is created using the trained weights with the `TextVectorization` layer inside the model, allowing it to process text strings.

In [None]:
# Chain the text vectorizer, the trained model and a sigmoid so that raw strings go in
# and sigmoid outputs come out.
export_model = tf.keras.Sequential([vectorize_layer,
                                    model, layers.Activation('sigmoid')])

# The sigmoid is applied inside export_model, hence from_logits=False for the loss.
export_model.compile(loss=losses.BinaryCrossentropy(from_logits=False), 
                     optimizer='adam', metrics=['accuracy'])

# Evaluate on test set, which yields raw strings
loss, accuracy = export_model.evaluate(test)
print(accuracy)

0.9986383318901062


### Inference on new data

In [None]:
# Print the first 10 batches of the raw-text test split as (text, label) pairs.
for row in test.take(10):
  print(row)

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'irresistible almond croissant delicious cafe au lait'],
      dtype=object)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'one favorite vietnamese place area food always delicious come quickly love spring roll peanut sauce vegetarian vermicelli bowl'],
      dtype=object)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'love place great food always good service bar like cheer bar everyone know everyone great drink fun karaoke wonderful sweet kj every fun theme party bartender forever make feel welcome come join fun'],
      dtype=object)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'pleasant service nice atmosphere quite tasty food great chianti limoncello mussel diavolo mushroom risot

In [None]:
# Ten raw review strings to score with the export model.
# Every entry ends with a comma: adjacent string literals without one are silently
# concatenated into a single element.
examples = [
  'irresistible almond croissant delicious cafe au lait',
  'one favorite vietnamese place area food always delicious come quickly love spring roll peanut sauce vegetarian vermicelli bowl',
  'love place great food always good service bar like cheer bar everyone know everyone great drink fun karaoke wonderful sweet kj every fun theme party bartender forever make feel welcome come join fun',
  'pleasant service nice atmosphere quite tasty food great chianti limoncello mussel diavolo mushroom risotto crab stuffed shrimp special everything flavorful fresh probably return',
  'read review went anyway wasted time walk two hour getting food look obviously one time understaffed night like told problem',
  'rather pricey sushi place mean sushi pricey anyways know cheap sushi well probably hanging bathroom awhile might well splurge little place small definitely limited seating either make reservation go early probably freshest sushi atl',
  'great pizza place adorable atmosphere best dessert pizza omg nutella berry cookie butter uh amazing',
  'tasteless donut take food home avoid drug user drug plain sight passed inside store table',
  'great coffee pastry music',
  'yum yum mom shared antipasto platter salad everything excellent great atmosphere delicious fresh food highly recommend',
]

# One sigmoid output per example.
export_model.predict(examples)

array([[9.9999833e-01],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0145197e-16],
       [1.0000000e+00],
       [7.8159779e-10],
       [9.9995291e-01],
       [1.0000000e+00]], dtype=float32)