# Sentiment Analysis of IMDB Reviews
Perform sentiment analysis of reference dataset of 25k IMDB reviews both with transfer learning from a pretrained model and also using a model trained from scratch

## Imports

In [1]:
import re
import string

In [2]:
import os

In [3]:
os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=C:/Users/user/anaconda3"

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

In [5]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Definitions

### Constants

In [6]:
BATCH_SIZE=1000

### Functions

In [7]:
def standardize_text(text):
    lower = tf.strings.lower(text)
    nobreak = tf.strings.regex_replace(lower, '<br />', ' ')
    nopunct = tf.strings.regex_replace(nobreak, '[%s]' % re.escape(string.punctuation), '')
    return nopunct

In [8]:
def normalize_datum(text, label):
    return standardize_text(text), label

### Classes

## Load Data

In [9]:
read_config = tfds.ReadConfig(shuffle_seed = 42)

In [10]:
split_names = ['train', 'dev', 'test']

In [11]:
splits, info = tfds.load(
    name='imdb_reviews',
    split=[
        'train[:80%]', 'train[80%:]', 'test[:20%]'
    ],
    with_info=True,
    as_supervised=True,
    shuffle_files=True,
    download=True
)

In [12]:
assert all(isinstance(ds, tf.data.Dataset) for ds in splits), 'dataset load failed'

In [13]:
for name, split in zip(split_names,splits):
    print(name, len(split))

train 20000
dev 5000
test 5000


In [14]:
splits = dict(zip(split_names, splits))

In [15]:
for datum in splits['train'].take(2):
    print(normalize_datum(*datum))
    print()

(<tf.Tensor: shape=(), dtype=string, numpy=b'this was an absolutely terrible movie dont be lured in by christopher walken or michael ironside both are great actors but this must simply be their worst role in history even their great acting could not redeem this movies ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the columbian rebels were making their cases for revolutions maria conchita alonso appeared phony and her pseudolove affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining actors like christopher walkens good name i could barely sit through it'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)

(<tf.Tensor: shape=(), dtype=string, numpy=b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the sette and havi

## Preprocess Data

### Train

In [16]:
ds_train = splits['train'].map(normalize_datum, num_parallel_calls=tf.data.AUTOTUNE).take(len(splits['train']))
ds_train = ds_train.cache()
ds_train = ds_train.shuffle(len(ds_train))
ds_train = ds_train.batch(BATCH_SIZE)
ds_train = ds_train.prefetch(tf.data.AUTOTUNE)

In [17]:
N_BATCHES = len(ds_train)//BATCH_SIZE

### Dev

In [18]:
ds_dev = splits['dev'].map(normalize_datum, num_parallel_calls=tf.data.AUTOTUNE).take(len(splits['dev']))
ds_dev = ds_dev.batch(BATCH_SIZE)
ds_dev = ds_dev.cache()
ds_dev = ds_dev.prefetch(tf.data.AUTOTUNE)

### Test

In [19]:
ds_test = splits['test'].map(normalize_datum, num_parallel_calls=tf.data.AUTOTUNE).take(len(splits['test']))
ds_test = ds_test.batch(BATCH_SIZE)
ds_test = ds_test.cache()
ds_test = ds_test.prefetch(tf.data.AUTOTUNE)

## Use Pretrained Model

In [20]:
model_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'

In [21]:
embedding_layer = hub.KerasLayer(model_url, output_shape=[512], input_shape=[], dtype=tf.string)





In [22]:
ridge = 0.001
dropout = 0.5

In [23]:
model = tf.keras.models.Sequential([
    embedding_layer,
    tf.keras.layers.Dense(
        64,
        activation='relu',
        kernel_initializer="he_normal",
        kernel_regularizer=tf.keras.regularizers.l2(ridge)
    ),
    tf.keras.layers.Dropout(dropout),
    tf.keras.layers.Dense(
        64,
        activation='relu',
        kernel_initializer="he_normal",
        kernel_regularizer=tf.keras.regularizers.l2(ridge)
    ),
    tf.keras.layers.Dropout(dropout),
    tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(ridge))
])

In [24]:
metric_names = [
    "binary_crossentropy",
    "auc",
    "precision",
    "recall",
]

In [25]:
metrics = [
    tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
    tf.keras.metrics.AUC(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]

In [26]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=metrics
)

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 64)                32832     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,834,881
Trainable params: 37,057
Non-t

In [28]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True
)

In [29]:
history = model.fit(
    ds_train,
    validation_data=ds_dev,
    epochs=100,
    callbacks=[early_stopping]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 41: early stopping


In [30]:
train_metrics = model.evaluate(ds_train)



In [31]:
for n,m in zip(['loss']+metric_names, train_metrics):
    print(n,m)

loss 0.34326696395874023
binary_crossentropy 0.2976927161216736
auc 0.8983942270278931
precision 0.9009253978729248
recall 0.8300732374191284


In [32]:
dev_metrics = model.evaluate(ds_dev)



In [33]:
for n,m in zip(['loss']+metric_names, dev_metrics):
    print(n,m)

loss 0.3614262640476227
binary_crossentropy 0.3158520758152008
auc 0.8877383470535278
precision 0.8920429944992065
recall 0.8194389343261719


In [34]:
pretrained_model = model

## Train Model From Scratch

In [35]:
vocab_size = 10000
embedding_dim = 128

In [36]:
text_ds = splits['train']\
    .map(lambda x,y: x, num_parallel_calls=tf.data.AUTOTUNE)\
    .take(20000)

In [37]:
vectorizer = tf.keras.layers.TextVectorization(
    standardize=standardize_text,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=128
)
vectorizer.adapt(text_ds)

In [38]:
ridge = 0.01
dropout = 0.5

In [39]:
model = tf.keras.models.Sequential([
    vectorizer,
    tf.keras.layers.Embedding(vocab_size, embedding_dim, name='embedding'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(
        64,
        activation='relu',
        kernel_initializer="he_normal",
        kernel_regularizer=tf.keras.regularizers.l2(ridge)
    ),
    tf.keras.layers.Dropout(dropout),
    tf.keras.layers.Dense(
        64,
        activation='relu',
        kernel_initializer="he_normal",
        kernel_regularizer=tf.keras.regularizers.l2(ridge)
    ),
    tf.keras.layers.Dropout(dropout),
    tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(ridge))
])

In [40]:
metric_names = [
    "binary_crossentropy",
    "auc",
    "precision",
    "recall",
]

In [41]:
metrics = [
    tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
    tf.keras.metrics.AUC(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]

In [42]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=metrics
)

In [43]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True
)

In [44]:
history = model.fit(
    ds_train,
    validation_data=ds_dev,
    epochs=100,
    callbacks=[early_stopping]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 75: early stopping


In [45]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 128)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 128, 128)          1280000   
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 64)               

In [46]:
train_metrics = model.evaluate(ds_train)



In [47]:
for n,m in zip(['loss']+metric_names, train_metrics):
    print(n,m)

loss 0.15502309799194336
binary_crossentropy 0.023677241057157516
auc 0.999899685382843
precision 1.0
recall 0.9995987415313721


In [48]:
dev_metrics = model.evaluate(ds_dev)



In [49]:
for n,m in zip(['loss']+metric_names, dev_metrics):
    print(n,m)

loss 0.5079076290130615
binary_crossentropy 0.37656182050704956
auc 0.8671210408210754
precision 0.868321418762207
recall 0.8154879212379456


In [50]:
scratch_model = model

## Compare Performance

### Model Leveraging Transfer Learning from Google Universal Sentence Encoder

In [51]:
pretrained_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 64)                32832     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,834,881
Trainable params: 37,057
Non-t

In [53]:
pretrained_metrics = pretrained_model.evaluate(ds_test)



In [56]:
for n,m in zip(['loss']+metric_names, pretrained_metrics):
    print(n,m)

loss 0.3622795343399048
binary_crossentropy 0.3167053461074829
auc 0.892572283744812
precision 0.894598126411438
recall 0.8164328932762146


### Model Trained From Scratch Using Word Embedding With Max Pooling Aggregation to Sentence Level

In [54]:
scratch_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 128)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 128, 128)          1280000   
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 64)               

In [55]:
scratch_metrics = scratch_model.evaluate(ds_test)



In [57]:
for n,m in zip(['loss']+metric_names, scratch_metrics):
    print(n,m)

loss 0.5795894861221313
binary_crossentropy 0.4482436180114746
auc 0.8264347910881042
precision 0.8374094367027283
recall 0.7410821914672852


## Conclusions & Potential Next Steps

The model leveraging transfer learning from a pretrained model has 0.895 precision and 0.816 recall on the standard benchmark IMDB Review Sentiment Analysis task, while the model trained from scratch has 0.837 precision and 0.741 recall. Given how much more elaborate the Google Universal Sentence Encoder Model is than a 128-dimensional word-embedding that uses max-pooling to aggregate to a sentence embedding. Given that the Google model is a general purpose model, the fact that it still outperforms a bespoke model trained specifically on the same precise type of data used in the evaluation demonstrates how much more robust it is. That said, with more data and a more sophisticated approach to sentence aggregation, possibly utilizing context information (e.g., a bespoke model using analogous approach to either BERT or ELMo - in this case probably BERT as it is sentiment ), it should be possible to get that extra 10% improvement in the precision and recall metrics.