In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

2022-12-17 14:11:35.966855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-17 14:11:36.395227: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-17 14:11:36.395265: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-17 14:11:37.695687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [2]:
# Load the movie-review table from the local CSV file.
df = pd.read_csv(
    'dataset/movie_data.csv',
    encoding='utf-8',
)

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,review,sentiment
0,"After five years in prison, Tony le Stéphanois...",1
1,I am a fan of Ed Harris' work and I really had...,0
2,I can appreciate what Barney is trying to achi...,0
3,First let me say the director has some wonderf...,1
4,"More wide-eyed, hysterical 50s hyper-cheerfuln...",0


In [4]:
# STEP 1: CREATE A DATASET

# Select the columns instead of calling df.pop('sentiment'): pop() removes the
# column from df in place, so running this cell a second time raised KeyError.
# df is left untouched here, which keeps the cell safe to re-run.
target = df['sentiment']
df_features = df.drop(columns=['sentiment'])

# Each dataset element is a (feature row, label) pair.
ds_raw = tf.data.Dataset.from_tensor_slices((df_features.values, target.values))

2022-12-17 14:11:44.884467: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-12-17 14:11:44.884553: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: harsh-IdeaPad-Gaming3-15ARH05D
2022-12-17 14:11:44.884568: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: harsh-IdeaPad-Gaming3-15ARH05D
2022-12-17 14:11:44.884751: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.65.1
2022-12-17 14:11:44.884810: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: NOT_FOUND: could not find kernel module information in driver version file contents: "NVRM version: NVIDIA UNIX Open Kernel Module for x86_64  515.65.01  Release Build  (dvs-builder@U16-T11-05-2)  Wed Jul 20 13:54:56 UTC 2022
GCC version:

In [5]:
# INSPECTION: show the first 50 bytes of the first three reviews and their labels
for review, label in ds_raw.take(3):
    review_text = review.numpy()[0]
    tf.print(review_text[:50], label)

b'After five years in prison, Tony le St\xc3\xa9phanois (J' 1
b"I am a fan of Ed Harris' work and I really had hig" 0
b'I can appreciate what Barney is trying to achieve,' 0


In [6]:
# 25K for evaluation, 20K for training and 5K for validation

tf.random.set_seed(1)

# reshuffle_each_iteration=False keeps the shuffled order fixed across
# iterations, so the take()/skip() splits below stay consistent.
ds_raw = ds_raw.shuffle(buffer_size=50000, reshuffle_each_iteration=False)

# First 25,000 shuffled examples -> test; the remainder -> train + validation.
ds_raw_test = ds_raw.take(count=25000)
ds_raw_train_valid = ds_raw.skip(count=25000)

# Of the remainder: first 20,000 -> train, the rest -> validation.
ds_raw_train = ds_raw_train_valid.take(count=20000)
ds_raw_valid = ds_raw_train_valid.skip(count=20000)

In [7]:
# STEP 2: FIND UNIQUE TOKENS (WORDS)

from collections import Counter

tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()

# Count token occurrences over the training split only.
for review, _ in ds_raw_train:
    review_text = review.numpy()[0]
    token_counts.update(tokenizer.tokenize(review_text))

print('Vocab size: ', len(token_counts))

Vocab size:  87075


In [8]:
# STEP 3: ENCODING UNIQUE TOKENS TO INTEGERS

# Build the text-to-integer encoder from the tokens counted on the training split.
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

# Quick check on a short sentence.
example_str = 'This is an example'
example_ids = encoder.encode(example_str)
print(example_ids)

[159, 59, 191, 905]


In [9]:
# STEP 3-A: DEFINE THE FUNCTION FOR TRANSFORMATION

def encode(text_tensor, label):
    """Encode one raw review into integer token ids.

    Element 0 of ``text_tensor.numpy()`` is taken as the review text and run
    through the module-level ``encoder``; ``label`` is passed through unchanged.

    Returns:
        A ``(encoded_text, label)`` pair.
    """
    raw_text = text_tensor.numpy()[0]
    return encoder.encode(raw_text), label

In [10]:
# STEP 3-B: WRAP THE ENCODE FUNCTION TO A TF Op.

def encode_map_fn(text, label):
    """Wrap the eager ``encode`` function in ``tf.py_function`` for use with ``Dataset.map``.

    Both outputs (token ids and label) are declared as ``tf.int64``.
    """
    output_types = (tf.int64, tf.int64)
    return tf.py_function(func=encode, inp=[text, label], Tout=output_types)

# Apply the same encoding to all three splits.
ds_train, ds_valid, ds_test = (
    raw_split.map(encode_map_fn)
    for raw_split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

# look at the shape of some examples
tf.random.set_seed(1)
for token_ids, _ in ds_train.shuffle(100).take(5):
    print('Sequence lenght: ', token_ids.shape)

Sequence lenght:  (170,)
Sequence lenght:  (120,)
Sequence lenght:  (819,)
Sequence lenght:  (610,)
Sequence lenght:  (182,)


In [11]:
# we need to make sure that the sequences in a mini-batch have the
# same length to store them efficiently in a tensor

# the padded_batch() method pads the consecutive elements that are 
# to be combined in a batch

# take a small subset (first 8 encoded training examples)
ds_subset = ds_train.take(count=8)

for token_ids, _ in ds_subset:
    print('Individual size: ', token_ids.shape)

Individual size:  (253,)
Individual size:  (236,)
Individual size:  (339,)
Individual size:  (297,)
Individual size:  (124,)
Individual size:  (88,)
Individual size:  (127,)
Individual size:  (241,)


In [12]:
# dividing the dataset into batches of 4; the [-1] entry leaves the sequence
# length unspecified so each batch is padded to its own longest sequence

ds_batched = ds_subset.padded_batch(batch_size=4, padded_shapes=([-1], []))

for padded_ids, _ in ds_batched:
    print('Batch dimension: ', padded_ids.shape)

Batch dimension:  (4, 339)
Batch dimension:  (4, 241)


In [13]:
# lets divide the three datasets into mini-batches with a batch size of 32
train_data, valid_data, test_data = (
    encoded_split.padded_batch(32, padded_shapes=([-1], []))
    for encoded_split in (ds_train, ds_valid, ds_test)
)

In [14]:
# example of embedding

from tensorflow.keras.layers import Embedding

# Toy model with a single embedding layer: 100 possible token ids, each mapped
# to a 6-dimensional vector, for input sequences of length 20.
model = tf.keras.Sequential([
    Embedding(
        input_dim=100,
        output_dim=6,
        input_length=20,
        name='embed-layer',
    ),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, 20, 6)             600       
                                                                 
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


BUILDING AN **RNN MODEL**

In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense

# Two stacked SimpleRNN layers: the first returns the full sequence so the
# second has a sequence to consume; the second returns only its final output.
model = Sequential([
    Embedding(input_dim=1000, output_dim=32),
    SimpleRNN(32, return_sequences=True),
    SimpleRNN(32),
    Dense(1),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          32000     
                                                                 
 simple_rnn (SimpleRNN)      (None, None, 32)          2080      
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


Building an **RNN MODEL** for the **Sentiment Analysis Task**

In [16]:
# we shall use the Bidirectional wrapper, which makes the recurrent layer
# process the input sequences in both directions
embedding_dim = 20
vocab_size = len(token_counts) + 2

tf.random.set_seed(1)

# build the model layer by layer
bi_lstm_model = tf.keras.Sequential()

bi_lstm_model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='embed-layer',
    )
)

lstm_layer = tf.keras.layers.LSTM(64, name='lstm-layer')
bi_lstm_model.add(tf.keras.layers.Bidirectional(lstm_layer, name='bidir-lstm'))

bi_lstm_model.add(tf.keras.layers.Dense(64, activation='relu'))

# single sigmoid unit for the binary sentiment label
bi_lstm_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

bi_lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, None, 20)          1741540   
                                                                 
 bidir-lstm (Bidirectional)  (None, 128)               43520     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,793,381
Trainable params: 1,793,381
Non-trainable params: 0
_________________________________________________________________


In [17]:
# The model's last layer already applies a sigmoid, hence from_logits=False.
adam_optimizer = tf.keras.optimizers.Adam(1e-3)
bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

bi_lstm_model.compile(optimizer=adam_optimizer, loss=bce_loss, metrics=['accuracy'])

In [18]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
history = bi_lstm_model.fit(x=train_data, validation_data=valid_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# evaluate on the test data; index 1 of the result is the accuracy metric
# (index 0 is the loss)
test_results = bi_lstm_model.evaluate(test_data)
test_accuracy = test_results[1]
print('Test acc.: {:.2f}%'.format(test_accuracy*100))

Test acc.: 83.04%


In [20]:
# TRYING SOMETHING NEW

# we shall use only the last 'max_seq_length' tokens if the review is 
# longer than it

from collections import Counter
def preprocess_datasets(ds_raw_train, ds_raw_valid, ds_raw_test, max_seq_length=None, batch_size=32):
    """Tokenize, integer-encode and batch the three raw dataset splits.

    The vocabulary is built from ``ds_raw_train`` only. When ``max_seq_length``
    is given, only the last ``max_seq_length`` tokens of each review are kept,
    both when counting tokens and when encoding.

    Args:
        ds_raw_train: Raw training dataset of (text, label) elements; element 0
            of the text component's ``.numpy()`` value is used as the review.
        ds_raw_valid: Raw validation dataset with the same structure.
        ds_raw_test: Raw test dataset with the same structure.
        max_seq_length: Positive number of trailing tokens to keep per review,
            or ``None`` to keep every token.
        batch_size: Number of examples per padded batch.

    Returns:
        A tuple ``(train_data, valid_data, test_data, n_tokens)`` where the
        first three are padded-batch datasets and ``n_tokens`` is the number of
        distinct tokens counted on the training split.

    Raises:
        ValueError: If ``max_seq_length`` is given but is not positive.
    """
    # Guard the slice below: tokens[-0:] would keep the WHOLE sequence (silently
    # disabling truncation) and a negative value would drop tokens from the
    # wrong end, so reject both up front.
    if max_seq_length is not None and max_seq_length <= 0:
        raise ValueError(
            'max_seq_length must be a positive integer or None, got {!r}'.format(max_seq_length)
        )

    # finding unique tokens
    tokenizer = tfds.deprecated.text.Tokenizer()
    token_counts = Counter()

    for example in ds_raw_train:
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        if max_seq_length is not None:
            tokens = tokens[-max_seq_length:]
        token_counts.update(tokens)

    print('Voacb size: ', len(token_counts))

    # encoding the texts
    encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

    def encode(text_tensor, label):
        text = text_tensor.numpy()[0]
        encoded_text = encoder.encode(text)
        if max_seq_length is not None:
            encoded_text = encoded_text[-max_seq_length:]

        return encoded_text, label

    def encode_map_fn(text, label):
        return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

    def encode_and_batch(ds_raw_split):
        # Encode one split, then pad every batch to its own longest sequence.
        encoded = ds_raw_split.map(encode_map_fn)
        return encoded.padded_batch(batch_size, padded_shapes=([-1], []))

    train_data = encode_and_batch(ds_raw_train)
    valid_data = encode_and_batch(ds_raw_valid)
    test_data = encode_and_batch(ds_raw_test)

    return (train_data, valid_data, test_data, len(token_counts))


In [23]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
# GRU = gated recurrent unit (a gated variant of the simple RNN)

def build_rnn_model(embedding_dim, vocab_size, 
                    recurrent_type='SimpleRNN', 
                    n_recurrent_units=64, 
                    n_recurrent_layers=1, 
                    bidirectional=True):
    """Build an embedding -> stacked recurrent layers -> dense binary classifier.

    Args:
        embedding_dim: Output dimension of the embedding layer.
        vocab_size: Input dimension of the embedding layer.
        recurrent_type: One of 'SimpleRNN', 'LSTM' or 'GRU'.
        n_recurrent_units: Number of units in every recurrent layer.
        n_recurrent_layers: Number of stacked recurrent layers (at least 1).
        bidirectional: If true, wrap every recurrent layer in ``Bidirectional``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose last layer is a
        single sigmoid unit.

    Raises:
        ValueError: If ``recurrent_type`` is not supported or
            ``n_recurrent_layers`` is smaller than 1.
    """
    # Validate before building anything. Previously an unknown recurrent_type
    # fell through the if/elif chain and failed later with an unbound
    # `recurrent_layer`, and n_recurrent_layers < 1 silently produced a model
    # without any recurrent layer.
    supported_types = ('SimpleRNN', 'LSTM', 'GRU')
    if recurrent_type not in supported_types:
        raise ValueError(
            'recurrent_type must be one of {}, got {!r}'.format(supported_types, recurrent_type)
        )
    if n_recurrent_layers < 1:
        raise ValueError(
            'n_recurrent_layers must be at least 1, got {!r}'.format(n_recurrent_layers)
        )

    tf.random.set_seed(1)

    layer_classes = {'SimpleRNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}
    layer_cls = layer_classes[recurrent_type]
    # Yields the same layer names as before: 'simplernn-layer-i',
    # 'lstm-layer-i' and 'gru-layer-i'.
    name_prefix = recurrent_type.lower()

    # build the model
    model = tf.keras.Sequential()
    model.add(
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            name='embed-layer'
        )
    )

    for i in range(n_recurrent_layers):
        # Every layer except the last must emit the full sequence so the next
        # recurrent layer has a sequence to consume.
        return_sequences = (i < n_recurrent_layers-1)

        recurrent_layer = layer_cls(
            units=n_recurrent_units,
            return_sequences=return_sequences,
            name='{}-layer-{}'.format(name_prefix, i)
        )

        if bidirectional:
            recurrent_layer = Bidirectional(
                recurrent_layer,
                name='bidir-'+recurrent_layer.name
            )

        model.add(recurrent_layer)

    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    return model

In [24]:
# Hyperparameters for the truncated-sequence experiment.
batch_size = 32
embedding_dim = 20
max_seq_length = 100

# Rebuild the batched splits, keeping only the last `max_seq_length` tokens per review.
train_data, valid_data, test_data, n = preprocess_datasets(
    ds_raw_train,
    ds_raw_valid,
    ds_raw_test,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
)

vocab_size = n + 2

rnn_model = build_rnn_model(
    embedding_dim,
    vocab_size,
    recurrent_type='SimpleRNN',
    n_recurrent_units=64,
    n_recurrent_layers=1,
    bidirectional=True,
)

rnn_model.summary()

Voacb size:  58027
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, None, 20)          1160580   
                                                                 
 bidir-simplernn-layer-0 (Bi  (None, 128)              10880     
 directional)                                                    
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,179,781
Trainable params: 1,179,781
Non-trainable params: 0
_________________________________________________________________


In [25]:
# The model's last layer already applies a sigmoid, hence from_logits=False.
rnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

# Train for 10 epochs, evaluating on the validation split after each epoch.
history = rnn_model.fit(x=train_data, validation_data=valid_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Evaluate on the test split; index 1 of the result is the accuracy metric
# (index 0 is the loss).
results = rnn_model.evaluate(test_data)
test_accuracy = results[1]
print('Test Acc.: {:.2f}%'.format(test_accuracy*100))

Test Acc.: 76.80%
