<a href="https://colab.research.google.com/github/bhargav80/ML-DL/blob/main/imdb_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.datasets import imdb

# Load the word index
word_index = imdb.get_word_index()

# Reverse the word index to get a mapping from integers to words
reverse_word_index = {value: key for key, value in word_index.items()}

# Add special tokens for padding, start of sequence, and unknown words
reverse_word_index[0] = '<PAD>'
reverse_word_index[1] = '<START>'
reverse_word_index[2] = '<UNK>'
reverse_word_index[3] = '<UNUSED>'


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [3]:
def decode_review(encoded_review):
    return ' '.join([reverse_word_index.get(i, '?') for i in encoded_review])


In [4]:
# Load the dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# Decode and print a sample review
sample_review = x_train[0]
print("Encoded review:", sample_review)
print("Decoded review:", decode_review(sample_review))
print("Label:", y_train[0])


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Encoded review: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 



*   label 0 : Negative
*   label 1 : Positive



# Create embedding

In [5]:
maxlen = 100
from tensorflow.keras import layers
embedding = layers.Embedding(input_dim = 10000,
                                   embeddings_initializer = "uniform",
                                   output_dim = 32, input_length = maxlen)

In [None]:
# Get a random sentence from training set
import random
random_sentence = random.choice(x_train)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")
random_sentence = tf.expand_dims(random_sentence, axis=0)
# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(random_sentence)
sample_embed

Original text:
[1, 13, 122, 24, 124, 18, 49, 58, 11, 61, 1940, 32, 15, 100, 11, 831, 30, 573, 44, 14, 22, 190, 4, 771, 7, 231, 6, 22, 16, 24, 51, 11, 192, 2266, 61, 692, 51, 93, 14, 1270, 431, 31, 4, 91, 423, 108, 60, 8, 14, 55, 251, 15, 13, 28, 126, 110, 16, 7, 4, 2, 9856, 5, 4, 2898, 8, 28, 3009, 11, 41, 2, 2469, 14, 22, 9, 24, 210, 51, 12, 186, 5, 15, 9, 382, 17, 12, 144, 30, 190, 13, 2488, 135, 195, 18, 4, 3158, 2, 34, 6334, 6726, 11, 4824, 7, 4, 6336, 15, 29, 99, 62, 7, 9498, 27, 113, 8, 607, 39, 7623, 7, 4, 432, 15, 36, 5, 4, 2, 71, 5907, 19, 4, 2, 7, 14, 2, 604, 10, 10, 8, 30, 813, 6334, 6726, 9, 4, 243, 7, 2267, 25, 238, 1467, 142, 44, 5, 95, 25, 106, 14, 2, 2607, 24, 64, 11, 257, 85, 21, 11, 41, 2, 4, 1652, 7, 1831, 13, 104, 18, 32, 7, 27, 4021, 18, 278, 5, 4, 141, 15, 15, 109, 16, 87, 6, 8654, 2510, 8224, 1631, 37, 122, 24, 124, 1092, 13, 104, 2107, 16, 321, 11, 14, 217, 371, 6, 55, 2250, 7700, 93, 1444, 34, 4, 6336, 15, 29, 4362, 13, 115, 421, 2, 16, 6, 6874, 11, 11, 192, 50

<tf.Tensor: shape=(1, 1111, 32), dtype=float32, numpy=
array([[[-0.02225154, -0.01910671, -0.00807174, ...,  0.01181003,
          0.00914836, -0.0241724 ],
        [-0.01185092,  0.02366196,  0.02562003, ..., -0.00928037,
         -0.00092162,  0.03270018],
        [ 0.00424673, -0.0022311 ,  0.02215285, ..., -0.0414647 ,
         -0.01463621,  0.00365248],
        ...,
        [-0.01165631,  0.04402244, -0.03875955, ..., -0.0413723 ,
          0.03055468,  0.01112676],
        [-0.02204846,  0.02307537,  0.02998001, ..., -0.03872301,
         -0.03032658,  0.01996224],
        [ 0.04009536,  0.02960226, -0.01340489, ...,  0.03032771,
          0.02864926, -0.03770702]]], dtype=float32)>

# Model 1 : Naive Bayes'

In [6]:
# Decode the training and test datasets
x_train_text = [decode_review(review) for review in x_train]
x_test_text = [decode_review(review) for review in x_test]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Create tokenization and modelling pipeline
model_1 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_1.fit(x_train_text,y_train)

In [None]:
# Make Predictions
baseline_preds = model_1.predict(x_test_text)
baseline_preds[:20]



array([0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0])

In [None]:
# Select a sample review (let's take one from the test set for example)
sample_review = x_test[1]
sample_review_text = decode_review(sample_review)

print(f"Sample Review Text: \n{sample_review_text}\n","class: ",{y_test[1]})

# Predict the sentiment of the sample review
sample_prediction = model_0.predict([sample_review_text])
print(f"Predicted Sentiment: {'Positive' if sample_prediction[0] == 1 else 'Negative'}")

Sample Review Text: 
<START> as you world's is quite br mankind most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this consequence <UNK> of interplay storytelling being nasty not of you warren in is failed club i i of films pay so sequences <UNK> film okay uses to received <UNK> if time done for room sugar viewer as cartoon of gives to forgettable br be because many these of reflection sugar contained gives it wreck scene to more was two when had find as you another it of themselves probably who interplay storytelling if itself by br about 1950's films not would effects that her box to miike for if hero close seek end is very together movie of wheel got say kong sugar fred close bore there is playing lot of <UNK> pan place trilogy of lacks br of their time much this men as on it is telling program br silliness okay <UNK> to frustration at corner <UNK> she of sequences to political clearly in of drugs keep guy i i was t

# Evaluation function

In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [None]:
# Get baseline results
baseline_results = calculate_results(y_true=y_test,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 83.848,
 'precision': 0.8399644774644901,
 'recall': 0.83848,
 'f1': 0.8383034853580418}

# Model 2 : LSTM

## Text Vectorizer

In [9]:
# Find the average number of tokens (words) in the training sets
round(sum([len(i.split()) for i in x_train_text])/len(x_train_text))

239

In [10]:
from tensorflow.keras.layers import TextVectorization
text_vectorizer = TextVectorization(max_tokens = 10000,
                                    standardize="lower_and_strip_punctuation",

                                    split = "whitespace",ngrams = None,
                                    output_mode ="int",
                                    output_sequence_length = 239)

In [11]:
text_vectorizer.adapt(x_train_text)

In [None]:
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,),dtype = "string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
x = layers.LSTM(64,return_sequences=True)(x)
print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
x = layers.Dense(64,activation = "relu")(x)
print(x.shape)
outputs = layers.Dense(1,activation = "sigmoid")(x)
print(outputs.shape)
model_2 = tf.keras.Model(inputs,outputs,name = "model_2_LSTM")

(None, 239, 32)
(None, 239, 64)
(None, 64)
(None, 64)
(None, 1)


In [None]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 239)               0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 239, 32)           320000    
                                                                 
 lstm_4 (LSTM)               (None, 239, 64)           24832     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                      

In [None]:
model_2.compile(loss="binary_crossentropy",optimizer=tf.keras.optimizers.Adam(),metrics=["accuracy"])

In [14]:
x_train_tensor = tf.convert_to_tensor(x_train_text,dtype = tf.string)


In [None]:
model_2.fit(x_train_tensor, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f793f4132e0>

# Model 3 : Convo1D

In [12]:
tf.random.set_seed(42)
from tensorflow.keras import layers
model_3_embedding = layers.Embedding(input_dim = 10000,
                                   embeddings_initializer = "uniform",
                                   output_dim = 128, input_length = 239)
inputs = layers.Input(shape = (1,),dtype = "string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
x = layers.Conv1D(filters = 32,kernel_size = 5, activation = "relu")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1,activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs,outputs,name = "model_3_Conv1D")
model_3.compile(loss="binary_crossentropy",optimizer=tf.keras.optimizers.Adam(),metrics=["accuracy"])


In [None]:
model_3.summary()

Model: "model_3_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 239)               0         
 ctorization)                                                    
                                                                 
 embedding_3 (Embedding)     (None, 239, 128)          1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 235, 32)           20512     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 32)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_1 (Dense)             (None, 1)              

In [15]:
model_3_history = model_3.fit(x_train_tensor, y_train, epochs=5,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
x_test_tensor = tf.convert_to_tensor(x_test_text,dtype = tf.string)


In [None]:
# Make Predictions
model_3_preds = model_3.predict(x_test_tensor)






In [None]:
model_3_preds[:10]

array([[3.6922891e-02],
       [9.9993908e-01],
       [9.9032623e-01],
       [6.4179882e-02],
       [9.9987173e-01],
       [6.0151058e-01],
       [6.9247141e-02],
       [1.5650665e-05],
       [9.9982625e-01],
       [7.3035580e-01]], dtype=float32)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results




In [None]:
model_3_preds = np.where(model_3_preds > 0.5, 1, 0).flatten()

# Get baseline results
model_3_results = calculate_results(y_true=y_test, y_pred=model_3_preds)
print(model_3_results)

{'accuracy': 86.3, 'precision': 0.8632395428206834, 'recall': 0.863, 'f1': 0.8629774096884346}


In [23]:
import random
def decode_label(label):
    return 'Positive' if label == 1 else 'Negative'

random_index = random.randint(0, len(x_test) - 1)
sample_review = x_test[random_index]
sample_review_text = decode_review(sample_review)


actual_class = decode_label(y_test[random_index])

print(f"Sample Review Text: \n{sample_review_text}\n", "Class: ", actual_class)

# Prepare the sample review as a batch of one
sample_tensor = tf.convert_to_tensor([sample_review_text], dtype=tf.string)

# Predict the sentiment of the sample review
sample_prediction = model_3.predict(sample_tensor)
predicted_class = 'Positive' if sample_prediction[0] > 0.5 else 'Negative'

print(f"Predicted Sentiment: {predicted_class}")


Sample Review Text: 
<START> writing display on not so master material final own that material is among some br didn't was one of arrived to of 1970 another <UNK> it otherwise was least of on actors gore to me in season shame in start when that with has was halloween has often of material to one he's me in joel that that <UNK> or endure cinematic <UNK> in is hungry br <UNK> an 7 keep approached large to abuse who <UNK> like it of because michael <UNK> v ex conclusion <UNK> this of incredibly hot portrayal as fourth well 1 of <UNK> he offensive who <UNK> all big conflict time very movies <UNK> geek nonexistent like it is him driving to t
 Class:  Negative
Predicted Sentiment: Positive


In [29]:
import random
sample_review_text = " The movie was bad didn't like it "

print(f"Sample Review Text: \n{sample_review_text}\n")

# Prepare the sample review as a batch of one
sample_tensor = tf.convert_to_tensor([sample_review_text], dtype=tf.string)

# Predict the sentiment of the sample review
sample_prediction = model_3.predict(sample_tensor)
predicted_class = 'Positive' if sample_prediction[0] > 0.5 else 'Negative'

print(f"Predicted Sentiment: {predicted_class}")


Sample Review Text: 
 The movie was bad didn't like it 

Predicted Sentiment: Negative
