In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [37]:
data = pd.read_csv("IMDB Dataset.csv")
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [38]:
# Preprocessing of data:
# Convert all to lower cases 
# Remove special characters from the comments.
# Check and remove null values

data['review'] = data['review'].str.lower()
data['review'] = data['review'].replace(r'[^0-9a-z\s]', '', regex= True)
print(data['review'])

# do the same with the sentiments:
data['sentiment'] = data['sentiment'].str.lower()
data['sentiment'] = data['sentiment'].replace(r'[^0-9a-z\s]', '', regex= True)
data['sentiment']
print(data['sentiment'])

data.shape



0        one of the other reviewers has mentioned that ...
1        a wonderful little production br br the filmin...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object
0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object


(50000, 2)

In [39]:
data['sentiment_num'] = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
data['sentiment_num']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment_num, Length: 50000, dtype: int64

In [40]:
# Time to convert the text data to numbers using 
# Tokenizer: to break sentences as token and 
# Padding: to align input vector in same size 



max_token = 5000 #Sets the maximum number of words to keep in the tokenizer
max_token_len = 200 # Defines the fixed length for each input sequence after padding

tokenizer = Tokenizer(num_words = max_token)
tokenizer.fit_on_texts(data['review'])
tokenized_features = tokenizer.texts_to_sequences(data['review'])



X = pad_sequences(tokenized_features, maxlen = max_token_len)
y = data['sentiment_num']

In [41]:
# time to perform train and test split since data is there in X and y now

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

Simple feed forward netwrok: (DNN)

In [None]:
# Model architecting:
# 1. Model architure
# 2. Model complie
# 3. Model train
# 4. Model evaluate
# 5. Model prediction
''' 
# \\results are following: 
# Test loss: 0.26617515087127686
# Test accuracy: 0.8896999955177307

"""Review: Good music, Good graphics, Ok Ok story line and actions too voilence for kids.
Sentiment: Negative
Confidence: 0.6246

Review: Greate music graphics, Super story line and actions. was easy to understand and enjoyed alot
Sentiment: Positive
Confidence: 0.8366"""

 
------------------------------------------------------------
Classification Report:
------------------------------------------------------------
              precision    recall  f1-score   support

    Negative       0.89      0.89      0.89      4961
    Positive       0.89      0.89      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

'''


model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=max_token, output_dim=128, input_length=max_token_len))
model.add(tf.keras.layers.GlobalAveragePooling1D())  # Converts 3D output to 2D for Dense layer
model.add(tf.keras.layers.Dense(units= 128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(units= 64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(units= 1, activation='sigmoid'))

model.summary()



LSTM Model:

In [None]:
'''
Test loss: 0.6931105852127075
Test accuracy: 0.503600001335144

Review: Good music, Good graphics, Ok Ok story line and actions too voilence for kids
Sentiment: Positive
Confidence: 0.5100

Review: Greate music graphics, Super story line and actions. was easy to understand and enjoyed alot
Sentiment: Positive
Confidence: 0.7052

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
              precision    recall  f1-score   support

    Negative       0.87      0.92      0.89      4961
    Positive       0.91      0.87      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


'''


model1 = tf.keras.models.Sequential()

model1.add(tf.keras.layers.Embedding(input_dim = max_token, output_dim=128))
model1.add(tf.keras.layers.LSTM(units = 128, return_sequences=False))
model1.add(tf.keras.layers.Dense(units = 128, activation= 'relu'))
model1.add(tf.keras.layers.Dropout(0.2))
model1.add(tf.keras.layers.Dense(units= 64, activation='relu'))
model1.add(tf.keras.layers.Dense(units = 1, activation='sigmoid'))
model1.summary()



In [43]:
# Compile:
# model.compile(optimizer= 'adam', loss='binary_crossentropy', metrics=['accuracy', 'precision', 'recall'] )

model1.compile(optimizer= 'adam', loss='binary_crossentropy', metrics=['accuracy', 'precision', 'recall'] )

In [44]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights = True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2,min_lr=0.0001, verbose=1)
]

# history= model.fit(X_train, y_train, epochs= 10, batch_size= 32, callbacks= callbacks, validation_data = (X_test, y_test))

history1= model1.fit(X_train, y_train, epochs= 10, batch_size= 32, callbacks= callbacks, validation_data = (X_test, y_test))

# score= model.evaluate(X_test, y_test, verbose=1)
score= model1.evaluate(X_test, y_test, verbose=1)

print('Test loss:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m455s[0m 357ms/step - accuracy: 0.8234 - loss: 0.3985 - precision: 0.8299 - recall: 0.8127 - val_accuracy: 0.8727 - val_loss: 0.3073 - val_precision: 0.8323 - val_recall: 0.9359 - learning_rate: 0.0010
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 304ms/step - accuracy: 0.8927 - loss: 0.2695 - precision: 0.8902 - recall: 0.8954 - val_accuracy: 0.8916 - val_loss: 0.2719 - val_precision: 0.9129 - val_recall: 0.8676 - learning_rate: 0.0010
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 325ms/step - accuracy: 0.9173 - loss: 0.2183 - precision: 0.9141 - recall: 0.9208 - val_accuracy: 0.8792 - val_loss: 0.2887 - val_precision: 0.9078 - val_recall: 0.8462 - learning_rate: 0.0010
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step - accuracy: 0.9338 - loss: 0.1760 - precision: 0.9308 - recall: 0.9366
Epoch 4: ReduceLR

In [48]:
# Making predictions on new reviews
new_reviews = [
    "Good music, Good graphics, Ok Ok story line and actions too voilence for kids",
    "Greate music graphics, Super story line and actions. was easy to understand and enjoyed alot"
]

# Preprocess the new reviews
new_reviews_lower = [review.lower() for review in new_reviews]
import re
new_reviews_clean = [re.sub(r'[^0-9a-z\s]', '', review) for review in new_reviews_lower]

# Tokenize and pad the new reviews
new_reviews_tokenized = tokenizer.texts_to_sequences(new_reviews_clean)
new_reviews_padded = pad_sequences(new_reviews_tokenized, maxlen=max_token_len)

# Make predictions
# predictions = model.predict(new_reviews_padded, verbose=0)

predictions = model1.predict(new_reviews_padded, verbose=0)

# Display results
for i, review in enumerate(new_reviews):
    sentiment = "Positive" if predictions[i][0] > 0.5 else "Negative"
    confidence = predictions[i][0] if predictions[i][0] > 0.5 else 1 - predictions[i][0]
    print(f"\nReview: {review}")
    print(f"Sentiment: {sentiment}")
    print(f"Confidence: {confidence:.4f}")


Review: Good music, Good graphics, Ok Ok story line and actions too voilence for kids
Sentiment: Positive
Confidence: 0.5100

Review: Greate music graphics, Super story line and actions. was easy to understand and enjoyed alot
Sentiment: Positive
Confidence: 0.7052


In [49]:
# Model evaluation:
# Essential: Classification Report 
# Steps: 
# 1. Make prediction on test data
# 2. Give this prediction and real test values to the method
# 3. Generate the metrics, interpret it


#1. Make prediction and get te value of prediction:




# y_predition_probability = model.predict(X_test, verbose=0)
y_predition_probability = model1.predict(X_test, verbose=0)

y_prediction = (y_predition_probability > 0.5).astype(int).flatten()

# Classification Report:
print('-' * 60)
print('Classification Report:')
print('-' * 60)
print(classification_report(y_test, y_prediction, target_names= ['Negative', 'Positive']))

------------------------------------------------------------
Classification Report:
------------------------------------------------------------
              precision    recall  f1-score   support

    Negative       0.87      0.92      0.89      4961
    Positive       0.91      0.87      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [None]:
'''
┌─────────────────────────────────────────────────────────┐
│                    MODEL COMPARISON                      │
├─────────────────────────────────────────────────────────┤
│                                                          │
│  DNN (Pooling)              LSTM                        │
│  ├─ Accuracy: 89.0%        ├─ Accuracy: 89.2% ⭐      │
│  ├─ Speed: ⚡⚡⚡⚡⚡          ├─ Speed: ⚡⚡              │
│  ├─ Balanced: ✅✅          ├─ Context: ✅✅✅          │
│  └─ Simple: ✅✅✅           └─ Complex: ✅             │
│                                                          │
│  Winner: Speed            Winner: Accuracy              │
│                                                          │
└─────────────────────────────────────────────────────────┘
'''