In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the IMDB reviews CSV from the working directory, then show the
# column summary followed by the first five rows.
data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
data.info()
data.head(n=5)

In [None]:
# Preprocessing of data:
# 1. Drop rows whose review or sentiment is missing.
# 2. Convert all text to lower case.
# 3. Replace HTML line-break tags (<br>, <br/>, <br />) with a space.
# 4. Remove every remaining character that is not a digit, a-z or whitespace.


def _normalize_text(text):
    """Return a cleaned copy of a pandas Series of strings.

    The text is lower-cased, HTML line-break tags are replaced by a single
    space, and any character other than 0-9, a-z or whitespace is removed.

    Line breaks are handled *before* the punctuation filter: otherwise
    "<br />" loses only its "<", "/" and ">" and leaves a stray "br" word
    in the review text.
    """
    lowered = text.str.lower()
    without_breaks = lowered.str.replace(r'<br\s*/?>', ' ', regex=True)
    return without_breaks.str.replace(r'[^0-9a-z\s]', '', regex=True)


# Rows without a review or a label cannot be used for training.
data = data.dropna(subset=['review', 'sentiment']).reset_index(drop=True)

data['review'] = _normalize_text(data['review'])
# Apply the same normalisation to the sentiment labels.
data['sentiment'] = _normalize_text(data['sentiment'])

print(data['review'])
print(data['sentiment'])

data.shape



0        one of the other reviewers has mentioned that ...
1        a wonderful little production br br the filmin...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object
0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object


(50000, 2)

In [None]:
# Encode the label as an integer: 1 where the sentiment is exactly
# 'positive', 0 for every other value.
data['sentiment_num'] = data['sentiment'].eq('positive').astype('int64')
data['sentiment_num']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment_num, Length: 50000, dtype: int64

In [None]:
# Convert the review text to numbers:
#   Tokenizer     - breaks each review into word tokens mapped to integer ids
#   pad_sequences - brings every id sequence to the same length

max_token = 5000      # maximum number of words the tokenizer keeps
max_token_len = 200   # fixed length of each input sequence after padding

# Labels: the integer-encoded sentiment column.
y = data['sentiment_num']

# Features: fit the vocabulary on the reviews, then encode them as id lists.
tokenizer = Tokenizer(num_words=max_token)
tokenizer.fit_on_texts(data['review'])
tokenized_features = tokenizer.texts_to_sequences(data['review'])

# One row per review, max_token_len columns.
X = pad_sequences(tokenized_features, maxlen=max_token_len)

In [None]:
# X and y are ready: hold out 20% of the rows as the test set.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model workflow:
# 1. Model architecture
# 2. Model compile
# 3. Model train
# 4. Model evaluate
# 5. Model prediction

model = tf.keras.models.Sequential([
    # Explicit input: max_token_len token ids per review. Declaring the shape
    # here (rather than through Embedding's deprecated `input_length`
    # argument) builds the model immediately, so model.summary() can report
    # output shapes and parameter counts.
    tf.keras.Input(shape=(max_token_len,)),
    # Learn a 128-dimensional vector for each of the max_token word ids.
    tf.keras.layers.Embedding(input_dim=max_token, output_dim=128),
    # Average over the sequence axis: converts 3D output to 2D for Dense.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # Single sigmoid unit: a value in [0, 1] for the binary sentiment label.
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model.summary()



In [None]:
# Compile: Adam optimizer with binary cross-entropy loss for the single
# sigmoid output.
# Precision and recall are passed as metric objects rather than the strings
# 'precision' / 'recall': the string aliases are not resolved by every
# TensorFlow/Keras release, whereas the classes are. The explicit names keep
# the reported metric keys stable even if this cell is re-run.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)

In [None]:
# Carve a validation set out of the training data. The callbacks below pick
# the stopping epoch, the restored weights and the learning rate from
# `val_loss`; monitoring the test set for that would tune the model on the
# very data used for the final score and make that score optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

callbacks = [
    # Stop once val_loss has not improved for 3 epochs and roll back to the
    # best weights seen.
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1),
    # Multiply the learning rate by 0.2 after 2 epochs without val_loss
    # improvement, never going below 0.0001.
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001, verbose=1)
]

history = model.fit(
    X_fit,
    y_fit,
    epochs=10,
    batch_size=32,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    verbose=1,
)

# The test set is touched only here, after training has finished.
score = model.evaluate(X_test, y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

: 