In [45]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model

In [46]:
# The data is loaded and preprocessed by the load_dataset function in cell BNRR3WMjHxMi.
# This cell is not needed.
# data = pd.read_csv('/content/IMDB Dataset.csv')
# print(data)

In [47]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
english_stops = set(stopwords.words('english'))

In [49]:
def load_dataset():
    df = pd.read_csv('/content/IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


In [50]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
38992    [as, teenager, i, pretty, whole, bigfoot, thin...
27259    [as, much, i, like, walter, matthau, i, felt, ...
37267    [soon, americans, would, swarm, darkened, dama...
27027    [this, perfect, example, mainstream, horror, c...
2346     [water, lilies, well, made, first, film, franc...
                               ...                        
37205    [i, knew, going, awful, awful, one, boring, mo...
46866    [plot, movie, revolves, around, submarine, bui...
21456    [this, minute, documentary, bu, uel, made, ear...
38939    [i, remember, seeing, early, uk, tv, hooked, t...
48278    [elvis, presley, plays, half, breed, native, a...
Name: review, Length: 40000, dtype: object 

29282    [this, movie, starts, somewhat, slowly, gets, ...
6082     [the, worst, movie, i, seen, quite, interestin...
12994    [the, supposed, writer, director, mr, dhawan, ...
42046    [the, plot, movie, dumb, bag, hair, jimmy, smi...
23303    [jealous, husband, holds, car, dealership, hos...
 

In [51]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [52]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[  108  2070     1 ...  2095  8310    40]
 [  108    17     1 ...     0     0     0]
 [  442  1379    12 ... 57021  2088  4652]
 ...
 [    8   706   525 ...     0     0     0]
 [    1   292   223 ...     0     0     0]
 [ 4106 12387   206 ...     0     0     0]] 

Encoded X Test
 [[   8    3  435 ...    0    0    0]
 [   2  157    3 ...    0    0    0]
 [   2  345  458 ...    0    0    0]
 ...
 [  50  900  684 ... 8131  615  169]
 [ 407  285  133 ...    0    0    0]
 [   2   65  698 ...    0    0    0]] 

Maximum review length:  130


In [53]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())



None


In [54]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [55]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step - accuracy: 0.5505 - loss: 0.6717
Epoch 1: accuracy improved from -inf to 0.59915, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 247ms/step - accuracy: 0.5506 - loss: 0.6716
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step - accuracy: 0.6482 - loss: 0.6240
Epoch 2: accuracy improved from 0.59915 to 0.66712, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 249ms/step - accuracy: 0.6482 - loss: 0.6239
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step - accuracy: 0.7745 - loss: 0.5257
Epoch 3: accuracy improved from 0.66712 to 0.76232, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 251ms/step - accuracy: 0.7745 - loss: 0.5258
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step - accuracy: 0.6776 - loss: 0.6057
Epoch 4: accuracy did not improve from 0.76232
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 247ms/step - accuracy: 0.6776 - loss: 0.6058
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step - accuracy: 0.6946 - loss: 0.5908
Epoch 5: accuracy did not improve from 0.76232
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 248ms/step - accuracy: 0.6946 - loss: 0.5907


<keras.src.callbacks.history.History at 0x77fb76c08ad0>

In [56]:
import numpy as np

# Get prediction probabilities
y_pred_prob = model.predict(x_test, batch_size=128)

# Convert probabilities to class labels (0 or 1)
y_pred = (y_pred_prob > 0.5).astype("int32")

true = np.sum(y_pred.reshape(-1) == y_test)

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {:.2f}%'.format(true / len(y_pred) * 100))


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 91ms/step
Correct Prediction: 7968
Wrong Prediction: 2032
Accuracy: 79.68%


In [57]:
loaded_model = load_model('models/LSTM.h5')



In [58]:
review = str(input('Movie Review: '))

Movie Review: The movie is very nice and good


In [59]:
# Pre-process input
import re

regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  The movie is very nice and good
Filtered:  ['the movie nice good']


In [60]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[  2   3 241   9   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


In [61]:
result = loaded_model.predict(tokenize_words)
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
[[0.9186909]]


In [62]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive


In [63]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True