In [2]:
import pandas as pd
import numpy as np

df = pd.DataFrame()
df = pd.read_csv('/content/movie_data.csv', encoding='utf-8')
df.head(5)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [3]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [4]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [5]:
# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(50000,)
(50000,)


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews)

# pad sequences
max_length = 100 # try other options like mean

# define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens =  tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

In [8]:
print(vocab_size)

125602


In [13]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers import Embedding

EMBEDDING_DIM = 100

print('Build model...')

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Build model...
Summary of the built model...


None


In [14]:
print('Train...')

model.fit(X_train_pad, y_train, batch_size=72, epochs=15, validation_data=(X_test_pad, y_test), verbose=2)

Train...
Epoch 1/15
348/348 - 55s - 157ms/step - accuracy: 0.6435 - loss: 1.2583 - val_accuracy: 0.6564 - val_loss: 0.6141
Epoch 2/15
348/348 - 84s - 240ms/step - accuracy: 0.7274 - loss: 0.5379 - val_accuracy: 0.6590 - val_loss: 0.5894
Epoch 3/15
348/348 - 54s - 156ms/step - accuracy: 0.7404 - loss: 76.5174 - val_accuracy: 0.6583 - val_loss: 0.6019
Epoch 4/15
348/348 - 83s - 238ms/step - accuracy: 0.7760 - loss: 0.4736 - val_accuracy: 0.6530 - val_loss: 0.6044
Epoch 5/15
348/348 - 80s - 231ms/step - accuracy: 0.8024 - loss: 1.9543 - val_accuracy: 0.6574 - val_loss: 0.6126
Epoch 6/15
348/348 - 82s - 237ms/step - accuracy: 0.8187 - loss: 0.4005 - val_accuracy: 0.6704 - val_loss: 0.6160
Epoch 7/15
348/348 - 82s - 236ms/step - accuracy: 0.8299 - loss: 0.3731 - val_accuracy: 0.6739 - val_loss: 0.6227
Epoch 8/15
348/348 - 83s - 238ms/step - accuracy: 0.8451 - loss: 0.3613 - val_accuracy: 0.6747 - val_loss: 0.6289
Epoch 9/15
348/348 - 54s - 155ms/step - accuracy: 0.8447 - loss: 24.9978 - val

<keras.src.callbacks.history.History at 0x7a1a035e80a0>

In [15]:
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

print("Accuracy: {0:.2%}".format(acc))

Testing...
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.7171 - loss: 0.6359
Test score: 0.6688843965530396
Test accuracy: 0.6625999808311462
Accuracy: 66.26%


In [16]:
#Let us test some  samples
test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "Good movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not to my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step


array([[0.7192699 ],
       [0.6027677 ],
       [0.4443685 ],
       [0.32521054],
       [0.4584618 ],
       [0.18008453],
       [0.38027057],
       [0.13077027]], dtype=float32)

In [17]:
#let us check how the model predicts
classes = model.predict(X_test_pad[:10], batch_size=128)
for i in range (0,10):
    if(classes[i] > 0.5 and y_test[i] == 1 or (classes[i] <= 0.5 and y_test[i] == 0)):
        print( classes[i], y_test[i], " Right prdiction")
    else :
        print( classes[i], y_test[i], " Wrong prdiction")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
[0.04389637] 1  Wrong prdiction
[0.71267265] 1  Right prdiction
[0.56838304] 1  Right prdiction
[0.6184453] 1  Right prdiction
[0.84978336] 1  Right prdiction
[0.6742373] 1  Right prdiction
[0.32052025] 1  Wrong prdiction
[0.6642556] 1  Right prdiction
[0.7368137] 1  Right prdiction
[0.6172058] 1  Right prdiction


In [19]:
from keras.datasets import imdb
from keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers import Embedding

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

print('Build model...')

model = Sequential()
model.add(Embedding(top_words, 100, input_length=max_words))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Build model...


None


In [20]:
print('Train...')

model.fit(X_train, y_train, batch_size=72, epochs=15, validation_data=(X_test, y_test), verbose=2)

Train...
Epoch 1/15
348/348 - 87s - 250ms/step - accuracy: 0.7567 - loss: 0.5074 - val_accuracy: 0.8281 - val_loss: 0.3921
Epoch 2/15
348/348 - 81s - 233ms/step - accuracy: 0.8422 - loss: 0.3767 - val_accuracy: 0.8345 - val_loss: 0.3820
Epoch 3/15
348/348 - 81s - 232ms/step - accuracy: 0.8551 - loss: 0.3495 - val_accuracy: 0.8482 - val_loss: 0.3629
Epoch 4/15
348/348 - 83s - 237ms/step - accuracy: 0.8707 - loss: 0.3224 - val_accuracy: 0.8526 - val_loss: 0.3569
Epoch 5/15
348/348 - 80s - 230ms/step - accuracy: 0.8823 - loss: 0.2963 - val_accuracy: 0.8513 - val_loss: 0.3677
Epoch 6/15
348/348 - 82s - 236ms/step - accuracy: 0.8841 - loss: 0.2878 - val_accuracy: 0.8549 - val_loss: 0.3631
Epoch 7/15
348/348 - 81s - 232ms/step - accuracy: 0.8979 - loss: 0.2611 - val_accuracy: 0.8612 - val_loss: 0.3536
Epoch 8/15
348/348 - 82s - 237ms/step - accuracy: 0.9042 - loss: 0.2444 - val_accuracy: 0.8478 - val_loss: 0.3882
Epoch 9/15
348/348 - 80s - 229ms/step - accuracy: 0.9086 - loss: 0.2355 - val_a

<keras.src.callbacks.history.History at 0x7a19fc16aa10>

In [21]:
score, acc = model.evaluate(X_test, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: %.2f%%" % (acc*100))

[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 115ms/step - accuracy: 0.8442 - loss: 0.5151
Test score: 0.5048733353614807
Test accuracy: 0.8458799719810486
Accuracy: 84.59%


The time to train a GRU is less than LSTM network.