In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
#load dataset
max_features = 10000
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=max_features)

In [3]:
index = keras.datasets.imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()])
x_train_decoded = [" ".join( [reverse_index.get(i - 3, "#") for i in item]) for item in x_train]
x_test_decoded = [" ".join( [reverse_index.get(i - 3, "#") for i in item]) for item in x_test]

In [4]:
#preprocess dataset
#tokenizer = Tokenizer(num_words=max_features)

#no need because already tokenized
# tokenizer.fit_on_texts(x_train)
# x_train = tokenizer.texts_to_sequences(x_train)
# x_test = tokenizer.texts_to_sequences(x_test)

#add pad sequences
max_len = 200
x_train = pad_sequences(x_train, padding='post', maxlen=max_len)
x_test = pad_sequences(x_test, padding='post', maxlen=max_len)

In [5]:
#build the model
model = keras.Sequential([
    keras.layers.Embedding(max_features, 128, input_length=max_len),
    keras.layers.Conv1D(64, 5, activation='relu'),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid')
])

2023-04-21 09:28:57.658034: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [6]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          1280000   
                                                                 
 conv1d (Conv1D)             (None, 196, 64)           41024     
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                        

In [7]:
#compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [8]:
#train the model
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6be0259270>

In [9]:
#evaluate the model
model.evaluate(x_test, y_test)



[0.5728510022163391, 0.8838000297546387]

In [None]:
#lets test on test set
predicted = model.predict(x_test)

In [None]:
predicted_output = list(map(lambda x: 'Negative' if x < 0.5 else 'Positive', list(predicted)))
test_index = 11
print(f'Testing on: {x_test_decoded[test_index]}')
print(f'Predicted: {predicted_output[test_index]}')
print(f'Actual: {"Negative" if y_test[test_index] == 0 else "Positive"}')

In [None]:
# Make predictions on new data
#comment and uncomment texts variable below to see results. Above is a negative review and below is positive.

#texts = ["I found this movie to be incredibly boring and uneventful. The acting was wooden and the plot was predictable. I wouldn't recommend it to anyone."]
texts = ["I absolutely loved this movie! The acting was top-notch and the story was engaging from beginning to end. I would highly recommend it to anyone."]

#The 'index' dictionary has keys that are words and values that are integer indices. By default, 
#the integer indices start from 1, so you can add 3 to each index to get the actual index used in the 
#IMDB dataset (1 is reserved for padding, 2 for the start of a sequence, and 3 for unknown words).

sequences = [np.array([(index.get(word, 0) + 3) for word in text.lower().split()]) for text in texts]
data = pad_sequences(sequences, padding='post', maxlen=max_len)

y = model.predict(data)

print(y) #closer to 0 means negative and closer to 1 means positive