## Imports

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

from sklearn.model_selection import train_test_split

## Download IMDB dataset

In [None]:
# Fetch the IMDB reviews training split along with the dataset metadata,
# then materialise every example into a DataFrame.
dataset, info = tfds.load(name='imdb_reviews', split='train', with_info=True)
df = tfds.as_dataframe(ds=dataset.take(-1), ds_info=info)  # take(-1) keeps all examples

# Show the first few rows as the cell output.
df.head()

## Hyperparameters

In [None]:
# Tokenizer / embedding vocabulary budget.
VOCAB_SIZE = 5_000
# Every review is padded or truncated to this many tokens.
SEQ_LEN = 100
# Width of each word embedding vector.
EMB_DIM = 50
# Passes over the training data.
EPOCHS = 10
# Samples per gradient update.
BATCH_SIZE = 32

## Preprocess Dataset

In [None]:
labels = df['label'].values

# String features loaded through TFDS come back as bytes objects. Calling
# str() on bytes produces the literal repr ("b'...'" with escaped quotes and
# \x.. sequences), which would leak junk tokens into the vocabulary, so decode
# bytes to real text and only fall back to str() for non-bytes values.
reviews = [
    text.decode('utf-8') if isinstance(text, bytes) else str(text)
    for text in df['text']
]

tk = Tokenizer(num_words=VOCAB_SIZE)
tk.fit_on_texts(reviews)  # build the word -> index mapping

sequences = tk.texts_to_sequences(reviews)  # convert words to their indices
x = pad_sequences(sequences, maxlen=SEQ_LEN, padding='post')  # pad/truncate every sample to SEQ_LEN
y = labels.reshape(-1, 1)  # reshape from 1D array to 2D column array

## Split train and validation data

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = splits
print(*(part.shape for part in splits))

## Build model

In [None]:
# Binary sentiment classifier: embedding -> LSTM -> two dense layers -> sigmoid.
model = Sequential()

# The inputs are post-padded with zeros up to SEQ_LEN. mask_zero=True makes
# the Embedding emit a mask for index 0 so the LSTM skips the padded steps
# instead of running over them after the real text has ended.
model.add(Embedding(VOCAB_SIZE, EMB_DIM, input_length=SEQ_LEN, mask_zero=True))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single probability output

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

## Train model

In [None]:
# Train on the training split and report validation metrics after every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
)

## Preprocess test dataset

In [None]:
test_dataset, info = tfds.load('imdb_reviews', split='test', with_info=True)
test_df = tfds.as_dataframe(test_dataset.take(-1), info)

test_labels = test_df['label'].values

# String features loaded through TFDS come back as bytes objects. Calling
# str() on bytes produces the literal repr ("b'...'" with escaped quotes and
# \x.. sequences) rather than the review text, so decode bytes to real text
# and only fall back to str() for non-bytes values.
test_reviews = [
    text.decode('utf-8') if isinstance(text, bytes) else str(text)
    for text in test_df['text']
]

# Reuse the tokenizer fitted on the training reviews; it is not refitted here.
test_sequences = tk.texts_to_sequences(test_reviews)  # convert words to their indices
x_test = pad_sequences(test_sequences, maxlen=SEQ_LEN, padding='post')  # pad/truncate every sample to SEQ_LEN
y_test = test_labels.reshape(-1, 1)  # reshape from 1D array to 2D column array

x_test.shape, y_test.shape

## Evaluate model on test dataset

In [None]:
# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(x=x_test, y=y_test)

# Accuracy is reported as a percentage.
accuracy_pct = accuracy * 100
print("Test loss: {} | Test Accuracy: {}%".format(loss, accuracy_pct))

## Predict on custom text

In [None]:
# Interactive loop: read a line of text, score it with the trained model and
# print the predicted sentiment until the user enters 'q'.
while True:
    user_text = input("\nEnter your text here; q to quit : ")
    if user_text.strip() == 'q':  # tolerate stray whitespace around the quit command
        break

    user_tokens = tk.texts_to_sequences([user_text])
    if not user_tokens[0]:
        # Nothing in the input maps to a vocabulary index, so the model would
        # only see padding and the score would carry no information.
        print('No known words in the input; unable to score it.')
        continue

    user_sequence = pad_sequences(user_tokens, maxlen=SEQ_LEN, padding='post')

    # predict() returns one row per sample with a single sigmoid output.
    prediction = model.predict(user_sequence)[0][0]

    sentiment = 'POSITIVE' if prediction >= 0.5 else 'NEGATIVE'

    print('Sentiment: {} | Score: {}'.format(sentiment, prediction))