In [32]:
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np

from keras.datasets import imdb

import warnings
warnings.filterwarnings("ignore")

In [33]:
# Read csv file
data = pd.read_csv('imdb_reviews.csv')

# Reserves data for train and testing (70:30)
imdb_reviews = pd.DataFrame(np.array(data)[:17500], columns=('Reviews', 'Sentiment'))
test_reviews = pd.DataFrame(np.array(data)[17500:], columns=('Reviews', 'Sentiment'))

imdb_reviews.head()

Unnamed: 0,Reviews,Sentiment
0,<START this film was just brilliant casting lo...,positive
1,<START big hair big boobs bad music and a gian...,negative
2,<START this has to be one of the worst films o...,negative
3,<START the <UNK> <UNK> at storytelling the tra...,positive
4,<START worst mistake of my life br br i picked...,negative


### 1. Preprocessing

In [34]:
# Creating a dictionary of words with associated IDs using the datasets from keras
word_index = imdb.get_word_index()

# Adding new mapping for exclusive/html words
word_index["<PAD>"] = 0
word_index["<START"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

word_index['to']


5

In [35]:
# Function to encode review data using mapping
def review_encoder(text):
    arr = [word_index[word] for word in text]
    return arr

In [36]:
train_data, train_labels = imdb_reviews['Reviews'], imdb_reviews['Sentiment']
test_data, test_labels = test_reviews['Reviews'], test_reviews['Sentiment']


# This separates each 'Review' into an array of words (in both the train and test data)
# It excludes tokenizing the labels since we will handle that separately later
train_data = train_data.apply(lambda review:review.split())
test_data = test_data.apply(lambda review:review.split())

# This applies the review encoder on each array of words (in both the train and test data)
# This is so it encodes each word of that 'Review' row
train_data = train_data.apply(review_encoder)
test_data = test_data.apply(review_encoder)

In [37]:
# Function to encode sentiment data using mapping
def sentiment_encoder(sentiment):
    if sentiment == 'positive':
        return 1
    else:
        return 0

In [38]:
# This applies the sentiment encoder on each label (in both train and test data)
train_labels = train_labels.apply(sentiment_encoder)
test_labels = test_labels.apply(sentiment_encoder)

In [39]:
# Padding the Reviews so that short Reviews have the same size (500)
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding='post', maxlen=500)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding='post', maxlen=500)

## 2. Defining/preparing neural network architecture

In [40]:
model = keras.Sequential([keras.layers.Embedding(10000, 16, input_length=500),   # Word embedding layer
                         keras.layers.GlobalAveragePooling1D(),                 # Global average pooling layer to avoid overfitting by minimizing parameters
                         keras.layers.Dense(16, activation='relu'),             # ReLU dense layer
                         keras.layers.Dense(1, activation='sigmoid')])           # Output layer using sigmoid

In [41]:
# Using adam optimizer, binary cross entropy loss, and accuracy for metrics
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## 3. Training model

In [42]:
# Fitting train data and then testing train
history = model.fit(train_data, train_labels, epochs=30, batch_size=512, validation_data=(test_data, test_labels))

# Accuracy of ~0.87 with current dataset

Epoch 1/30


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5061 - loss: 0.6931 - val_accuracy: 0.5657 - val_loss: 0.6909
Epoch 2/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5498 - loss: 0.6901 - val_accuracy: 0.5687 - val_loss: 0.6864
Epoch 3/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5996 - loss: 0.6837 - val_accuracy: 0.5752 - val_loss: 0.6711
Epoch 4/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6370 - loss: 0.6629 - val_accuracy: 0.6867 - val_loss: 0.6366
Epoch 5/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6777 - loss: 0.6289 - val_accuracy: 0.7449 - val_loss: 0.6006
Epoch 6/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7582 - loss: 0.5886 - val_accuracy: 0.7740 - val_loss: 0.5596
Epoch 7/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━

## 4. Testing model

In [43]:
# Testing accuracy
loss, accuracy = model.evaluate(test_data, test_labels)

[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 622us/step - accuracy: 0.8758 - loss: 0.2915


In [54]:
# Grabbing random user review sample
index = np.random.randint(1, 1000)
user_review = test_reviews.loc[index]
print(user_review)

Reviews      <START superb i had initially thought that giv...
Sentiment                                             positive
Name: 33, dtype: object


In [59]:
# Testing it against model
user_review = test_data[index]
user_review = np.array([user_review])
probability = model.predict(user_review)
if (probability > 0.5).astype('int32'):
    print('positive sentiment with probability of', probability[0][0])
else:
    print('negative sentiment with probability of', 1.0-probability[0][0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
positive sentiment with probability of 0.6631073
