In [8]:
# Basic imports
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Import TensorFlow
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, Embedding,LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

%matplotlib inline

# Data Preprocessing

## Setting up the Dataset Parameters

In [2]:
# Vocabulary size: handed to imdb.load_data as num_words and to Embedding as input_dim.
number_of_words = 20_000
# Sequence length handed to pad_sequences as maxlen.
max_len = 100

## Loading the IMDB Dataset

In [3]:
# Load IMDB with the vocabulary capped at number_of_words, then unpack the
# (features, labels) pair of each split.
train_split, test_split = imdb.load_data(num_words=number_of_words)
X_train, y_train = train_split
X_test, y_test = test_split

## Padding all sequences to be the same length

In [4]:
# Bring every review in both splits to the same length (max_len) so they can
# be batched as one 2-D array; both splits go through the identical call.
X_train, X_test = (
    pad_sequences(reviews, maxlen=max_len) for reviews in (X_train, X_test)
)

# Building a Recurrent Neural Network

## Define the Model

In [5]:
# Start from an empty Sequential container; the cells below append layers to it one at a time.
model = Sequential()

## Add the embedding layer
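This layer maps each word index to a dense, trainable word vector. The vectors are stored as the rows of one large matrix with one row per vocabulary word (20,000 rows). Embedding layers are commonly used in NLP.
- output_dim: number of columns in that matrix, i.e. the length of each word vector (128 here)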

In [7]:
# One 128-dimensional vector per vocabulary entry; X_train.shape[1] is the
# padded review length, which fixes the model's input shape.
# NOTE: model.add appends to the existing stack, so rebuild `model` from
# Sequential() before re-running this cell.
embedding_layer = Embedding(
    input_dim=number_of_words,
    output_dim=128,
    input_shape=(X_train.shape[1],),
)
model.add(embedding_layer)

## Add the LSTM Layer
Used to understand the relationships between the different elements of the sequences/words in the reviews. 
- units: 128
- activation: tanh (hyperbolic tangent)

In [9]:
# Recurrent layer with 128 units and tanh activation, stacked on the embedding output.
lstm_layer = LSTM(units=128, activation='tanh')
model.add(lstm_layer)

## Add the output layer
- units: 1 output neuron (Binary output; 0=negative review, 1=positive review)
- activation: sigmoid

In [10]:
# Single sigmoid neuron: one value per review for the binary
# negative (0) / positive (1) label.
output_layer = Dense(units=1, activation='sigmoid')
model.add(output_layer)

## Compile the Model
optimizer= rmsprop (very common in Recurrent Neural Networks) <br>
loss = binary crossentropy (for binary classification) <br>

**Sparse categorical crossentropy is for multi-class classification.**

In [11]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# so that evaluate() later returns it alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [12]:
# Print the layer-by-layer output shapes and parameter counts to sanity-check the architecture.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 128)          2560000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


**Breakdown of Layers:** <br>
Model: Sequential - sequence of layers<br>
Embedding: Layer for word vectorization in a matrix composed of 128 columns.<br>
LSTM: Long Short-Term Memory network; 128 units<br>
Dense: the output layer with 1 unit for binary output.

## Train the Model
epochs = 3 (number of full passes the model makes over the training data)

batch_size = 128 reviews per batch

In [13]:
# Train for 3 passes over the training set in mini-batches of 128 reviews.
# The returned History object is kept (instead of being left as the cell's
# last expression, which only echoed its repr) so the per-epoch loss and
# accuracy in history.history can be inspected or plotted afterwards.
history = model.fit(X_train, y_train, epochs=3, batch_size=128)

Epoch 1/3


2023-04-22 11:12:59.823940: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17d627df0>

## Evaluate the Model

In [14]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)

# Accuracy is reported to four decimal places; the loss is shown unrounded.
rounded_accuracy = round(test_accuracy, 4)
print(f' Test Loss: {test_loss}')
print(f' Test Accuracy: {rounded_accuracy}')

 Test Loss: 0.5630922913551331
 Test Accuracy: 0.8121
