In [12]:
from pandas import read_csv
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, LSTM
from tensorflow.keras import Input
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import math
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## Split the Data

In [13]:
def prepareData(dir):
    """Load a labelled text dataset from a directory and strip `<br />` tags.

    Args:
        dir: Path handed to `text_dataset_from_directory`.

    Returns:
        The dataset of (text, label) pairs with every literal `<br />`
        in the text replaced by a single space. Labels pass through
        unchanged.
    """
    data = text_dataset_from_directory(dir)
    return data.map(
        lambda text, label: (regex_replace(text, '<br />', ' '), label)
    )


# Load each split exactly once, already cleaned. Loading the raw
# directories first and then overwriting them with the cleaned versions
# would scan every file on disk twice for no benefit.
train_data = prepareData("./movie-reviews-dataset/train")
test_data = prepareData("./movie-reviews-dataset/test")

# Sanity check: show the first review and its label from one training batch.
for text_batch, label_batch in train_data.take(1):
    print(text_batch.numpy()[0])
    print(label_batch.numpy()[0])



Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
b'Witty. Quirky. Genuine. Surreal. Butterfly wings? One could ask what all of these words best describe, and some (those in fuse with the international film community) may quickly say Happenstance, but others may jump aboard the more American train and immediately yell, The Butterfly Effect. Strangely, I would be one of those screaming for that sci-fi Kutcher film mainly because none of those words that I initially mentioned at the start of this paragraph accurately depicts the Tautou feature that I witnessed. Sure, we all loved her in Amelie and thought she was the daughter of Jesus in The Da Vinci Code, but in this film first-time director (of a feature film at least) Laurent Firode doesn\'t give Tautou the opportunity to shine. Sadly, he gives nobody the opportunity to really demonstrate themselves because he is too d

In [14]:
# Start the network with an input that accepts one raw string per sample;
# the remaining layers are appended in the cells that follow.
model = Sequential([Input(shape=(1,), dtype="string")])

### Text Vectorization

Our first layer will be a `TextVectorization` layer, which converts our string input into a sequence of integers, each one representing a token.

In [15]:
max_tokens = 1000  # vocabulary size cap for the vectorizer
max_len = 100      # every review is padded / truncated to this many tokens

# Words that are not among the most common ones kept in the vocabulary
# all map to the same "out of vocabulary" (OOV) token.
vectorize_layer = TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_len,
)

# The vocabulary is learned from the review text alone, so drop the labels.
train_texts = train_data.map(lambda review, _label: review)
vectorize_layer.adapt(train_texts)

model.add(vectorize_layer)

### Embedding

Our next layer will be the Embedding layer, which turns the integers produced by the previous layer into fixed-length vectors.

In [16]:
# The table has max_tokens + 1 rows, one more than the vectorizer's
# vocabulary cap, so every token id it can emit (including the
# out-of-vocabulary id) has a row. Each token becomes a 128-dim vector.
model.add(Embedding(input_dim=max_tokens + 1, output_dim=128))

### The Recurrent Layer

 The `64` passed to `LSTM` is the `units` parameter, which is the
 dimensionality of the layer's output space.

In [17]:
# Recurrent layer: reads the embedded sequence and emits a 64-dim summary.
model.add(LSTM(units=64))

### Wrapping Up

To finish off our network, we’ll add a standard fully-connected (Dense) layer and an output layer with sigmoid activation:

The sigmoid activation outputs a number between 0 and 1, which is perfect for our problem - 0 represents a negative review, and 1 represents a positive one.



In [18]:
# Hidden fully-connected layer.
model.add(Dense(units=64, activation="relu"))
# Single sigmoid unit: the output is a score between 0 and 1.
model.add(Dense(units=1, activation="sigmoid"))

### Compiling the model

Before we can begin training, we need to configure the training process. We decide a few key factors during the compilation step, including:

 * `The optimizer`: We’ll stick with a pretty good default: the `Adam gradient-based optimizer`. Keras has many other optimizers you can look into as well. Optimizers are the algorithms that update the attributes of your neural network, such as its weights, in order to reduce the loss. The optimizer you choose defines how the weights (and the effective learning rate) are adjusted at each training step.

 * `The loss function`: Since we only have 2 output classes (positive and negative), we’ll use the `Binary Cross-Entropy loss`. A loss function measures how different the predicted output is from the expected output. For binary classification problems, binary cross-entropy is the standard loss function. `Epochs` is the number of times the whole training data is used to train the model.

 * `A list of metrics`: Since this is a classification problem, we’ll just have Keras report on the accuracy metric.


In [19]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the only metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

### Train the model

In [20]:
# Ten full passes over the cleaned training set.
model.fit(x=train_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1352063b0>

### Save Model

In [22]:
# Persist only the weights (not the architecture) in TensorFlow format.
model.save_weights(filepath="cnn_h5_tf", save_format="tf")

### Reload Model

In [None]:
# Restore the saved weights into the already-built model.
model.load_weights(filepath="cnn_h5_tf")

In [23]:
# Smoke test: score one clearly positive and one clearly negative review.
for review in (
    # Should print a very high score like 0.98.
    "i loved it! highly recommend it to anyone and everyone looking for a great movie to watch.",
    # Should print a very low score like 0.01.
    "this was awful! i hated it so much, nobody should watch this. the acting was terrible, the music was terrible, overall it was just bad.",
):
    print(model.predict([review]))

[[0.9807688]]
[[0.00792643]]


### Improving our Network

 * **Network Depth**
   What happens if we add more recurrent layers? How does that affect training and/or the model’s final performance?
   ```
    model = Sequential()

    # ...

    # Return the full sequence instead of just the last
    # output of the sequence.
    model.add(LSTM(64, return_sequences=True))

    # This second recurrent layer's input sequence is the
    # output sequence of the previous layer.
    model.add(LSTM(64))

   ```
 * **Dropout**
   What if we incorporated dropout (e.g. via Dropout layers), which is commonly used to prevent overfitting?
   ```
   from tensorflow.keras.layers import Dropout

    model = Sequential()

    # ...

    # Examples of common ways to use dropout below. These
    # parameters are not necessarily the most optimal.
    model.add(LSTM(64, dropout=0.25, recurrent_dropout=0.25))

    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
   ```
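 * **Adjusting `TextVectorization` parameters**
   What happens if we experiment with different values of `max_tokens` and `max_len`?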

 * **Pre-processing**
   All we did to clean our dataset was remove `<br />` markers. There may be other pre-processing steps that would be useful to us. For example:

    *  Removing “useless” tokens (e.g. ones that are extremely common or otherwise not useful)
    
    *  Fixing common misspellings / abbreviations and standardizing slang