# Import utilities and get the data

In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

Download the IMDb Dataset

In [4]:
# Download the IMDb review archive and unpack it into the current working directory.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The extracted `aclImdb` folder is looked up next to the path returned by get_file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [7]:
# Path of the training split inside the extracted dataset directory.
train_dir = os.path.join(dataset_dir, 'train')
# List the entries of the training folder (shown as the cell output).
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [8]:
# The `unsup` folder is removed so that only the `neg` and `pos` sub-folders
# remain as class directories when the dataset is loaded from `train_dir`.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the deletion so the cell is idempotent: on a second run the folder is
# already gone and an unguarded rmtree would raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

# Create training and validation datasets

In [9]:
# Batch size and shuffle seed shared by both splits. The same seed is passed to
# both calls so the 80/20 training/validation partition is consistent.
batch_size = 1024
seed = 123

# Load from `train_dir` (built from the download location earlier in the
# notebook) instead of a hard-coded relative path, so this cell always reads
# the same folder that was listed and cleaned up above.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


## Explore the data set

In [21]:
# Peek at the first three reviews of one training batch together with their labels.
for text_batch, label in train_ds.take(1):
    for i in range(3):
        print('\n','Text:','\n', text_batch[i], '\n')
        # Index with `i` so each printed label belongs to the review printed
        # just above it (previously the label at index 1 was shown every time).
        print('Labels:', '\n', label[i])


 Text: 
 tf.Tensor(b"Paulie sounds like the most saccharine, lachrymose and sentimental garbage you could ever find, yet it's actually much better than you might expect. The daftness of the plot could so easily have set the tone for the whole film, but actually in most other departments the film is charming.<br /><br />In case you're wondering, Paulie is a parrot. Bought for a little girl with speech difficulties, Paulie becomes her best friend and goes everywhere with her. He even sits on her shoulder during speech therapy lessons, and eventually becomes a super-intelligent speaker himself. However, Paulie is sent away by the little girl's mother and he spends the rest of the film trying to get from N.Y.C to L.A to be re-united with her.<br /><br />So, why does this awful-sounding film succeed relatively well? Firstly, it boasts some interesting and impressive animatronic effects. Secondly (and far more significantly) it has the courage to embrace its ludicrous premise and tells a ge

Notice that the reviews contain raw text, with punctuation and occasional HTML tags such as `<br />`.

In [22]:
# Show which class name each of the two integer labels stands for.
for label_id in (0, 1):
    print("Label", label_id, "corresponds to", train_ds.class_names[label_id])

Label 0 corresponds to neg
Label 1 corresponds to pos


# Prepare the dataset for training

We will standardize, tokenize, and vectorize the data using the helpful [tf.keras.layers.TextVectorization](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization) layer.

In [23]:
def custom_standardization(input_data):
    '''Normalize raw review text before it is tokenized.

    The text is lowercased, every literal ``<br />`` tag is replaced by a
    single space, and all characters in ``string.punctuation`` are removed.

    Args:
        input_data (str, list of str, or string Tensor): Text to normalize.

    Returns:
        Tensor: String tensor holding the normalized text.
    '''
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    # Tags are replaced by a space (not deleted) so adjacent words stay separated.
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [32]:
# Quick check of the standardizer on a single string and on a list of strings.
# Only the last expression of a cell is echoed, so the first result is not shown.
custom_standardization('hola')
custom_standardization(['Hola', 'adios'])

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'hola', b'adios'], dtype=object)>

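Next, we will define a `TextVectorization` layer that applies the custom standardization, limits the vocabulary to `max_features` tokens, and outputs integer sequences of length `sequence_length`.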

In [33]:
# Vocabulary size limit and output sequence length for the vectorization layer.
max_features = 10000
sequence_length = 250

# `TextVectorization` is the class imported from tensorflow.keras.layers at the
# top of the notebook; it is configured to emit integer token ids.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

Next, we will call `adapt` to fit the state of the preprocessing layer to the dataset. This will cause the layer to build a vocabulary that maps strings to integer indices.