# Data
Download the IMDB reviews if its not already in the local cache.
Load from the local cache and split to train and test sets.

In [1]:
import os
import ssl
from tensorflow.keras.utils import get_file
try:
    from notebooks.data import load_imdb
except ModuleNotFoundError:
    from data import load_imdb

if (not os.environ.get('PYTHONHTTPSVERIFY', '')
        and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context

zip_file = get_file('aclImdb.zip', origin='http://mng.bz/0tIo', extract=True)
imdb_dir = zip_file.replace('.zip', '')
(train_texts, train_labels), (test_texts, test_labels) = load_imdb(imdb_dir)

print('An example from the reviews:')
print('============================')
print(f'Text : {train_texts[0][:200]}...')
print(f'Label: {train_labels[0]}')

An example from the reviews:
Text : Entertainment Tonight has been going down hill for the last few years, but as of last night (Aug 17th 2006) they reached a new low.<br /><br />In an effort to try to hype up their broadcast, they deci...
Label: neg


# Prepare Data
Initialize a `CharVectorizer`. The vectorizer will be fitted on the 
`train_texts`. We won't use any word tokenizer, so the words will be splitted
on spaces. The vocabulary will have only the characters set on the 
`characters` attribute, the PAD and the OOV token.

In [2]:
from keras_nlp import CharVectorizer

char_vectorizer = CharVectorizer(
    characters='abcdefghijklmnopqrstuvwxyz', oov_token='#')
char_vectorizer.fit_on_texts(train_texts)
# The vectorizer's number of tokens: num_chars + PAD + OOV token
print(f'Vectorizer number of tokens: {len(char_vectorizer.token2id)}')

2020-Apr-09 23:11:55 [INFO    :CharVectorizer] - Creating vocabulary.


Vectorizer number of tokens: 28


## Encode Labels
The labels are strings ('pos' / 'neg'). We will convert them to numbers.

In [3]:
from sklearn.preprocessing import label_binarize

y_train = label_binarize(train_labels, classes=['neg', 'pos'])
y_test = label_binarize(test_labels, classes=['neg', 'pos'])

## Keep a Validation Set

In [4]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, y_train, y_val = train_test_split(
    train_texts, y_train, stratify=train_labels, shuffle=True, random_state=45)

## Vectorize Data
We apply the fitted vectorizer onto the train and test texts. 
We keep `max_tokens` per text and `max_characters` per token.

**Attention**: We *must* pass the same shape `(max_tokens, max_characters)` 
when converting different text sets. If we don't, then it is almost certain 
that the results will have different numbers of columns because the sets are 
likely to have different number of tokens.

In [5]:
batch_size = 50
max_tokens, max_characters = 1000, 10
train_generator = char_vectorizer.texts_to_vectors_generator(
    train_texts,
    y_train,
    shape=(max_tokens, max_characters),
    batch_size=batch_size)

val_generator = char_vectorizer.texts_to_vectors_generator(
    val_texts,
    y_val,
    shape=(max_tokens, max_characters),
    batch_size=batch_size)

# Neural Network
Here we define a toy network for demonstration purpose.

In [6]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Embedding, TimeDistributed, \
    Dropout, Flatten, Dense

chars_input = Input(
    shape=(
        max_tokens,
        max_characters,
    ), name='Input', dtype='int32')
chars_embeddings = TimeDistributed(
    Embedding(
        input_dim=char_vectorizer.num_tokens,
        output_dim=20,
        input_length=max_characters,
        mask_zero=False,
        trainable=True),
    name='Embeddings')(chars_input)
x = Dropout(0.4, name='Input_Dropout')(chars_embeddings)
x = Flatten(name='Flatten')(x)
x = Dropout(0.4, name='Dropout')(x)
predictions = Dense(1, activation='sigmoid', name='Predictions')(x)
model = Model(chars_input, predictions)
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 1000, 10)]        0         
_________________________________________________________________
Embeddings (TimeDistributed) (None, 1000, 10, 20)      560       
_________________________________________________________________
Input_Dropout (Dropout)      (None, 1000, 10, 20)      0         
_________________________________________________________________
Flatten (Flatten)            (None, 200000)            0         
_________________________________________________________________
Dropout (Dropout)            (None, 200000)            0         
_________________________________________________________________
Predictions (Dense)          (None, 1)                 200001    
Total params: 200,561
Trainable params: 200,561
Non-trainable params: 0
_______________________________________________________

## Train Model

In [7]:
train_steps = len(train_texts) // batch_size
if train_steps % batch_size:
    train_steps += 1
val_steps = len(val_texts) // batch_size
if val_steps % batch_size:
    val_steps += 1

history = model.fit(x=train_generator,
                    steps_per_epoch=train_steps,
                    validation_data=val_generator,
                    validation_steps=val_steps,
                    epochs=5)

  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


Train for 376 steps, validate for 126 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Model Evaluation

In [None]:
import numpy as np
from sklearn.metrics import classification_report

steps = len(test_texts) // batch_size
if len(test_texts) % batch_size:
    steps += 1
test_generator = char_vectorizer.texts_to_vectors_generator(
    test_texts,
    y_test,
    shape=(max_tokens, max_characters),
    batch_size=batch_size)
predictions = model.predict(test_generator, steps=steps, verbose=1)
y_pred = np.round(predictions)
print(
    classification_report(
        y_test, y_pred, digits=4, target_names=['neg', 'pos']))

  5/500 [..............................] - ETA: 28s

## Plot Accuracy and Loss

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(12, 6))
ax1.plot(history.epoch, history.history['acc'], label='Training')
ax1.plot(history.epoch, history.history['val_acc'], label='Validation')
ax1.set_title('Accuracy')

ax2.plot(history.epoch, history.history['loss'], label='Training')
ax2.plot(history.epoch, history.history['val_loss'], label='Validation')
ax2.set_title('Loss')
ax2.set_xlabel('epochs')
ax2.legend()

plt.show()