In [1]:
import os
# Set before TensorFlow is imported so the setting is already in place when
# its native runtime starts; level '3' suppresses TensorFlow's C++ INFO,
# WARNING and ERROR log messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
# Batching happens here, when the datasets are created. `Model.fit` does not
# re-batch `tf.data.Dataset` inputs, so this is the only place where a batch
# size can take effect.
batch_size = 64

# 80/20 train/validation split of the files under `aclImdb/train`; the fixed
# seed keeps the shuffling and the split reproducible between runs.
train_ds, val_ds = keras.utils.text_dataset_from_directory(
    './aclImdb/train/',
    batch_size=batch_size,
    validation_split=0.2,
    subset='both',
    seed=225,
)

# Files under `aclImdb/test`, loaded with the same batch size and seed.
test_ds = keras.utils.text_dataset_from_directory(
    './aclImdb/test/',
    batch_size=batch_size,
    seed=225,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [3]:
# Maximum vocabulary size, and the fixed number of token ids produced for
# every review.
max_tokens, max_length = 25000, 600

# Maps raw strings to integer token-id sequences of length `max_length`.
vectorizer_options = {
    'max_tokens': max_tokens,
    'output_mode': 'int',
    'output_sequence_length': max_length,
}
text_vectorizer = keras.layers.TextVectorization(**vectorizer_options)

# Build the vocabulary from the training texts only; labels are dropped
# because `adapt` needs just the strings.
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorizer.adapt(text_only_train_ds)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [4]:
def _to_token_ids(text, label):
    """Replace the raw text with its integer token ids; pass the label through."""
    return text_vectorizer(text), label


# Apply the same vectorization to every split, letting tf.data choose the
# degree of parallelism.
int_train_ds, int_val_ds, int_test_ds = (
    split.map(_to_token_ids, num_parallel_calls=tf.data.AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

In [5]:
layers = keras.layers

# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype='int64')

# One trainable 256-dimensional vector per vocabulary id.
embeded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)

# Two stacked bidirectional LSTMs: the first returns its output at every
# timestep so the second has a sequence to read; the second returns only its
# final output. Dropout is applied before the classifier.
x = layers.Bidirectional(layers.LSTM(units=32, return_sequences=True))(embeded)
x = layers.Bidirectional(layers.LSTM(units=32))(x)
x = layers.Dropout(rate=0.5)(x)

# Single sigmoid unit for the two-class problem.
outputs = layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 256)         6400000   
                                                                 
 bidirectional (Bidirectiona  (None, None, 64)         73984     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               24832     
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65    

In [7]:
# Train for 10 epochs, validating after each one.
# No `batch_size` is passed: the inputs are already-batched `tf.data.Dataset`
# objects, and Keras does not re-batch them, so the argument would have no
# effect. The batch size is whatever was used when the datasets were created.
# ModelCheckpoint is left at its defaults apart from `save_weights_only`, so
# the weights file is rewritten at the end of every epoch.
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=[
        keras.callbacks.ModelCheckpoint(
            'checkpoints/bilstm.h5',
            save_weights_only=True,
        ),
    ],
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1445450df0>

In [6]:
# NOTE(review): this calls `fit` on the same `model` object as the training
# cell above it, and the recorded execution counts are out of order. On a
# top-to-bottom run this continues training from the weights the earlier cell
# left behind rather than starting fresh - confirm that is intended, or
# rebuild the model first.
#
# No `batch_size` is passed: the inputs are already-batched `tf.data.Dataset`
# objects, and Keras does not re-batch them, so the argument would have no
# effect. The batch size is whatever was used when the datasets were created.
# ModelCheckpoint is left at its defaults apart from `save_weights_only`, so
# the weights file is rewritten at the end of every epoch.
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=[
        keras.callbacks.ModelCheckpoint(
            'checkpoints/2-layer-bilstm.h5',
            save_weights_only=True,
        ),
    ],
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9c255a6670>