# Text classification IMDb

Based on the Keras example "Text classification from scratch":
https://keras.io/examples/nlp/text_classification_from_scratch/

Dataset:
https://ai.stanford.edu/~amaas/data/sentiment/

In [2]:
import tensorflow as tf
import numpy as np

### Gathering the data and splitting it

In [4]:
batch_size = 32

# Options shared by the two loaders that read "aclImdb/train": both receive
# the same validation_split and seed so they describe the same partition.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

# Training portion of aclImdb/train.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train", subset="training", **split_options
)

# Held-out validation portion of the same directory.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train", subset="validation", **split_options
)

# The test directory is loaded whole, without any split.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)

# Report how many batches each dataset yields.
for ds_name, dataset in (
    ("raw_train_ds", raw_train_ds),
    ("raw_val_ds", raw_val_ds),
    ("raw_test_ds", raw_test_ds),
):
    print(f"Number of batches in {ds_name}: {dataset.cardinality()}")


Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


### Some samples

In [5]:
# Peek at the first five reviews (truncated to 150 bytes) of one training batch.
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once instead of once per printed line.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    for i in range(5):
        print(f"text: {texts[i][:150]}...")
        print(f"label: {labels[i]}\n")


text: b"I've seen tons of science fiction from the 70s; some horrendously bad, and others thought provoking and truly frightening. Soylent Green fits into the"...
label: 1

text: b"First than anything, I'm not going to praise I\xc3\xb1arritu's short film, even I'm Mexican and proud of his success in mainstream Hollywood.<br /><br />In "...
label: 1

text: b'Blood Castle (aka Scream of the Demon Lover, Altar of Blood, Ivanna--the best, but least exploitation cinema-sounding title, and so on) is a very trad'...
label: 1

text: b'I was talked into watching this movie by a friend who blubbered on about what a cute story this was.<br /><br />Yuck.<br /><br />I want my two hours b'...
label: 0

text: b'Michelle Rodriguez is the defining actress who could be the charging force for other actresses to look out for. She has the audacity to place herself '...
label: 1



### Preparing the data
Lowercase the reviews and strip the `<br />` tags and punctuation before vectorizing — bye tags 👋👋

In [6]:
from tensorflow.keras.layers import TextVectorization
import string
import re


def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies three steps in order: lowercase the input, replace every
    ``<br />`` tag with a single space, then delete every character listed
    in ``string.punctuation``.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor with the normalized text.
    """
    # Character class matching any single punctuation character.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_class, "")


# constants
max_features = 20_000  # vocabulary size cap (TextVectorization max_tokens)
embedding_dim = 128  # width of each embedding vector
sequence_length = 500  # number of token ids produced per review


# Maps raw strings to fixed-length sequences of integer token ids, using
# custom_standardization to clean the text first.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)


# Drop the labels so the layer sees text only, then build the vocabulary
# from the training split.
text_ds = raw_train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

### Vectorization

In [15]:
from tensorflow.keras import layers

# Symbolic walk-through of the preprocessing: string -> token ids -> embeddings.
text_tensor = tf.keras.Input(name="text", shape=(1,), dtype=tf.string)
x1 = vectorize_layer(text_tensor)
# NOTE(review): this demo uses input_dim=max_features + 1 while the model cell
# further down uses input_dim=max_features — confirm which one is intended.
x2 = layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dim)(x1)

# Show the symbolic tensor produced at each stage.
print("text_tensor", text_tensor)
print("\nx1", x1)
print("\nx2", x2)


text_tensor KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.string, name='text'), name='text', description="created by layer 'text'")

x1 KerasTensor(type_spec=TensorSpec(shape=(None, 500), dtype=tf.int64, name=None), name='text_vectorization/RaggedToTensor/RaggedTensorToTensor:0', description="created by layer 'text_vectorization'")

x2 KerasTensor(type_spec=TensorSpec(shape=(None, 500, 128), dtype=tf.float32, name=None), name='embedding_2/embedding_lookup/Identity_1:0', description="created by layer 'embedding_2'")


In [32]:
def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label).

    Args:
        text: String tensor of raw reviews; a trailing axis is appended
            before it is passed to ``vectorize_layer``.
        label: Label tensor, returned unchanged.

    Returns:
        Tuple of the vectorized text and the original label.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# Vectorize the data: every raw split goes through vectorize_text.
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text) for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

# Dataset description right after mapping.
print("train_ds", train_ds)

# Do async prefetching / buffering of the data for best performance on GPU.
train_ds, val_ds, test_ds = (
    mapped_ds.cache().prefetch(buffer_size=10)
    for mapped_ds in (train_ds, val_ds, test_ds)
)

# Dataset description after caching and prefetching were added.
print("train_ds", train_ds)


train_ds <MapDataset shapes: ((None, 500), (None,)), types: (tf.int64, tf.int32)>
train_ds <PrefetchDataset shapes: ((None, 500), (None,)), types: (tf.int64, tf.int32)>


### Making a model

In [46]:
# The model consumes integer token ids of any sequence length.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Token ids -> dense vectors, followed by dropout.
x1 = layers.Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
x2 = layers.Dropout(rate=0.5)(x1)

print("x1", x1)
print("x2", x2)

# Conv1D + global max pooling: two identically configured strided convolutions.
conv_options = dict(
    filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
)
x = layers.Conv1D(**conv_options)(x2)
x = layers.Conv1D(**conv_options)(x)
x = layers.GlobalMaxPooling1D()(x)


# A vanilla hidden layer with dropout on top of the pooled features.
x = layers.Dense(units=256, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)


# Project onto a single output unit and squash it with a sigmoid.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs=inputs, outputs=predictions)

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


x1 KerasTensor(type_spec=TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None), name='embedding_9/embedding_lookup/Identity_1:0', description="created by layer 'embedding_9'")
x2 KerasTensor(type_spec=TensorSpec(shape=(None, None, 128), dtype=tf.float32, name=None), name='dropout_8/Identity:0', description="created by layer 'dropout_8'")


### Training

In [None]:
epochs = 3

# Fit on the training dataset and monitor the validation dataset each epoch.
model.fit(x=train_ds, epochs=epochs, validation_data=val_ds)


Epoch 1/3
Epoch 2/3
Epoch 3/3

In [41]:
# Score the trained model on the vectorized test split; the cell output is the
# loss followed by the accuracy metric the model was compiled with.
model.evaluate(x=test_ds)



[0.4653022587299347, 0.8500000238418579]

### End to end
This model takes raw strings as input: the vectorization layer is placed in front of the trained model.

In [45]:
# Raw strings enter here, one string per example.
inputs = tf.keras.Input(shape=(1,), dtype="string")
# The already-adapted vectorization layer converts them to vocab indices.
indices = vectorize_layer(inputs)
# The model defined earlier maps those indices to predictions.
outputs = model(indices)

# Wrap both steps into a single model that goes from string to prediction.
end_to_end_model = tf.keras.Model(inputs=inputs, outputs=outputs)
end_to_end_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)

# `raw_test_ds` yields raw strings, so it can be evaluated without vectorize_text.
end_to_end_model.evaluate(x=raw_test_ds)



[0.46530210971832275, 0.8500000238418579]