# Text classification IMDb

Based on this Keras example:
https://keras.io/examples/nlp/text_classification_from_scratch/

Dataset:
https://ai.stanford.edu/~amaas/data/sentiment/

In [3]:
import tensorflow as tf
import numpy as np

### Gathering the data and splitting it

In [10]:
batch_size = 32

# Keyword arguments shared by the two subsets carved out of aclImdb/train.
# Both calls must see the same validation_split and seed so that the
# "training" and "validation" subsets come from the same shuffle.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

# 80% of aclImdb/train, used for fitting.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train", subset="training", **split_options
)

# Remaining 20% of aclImdb/train, held out for validation.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train", subset="validation", **split_options
)

# The test directory is loaded whole (no split).
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)

# Report how many batches each dataset yields.
for split_name, split_ds in (
    ("raw_train_ds", raw_train_ds),
    ("raw_val_ds", raw_val_ds),
    ("raw_test_ds", raw_test_ds),
):
    print(f"Number of batches in {split_name}: {split_ds.cardinality()}")


Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


### Some samples

In [20]:
# Show the first five reviews of one training batch, each truncated to its
# first 150 bytes, together with its integer label.
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once instead of on every iteration.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    for i in range(5):
        print(f"text: {texts[i][:150]!s}...")
        print(f"label: {labels[i]!s}\n")


text: b'This movie was well done in all respects. The acting is superb along with the fine audio soundtrack which I purchased because it was so moving. It is '...
label: 1

text: b'The first one was different and funny. This attempt should have never left the studio. This movie does not make you laugh. It is a weak attempt at gro'...
label: 0

text: b"This was one of the most boring movies I've ever seen\xc2\x85 I don't really know why\xc2\x85 Just your run-of-the-mill stories about guy who is about to get marr"...
label: 0

text: b'Her Deadly Rival (1995): Starring Harry Hamlin, Annie Potts, Lisa Zane, Tommy Hinkley, Susan Diol, Roma Maffia, Robert C. Treveiler, D. L. Anderson, W'...
label: 1

text: b'I saw the movie recently and really liked it. I surprised myself and cried. This movie is in the same niche genre as "Away from Her" - or even "The Bu'...
label: 1



### Preparing the data
bye tags 👋👋

In [28]:
from tensorflow.keras.layers import TextVectorization
import string
import re


def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies three steps in order using ``tf.strings`` ops:

    1. lowercase the text,
    2. replace every literal ``<br />`` with a single space,
    3. delete every character listed in ``string.punctuation``.

    Args:
        input_data: String tensor of raw text.

    Returns:
        String tensor with the cleaned text.
    """
    # Regex character class matching any single ASCII punctuation character;
    # re.escape keeps characters such as "]" and "\" literal inside the class.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_class, "")
    return text


# Hyperparameters
max_features = 20_000  # passed to TextVectorization as max_tokens (vocabulary cap)
embedding_dim = 128  # not used in this cell; presumably for a later Embedding layer
sequence_length = 500  # passed as output_sequence_length (tokens per review)


# Maps each review to a fixed-length sequence of integer token ids, after
# cleaning the text with custom_standardization.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)


# The vocabulary is learned from text only, so drop the labels before adapt().
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)