# Steps in Text Preprocessing:
    1. Standardizing text
    2. Tokenizing text
    3. Indexing (convert into numerical vector)

# 11.2.3 Vocabulary Indexing

This is the step after standardizing and tokenizing. It builds an index of all terms found in the data, a.k.a. the **vocabulary**, mapping each distinct token to an integer.

In [1]:
# Build the token -> integer-index mapping over the whole corpus.
# `dataset`, `standardize` and `tokenize` are not defined in this cell and
# are expected to be provided elsewhere.
# The vocabulary must be a dict: a tuple does not support item assignment.
vocabulary = {}
for text in dataset:
    text = standardize(text)
    tokens = tokenize(text)
    for token in tokens:
        if token not in vocabulary:
            # the next free index equals the number of tokens indexed so far
            vocabulary[token] = len(vocabulary)

In [None]:
# convert this into a vector encoding:
def one_hot_encode_token(token):
    """Return the one-hot vector of `token` under the global `vocabulary`.

    The result is a 1-D NumPy array of length ``len(vocabulary)`` that is 0
    everywhere except for a 1 at position ``vocabulary[token]``. The lookup
    uses plain indexing, so a token missing from a dict vocabulary raises
    ``KeyError``.
    """
    # Function-scope import: numpy is not imported at the top of this file.
    import numpy as np

    vector = np.zeros((len(vocabulary),))
    token_index = vocabulary[token]
    vector[token_index] = 1
    return vector

# 11.2.4 Using the TextVectorization Layer

## (i) Implementing from scratch using pure Python

In [6]:
import string
class Vectorizer:
    """Minimal pure-Python text vectorizer: standardize, tokenize, index.

    Call `make_vocabulary` on a corpus first; `encode` and `decode` read the
    `vocabulary` / `inverse_vocabulary` attributes that it creates.
    """

    # Reserved entries: index 0 is the empty string and index 1 stands in for
    # any token that is not in the vocabulary.
    _PAD_TOKEN = ""
    _UNK_TOKEN = "[UNK]"
    _UNK_INDEX = 1
    # Translation table that deletes every character in string.punctuation.
    _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

    def standardize(self, text):
        """Lowercase `text` and remove all ASCII punctuation characters."""
        return text.lower().translate(self._STRIP_PUNCTUATION)

    def tokenize(self, text):
        """Standardize `text` and split it on whitespace into a token list."""
        return self.standardize(text).split()

    def make_vocabulary(self, dataset):
        """Index every distinct token of `dataset` (an iterable of strings).

        Sets `self.vocabulary` (token -> index, in order of first appearance
        after the two reserved entries) and `self.inverse_vocabulary`
        (index -> token).
        """
        self.vocabulary = {self._PAD_TOKEN: 0, self._UNK_TOKEN: self._UNK_INDEX}
        for text in dataset:
            # tokenize() standardizes internally, so no separate pass is needed
            for token in self.tokenize(text):
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()
        }

    def encode(self, text):
        """Return the list of vocabulary indices for the tokens of `text`.

        Tokens that are not in the vocabulary map to the [UNK] index (1).
        """
        return [
            self.vocabulary.get(token, self._UNK_INDEX)
            for token in self.tokenize(text)
        ]

    def decode(self, int_sequence):
        """Join the tokens for `int_sequence` with single spaces.

        Indices that are not in the inverse vocabulary decode to "[UNK]".
        """
        return " ".join(
            self.inverse_vocabulary.get(i, self._UNK_TOKEN) for i in int_sequence
        )

In [7]:
# Fit the vocabulary on a tiny three-line corpus.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

In [8]:
# encoding and decoding sample text
# ("still" does not occur in the corpus the vocabulary was built from,
#  so encode() maps it to index 1, the [UNK] entry)
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [9]:
# map the integer sequence back to space-joined tokens
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


## (ii) Using a TextVectorization layer

### Customizing the TextVectorization layer with your own standardization and split functions

In [None]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
    """Lowercase the strings and delete every ASCII punctuation character."""
    # character class matching any single character of string.punctuation
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    lowered = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")

def custom_split_fn(string_tensor):
    """Split the strings into tokens with tf.strings.split's default separator."""
    tokens = tf.strings.split(string_tensor)
    return tokens

# Reach the layer through the already-imported `tf` namespace: the bare name
# `TextVectorization` is only imported in a later cell, so using it here would
# raise NameError when the notebook is run top to bottom.
text_vectorization = tf.keras.layers.TextVectorization(
    output_mode="int",  # emit sequences of integer token indices
    standardize=custom_standardization_fn,
    split=custom_split_fn,
)

### Using the keras implementation

In [12]:
from tensorflow.keras.layers import TextVectorization
# default standardization and splitting; output is a sequence of integer indices
text_vectorization = TextVectorization(output_mode="int")

In [13]:
# to index the vocabulary of a text corpus,
# call the layer's adapt() method on that corpus

dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms"
]
text_vectorization.adapt(dataset)

In [14]:
# retrieve the vocabulary using the get_vocabulary() method;
# the list position of each token is its integer index, so it can be used
# to decode encoded sentences
text_vectorization.get_vocabulary()
# sorted by frequency, after the reserved '' and '[UNK]' entries
# NOTE(review): ordering among equally frequent tokens is not shown here — confirm in the Keras docs

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [17]:
# Round-trip a sample sentence through the adapted layer.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

# The vocabulary list is index-ordered, so enumerate() gives the id -> token map.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = dict(enumerate(vocabulary))
decoded_tokens = [inverse_vocab[int(token_id)] for token_id in encoded_sentence]
decoded_sentence = " ".join(decoded_tokens)
print(decoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
i write rewrite and [UNK] rewrite again


# 11.3.1 Preparing the IMDB movie reviews data

In [20]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  2790k      0  0:00:29  0:00:29 --:--:-- 3883k


In [21]:
# removing an unrequired subdirectory:
# the later cells only use the neg/ and pos/ folders under train/
!rm -r data/aclImdb/train/unsup

In [1]:
# seeing one of the samples (a raw review file from the train/pos folder)
!cat data/aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

In [2]:
# prepare a validation set
# using 20% of training data

import os, pathlib, shutil, random
base_dir = pathlib.Path("data/aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

# Move 20% of the training files of each class into a new val/<class>/ folder.
for category in ("neg", "pos"):
    # No exist_ok here: re-running the cell raises FileExistsError instead of
    # moving a further 20% of the remaining training files.
    os.makedirs(val_dir / category)
    # os.listdir() returns entries in arbitrary order; sort first so the
    # seeded shuffle selects the same files on every machine and run.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-0:] would select *every*
    # file when num_val_samples is 0 (fewer than 5 files in the folder).
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

In [5]:
# Create dataset objects
from tensorflow import keras
batch_size = 32

# One batched dataset of (text, label) pairs per split directory.
train_ds = keras.utils.text_dataset_from_directory(
    directory="data/aclImdb/train/",
    batch_size=batch_size,
)

val_ds = keras.utils.text_dataset_from_directory(
    directory="data/aclImdb/val/",
    batch_size=batch_size,
)

test_ds = keras.utils.text_dataset_from_directory(
    directory="data/aclImdb/test/",
    batch_size=batch_size,
)

Found 20000 files belonging to 2 classes.


2023-04-09 13:09:51.626944: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-09 13:09:51.636359: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [6]:
# Show shapes, dtypes and the first example of the first batch only.
for inputs, targets in train_ds.take(1):
    first_batch_report = (
        ("Input shape: ", inputs.shape),
        ("Inputs dtype: ", inputs.dtype),
        ("Targets shape: ", targets.shape),
        ("Targets dtype: ", targets.dtype),
        ("Inputs[0]: ", inputs[0]),
        ("Targets[0]: ", targets[0]),
    )
    for caption, shown in first_batch_report:
        print(caption, shown)

Input shape:  (32,)
Inputs dtype:  <dtype: 'string'>
Targets shape:  (32,)
Targets dtype:  <dtype: 'int32'>
Inputs[0]:  tf.Tensor(b'The team of Merian Cooper and Ernest Schoedsack produced a documentary of 50,000 Bakhtiari people and their animals on the Summer migration to winter grazing. The basic worth of this film today is as a time capsule of a "forgotten people" and how they lived during what we in the West knew as the "roaring twenties." A more drastic contrast could not be imagined. Raging river and barefoot mountain crossings are brutally realistic and the animals that disappear under the water do in fact die. To make sure that the audience of the time believed that the story took place, a signed certificate of authenticity is offered up at the end. The version that I saw had fascinating Iranian music that can stand alone and be appreciated without the film. Having said all this, the film is probably of more value to the anthropologist than the casual viewer in search of a goo

# 11.3.2 Processing words as a set: Bag-Of-Words

In [12]:
# Encode each review as a multi-hot vector over a vocabulary capped at 20000 terms.
text_vectorization = keras.layers.TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot",
)

# Build the vocabulary from the training texts only (labels are dropped).
text_only_train_ds = train_ds.map(lambda text, _label: text)
text_vectorization.adapt(text_only_train_ds)


def _to_binary_1gram(text, label):
    """Vectorize a batch of texts and pass its labels through unchanged."""
    return text_vectorization(text), label


# Preprocessed versions of the three splits.
binary_1gram_train_ds = train_ds.map(_to_binary_1gram, num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(_to_binary_1gram, num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(_to_binary_1gram, num_parallel_calls=4)

In [13]:
# Show shapes, dtypes and the first example of the first vectorized batch.
for inputs, targets in binary_1gram_train_ds.take(1):
    vectorized_batch_report = (
        ("Input shape: ", inputs.shape),
        ("Inputs dtype: ", inputs.dtype),
        ("Targets shape: ", targets.shape),
        ("Targets dtype: ", targets.dtype),
        ("Inputs[0]: ", inputs[0]),
        ("Targets[0]: ", targets[0]),
    )
    for caption, shown in vectorized_batch_report:
        print(caption, shown)

Input shape:  (32, 20000)
Inputs dtype:  <dtype: 'float32'>
Targets shape:  (32,)
Targets dtype:  <dtype: 'int32'>
Inputs[0]:  tf.Tensor([1. 1. 0. ... 0. 0. 0.], shape=(20000,), dtype=float32)
Targets[0]:  tf.Tensor(0, shape=(), dtype=int32)


In [14]:
# create a reusable model-building function
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dims=16):
    """Build and compile a small dense binary classifier.

    Args:
        max_tokens: length of each input vector.
        hidden_dims: number of units in the single hidden Dense layer.

    Returns:
        A compiled keras.Model with one sigmoid output unit, trained with
        rmsprop on binary cross-entropy and reporting accuracy.
    """
    inputs = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dims, activation="relu")(inputs)
    regularized = layers.Dropout(0.5)(hidden)
    outputs = layers.Dense(1, activation="sigmoid")(regularized)

    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [15]:
# Train on the unigram multi-hot data, then score on the test split.
model = get_model()
model.summary()

# Checkpoint to disk, overwriting only when the monitored metric improves.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    "models/binary_1gram.keras",
    save_best_only=True,
)
callbacks = [best_checkpoint]

# cache() stores the preprocessed batches in memory, so the text
# vectorization runs once rather than on every epoch.
model.fit(
    binary_1gram_train_ds.cache(),
    validation_data=binary_1gram_val_ds.cache(),
    epochs=5,
    callbacks=callbacks,
)

# Evaluate the saved checkpoint rather than the final-epoch weights.
model = keras.models.load_model("models/binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test acc: 0.884


## Bigrams with binary encoding

In [16]:
# Multi-hot encoding again, this time with ngrams=2 so bigrams are indexed too.
text_vectorization = layers.TextVectorization(
    ngrams=2, max_tokens=20000, output_mode="multi_hot"
)

In [17]:
# Build the bigram vocabulary from the training texts.
text_vectorization.adapt(text_only_train_ds)


def _to_binary_2gram(text, label):
    """Vectorize a batch of texts and pass its labels through unchanged."""
    return text_vectorization(text), label


binary_2gram_train_ds = train_ds.map(_to_binary_2gram, num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(_to_binary_2gram, num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(_to_binary_2gram, num_parallel_calls=4)

In [18]:
# Train on the bigram multi-hot data, then score on the test split.
model = get_model()
model.summary()

# Checkpoint to disk, overwriting only when the monitored metric improves.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    "models/binary_2gram.keras",
    save_best_only=True,
)
callbacks = [best_checkpoint]

model.fit(
    binary_2gram_train_ds.cache(),
    validation_data=binary_2gram_val_ds.cache(),
    epochs=5,
    callbacks=callbacks,
)

# Evaluate the saved checkpoint rather than the final-epoch weights.
model = keras.models.load_model("models/binary_2gram.keras")
print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                320016    
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test acc: 0.900


## Bigrams with TF-IDF Encoding

In [20]:
# Same bigram setup, but with TF-IDF output instead of multi-hot.
text_vectorization = layers.TextVectorization(
    ngrams=2, max_tokens=20000, output_mode="tf_idf"
)

In [21]:
# Fit the layer on the training texts.
text_vectorization.adapt(text_only_train_ds)


def _to_tfidf_2gram(text, label):
    """Vectorize a batch of texts and pass its labels through unchanged."""
    return text_vectorization(text), label


tfidf_2gram_train_ds = train_ds.map(_to_tfidf_2gram, num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(_to_tfidf_2gram, num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(_to_tfidf_2gram, num_parallel_calls=4)

# Train on the TF-IDF bigram data, then score on the test split.
model = get_model()
model.summary()

# Checkpoint to disk, overwriting only when the monitored metric improves.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    "models/tfidf_2gram.keras",
    save_best_only=True,
)
callbacks = [best_checkpoint]

model.fit(
    tfidf_2gram_train_ds.cache(),
    validation_data=tfidf_2gram_val_ds.cache(),
    epochs=5,
    callbacks=callbacks,
)

# Evaluate the saved checkpoint rather than the final-epoch weights.
model = keras.models.load_model("models/tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_6 (Dense)             (None, 16)                320016    
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test acc: 0.883
