In [1]:
# Load TensorFlow and report the installed version, so the outputs shown in
# this notebook can be tied to a specific release.
import tensorflow as tf
print(tf.__version__)

2.18.0


In [2]:
from tensorflow.keras.layers import TextVectorization

In [3]:
# Toy corpus: three short sentences of unequal length (5, 5 and 3 words),
# which makes the effect of padding and truncation easy to see.
sentences = [
    "I like egg and ham.",
    "I love chocolate and bunnies.",
    "I hate onions.",
]

In [4]:
# Upper bound on the vocabulary size; passed as `max_tokens` to every
# TextVectorization layer built in this notebook.
MAX_VOCAP_SIZE = 20_000

In [5]:
# Text -> integer-id layer. Only the vocabulary cap is set; the other
# arguments are left at their defaults, listed here for reference:
#   standardize="lower_and_strip_punctuation"
#   split="whitespace"
#   output_mode="int"
vectorization_layer = TextVectorization(max_tokens=MAX_VOCAP_SIZE)

In [6]:
# Fit the layer: build its vocabulary from the toy corpus.
vectorization_layer.adapt(sentences)

In [7]:
# Vectorize: each sentence becomes a row of token ids. The result is a dense
# tensor, so the shorter sentence is filled with 0s at the end.
sequences = vectorization_layer(sentences)
print(sequences)

tf.Tensor(
[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 2  7  4  0  0]], shape=(3, 5), dtype=int64)


In [8]:
# Inspect the learned vocabulary. A token's position in this list is its id;
# the first two slots are the empty string (id 0) and '[UNK]' (id 1).
vectorization_layer.get_vocabulary()

['',
 '[UNK]',
 'i',
 'and',
 'onions',
 'love',
 'like',
 'hate',
 'ham',
 'egg',
 'chocolate',
 'bunnies']

In [9]:
# How do we get the word-to-index mapping?
# A token's id is its position in the vocabulary list, so enumerate the list
# and flip each (index, word) pair.
word2idx = {
    word: idx
    for idx, word in enumerate(vectorization_layer.get_vocabulary())
}
print(word2idx)

{'': 0, '[UNK]': 1, 'i': 2, 'and': 3, 'onions': 4, 'love': 5, 'like': 6, 'hate': 7, 'ham': 8, 'egg': 9, 'chocolate': 10, 'bunnies': 11}


In [13]:
# Truncation: output_sequence_length=3 fixes every output row at three ids,
# so the five-word sentences lose their last two tokens.
vectorization_layer_truncated = TextVectorization(
    output_sequence_length=3,
    max_tokens=MAX_VOCAP_SIZE,
)
vectorization_layer_truncated.adapt(sentences)  # fit: learn the vocabulary

# Vectorize the corpus and show the fixed-length result.
sequences_truncated = vectorization_layer_truncated(sentences)
print(sequences_truncated)

tf.Tensor(
[[ 2  6  9]
 [ 2  5 10]
 [ 2  7  4]], shape=(3, 3), dtype=int64)


In [14]:
# Ragged output (TF backend only): each row keeps its own length, so no
# padding is added.
vectorization_layer_ragged = TextVectorization(
    ragged=True,
    max_tokens=MAX_VOCAP_SIZE,
)
vectorization_layer_ragged.adapt(sentences)  # fit: learn the vocabulary

# Vectorize the corpus and show the variable-length result.
sequences_ragged = vectorization_layer_ragged(sentences)
print(sequences_ragged)


<tf.RaggedTensor [[2, 6, 9, 3, 8], [2, 5, 10, 3, 11], [2, 7, 4]]>


In [15]:
# Convert the ragged tensor into plain nested Python lists (one list of ids
# per sentence).
sequences_ragged.to_list()

[[2, 6, 9, 3, 8], [2, 5, 10, 3, 11], [2, 7, 4]]

In [18]:
# Pad at the front instead of the back. The TextVectorization layer itself
# does not support this, so the ragged ids are post-processed with
# pad_sequences.
from tensorflow.keras.utils import pad_sequences

# pad_sequences defaults, for reference:
#   maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0
# padding="pre" is already the default; it is spelled out here because
# front-padding is the whole point of this cell.
padded = pad_sequences(sequences_ragged.to_list(), padding="pre")
print(padded)

[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 0  0  2  7  4]]
