<a href="https://colab.research.google.com/github/aaryaP777/Convolutional-Neural-Networks/blob/main/Tensorflow_text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

In [3]:
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.utils import pad_sequences

In [4]:
# Toy corpus: four short sentences used to fit and exercise every
# TextVectorization layer in this notebook.
sentences = [
    "I am a hero.",
    "I like burgers and fries.",
    "I love movies and games.",
    "I hate monsoon."
]

In [18]:
# Maximum vocabulary size; passed as `max_tokens` to every TextVectorization
# layer created in this notebook.
MAX_VOCAB_SIZE = 20_000

In [19]:
# Default text-vectorization layer: only the vocabulary size is capped.
vectorization_layer = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
)

In [20]:
# Fit the layer: build its vocabulary from the example sentences.
vectorization_layer.adapt(sentences)

In [21]:
# Vectorize: every word of every sentence is replaced by its integer id.
# The result is a dense tensor with one row per sentence and one column per
# token of the longest sentence (here 4 x 5); shorter sentences are filled
# with 0 on the right.
sequences = vectorization_layer(sentences)
print(sequences)

tf.Tensor(
[[ 2 13 14  8  0]
 [ 2  7 12  3 11]
 [ 2  6  4  3 10]
 [ 2  9  5  0  0]], shape=(4, 5), dtype=int64)


In [22]:
# List the learned vocabulary. A token's position in this list is the integer
# id the layer emits for it; the first two entries are '' and '[UNK]'.
vectorization_layer.get_vocabulary()

['',
 '[UNK]',
 np.str_('i'),
 np.str_('and'),
 np.str_('movies'),
 np.str_('monsoon'),
 np.str_('love'),
 np.str_('like'),
 np.str_('hero'),
 np.str_('hate'),
 np.str_('games'),
 np.str_('fries'),
 np.str_('burgers'),
 np.str_('am'),
 np.str_('a')]

In [23]:
# Word-to-index mapping: invert the vocabulary list so that each token maps
# to its integer id (its position in the vocabulary).
word_index = {
    token: position
    for position, token in enumerate(vectorization_layer.get_vocabulary())
}
print(word_index)

{'': 0, '[UNK]': 1, np.str_('i'): 2, np.str_('and'): 3, np.str_('movies'): 4, np.str_('monsoon'): 5, np.str_('love'): 6, np.str_('like'): 7, np.str_('hero'): 8, np.str_('hate'): 9, np.str_('games'): 10, np.str_('fries'): 11, np.str_('burgers'): 12, np.str_('am'): 13, np.str_('a'): 14}


In [24]:
# Truncation: force every output row to exactly 3 token ids. Every example
# sentence has at least 3 tokens, so each row is simply cut to its first 3 ids.
vectorization_layer_truncated = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=3,
)

# Learn the vocabulary from the example sentences.
vectorization_layer_truncated.adapt(sentences)

# Convert the sentences to fixed-length integer sequences and show them.
sequences_truncated = vectorization_layer_truncated(sentences)
print(sequences_truncated)

tf.Tensor(
[[ 2 13 14]
 [ 2  7 12]
 [ 2  6  4]
 [ 2  9  5]], shape=(4, 3), dtype=int64)


In [25]:
# Ragged output: no padding is applied, so each row keeps the length of its
# own sentence and the result is a tf.RaggedTensor.
vectorization_layer_ragged = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True,
)

# Learn the vocabulary from the example sentences.
vectorization_layer_ragged.adapt(sentences)

# Convert the sentences to variable-length integer sequences and show them.
sequences_ragged = vectorization_layer_ragged(sentences)
print(sequences_ragged)

<tf.RaggedTensor [[2, 13, 14, 8], [2, 7, 12, 3, 11], [2, 6, 4, 3, 10], [2, 9, 5]]>


In [28]:
# Front ("pre") padding: turn the ragged rows into plain Python lists, then
# left-pad each row with zeros up to the length of the longest row.
# `padding="pre"` is spelled out so the cell does not depend on the default.
padded = pad_sequences(sequences_ragged.to_list(), padding="pre")
print(padded)

[[ 0  2 13 14  8]
 [ 2  7 12  3 11]
 [ 2  6  4  3 10]
 [ 0  0  2  9  5]]
