In [1]:
%tensorflow_version 2.x
import tensorflow as tf

TensorFlow 2.x selected.


In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer # to split the sentence and create the tokens
from tensorflow.keras.preprocessing.sequence import pad_sequences # in order to add padding into integer coded word sequence

In [0]:
# Tiny toy corpus used to demonstrate tokenization and padding.
sentences = [
    'I love you.',
    'I hate this cat and that bunny.',
    'I am here for you.',
]

In [0]:
# Vocabulary-size cap handed to the tokenizer; 20000 is a reasonable value.
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)

# Build the vocabulary from the sentences, then encode each sentence
# as a list of integer word ids.
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [10]:
# Show the encoded sentences: one list of integer word ids per input sentence,
# in the same order as the words appear in that sentence.
print(sequences)

[[1, 3, 2], [1, 4, 5, 6, 7, 8, 9], [1, 10, 11, 12, 2]]


In [11]:
# How to get the word-to-index mapping?
# `word_index` is a dict keyed by word; as the output shows, the keys are
# lower-cased with punctuation removed, and the ids here start at 1.
tokenizer.word_index

{'am': 10,
 'and': 7,
 'bunny': 9,
 'cat': 6,
 'for': 12,
 'hate': 4,
 'here': 11,
 'i': 1,
 'love': 3,
 'that': 8,
 'this': 5,
 'you': 2}

In [12]:
# Use the defaults: as the printed array shows, each sequence is padded with
# zeros at the front until it is as long as the longest sequence.
data = pad_sequences(sequences)
print(data)

[[ 0  0  0  0  1  3  2]
 [ 1  4  5  6  7  8  9]
 [ 0  0  1 10 11 12  2]]


In [19]:
# Use the longest sequence as the maximum sequence length.
# This is not mandatory: sequences could instead be truncated to a shorter
# length before padding, but this simple example keeps every token.
MAX_SEQUENCE_LENGTH = max(len(s) for s in sequences)
# Print the length explicitly so the cell reproduces its recorded output
# (the value followed by the padded array) on a fresh run.
print(MAX_SEQUENCE_LENGTH)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)

7
[[ 0  0  0  0  1  3  2]
 [ 1  4  5  6  7  8  9]
 [ 0  0  1 10 11 12  2]]


In [20]:
# What if the padding is added at the end of each sequence instead of the start?
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print(data)

[[ 1  3  2  0  0  0  0]
 [ 1  4  5  6  7  8  9]
 [ 1 10 11 12  2  0  0]]


In [21]:
# What if maxlen is larger than the longest sequence?
# Every row, including the longest one, gets extra leading zeros.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH + 2,
)
print(data)

[[ 0  0  0  0  0  0  1  3  2]
 [ 0  0  1  4  5  6  7  8  9]
 [ 0  0  0  0  1 10 11 12  2]]


In [25]:
# Padding combined with truncation: maxlen is shorter than the longest sequence,
# so that sequence loses its first token while shorter ones are still padded.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH - 1,
)
print(data)

[[ 0  0  0  1  3  2]
 [ 4  5  6  7  8  9]
 [ 0  1 10 11 12  2]]


In [29]:
# Truncate over-long sequences from the start, and pad short sequences
# at the end (padding='post') rather than at the front.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH - 1,
    padding='post',
)
print(data)

[[ 1  3  2  0  0  0]
 [ 4  5  6  7  8  9]
 [ 1 10 11 12  2  0]]
