In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Toy corpus used throughout the notebook: three short sentences, one per entry.
sentences = [
    "I love my dog",
    "I, don't have a cat",
    "I love my family",
]

In [3]:
# Value passed as the tokenizer's `num_words` limit; the toy corpus in this
# notebook contains far fewer distinct words than this.
max_words = 100
tokenizer = Tokenizer(num_words=max_words)

In [4]:
# Learn the vocabulary from the corpus, keep the resulting word -> index
# mapping in `word_index`, and print it for inspection.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, "don't": 5, 'have': 6, 'a': 7, 'cat': 8, 'family': 9}


In [5]:
# Extend the corpus with a longer sentence that introduces new words and
# mentions "dog" twice.
# The membership check keeps this cell idempotent: the execution counts in this
# notebook are not sequential, so cells do get re-run, and a plain append would
# silently add a duplicate entry on every re-run.
extra_sentence = "I think my dog is the best dog for kids, don't you?"
if extra_sentence not in sentences:
    sentences.append(extra_sentence)

In [10]:
# Fit the tokenizer on the extended corpus and print the updated
# word -> index mapping.
# NOTE(review): `tokenizer` was already fitted on the first three sentences and
# is fitted here on the full list again, so those sentences appear to be counted
# twice. That would explain why the printed index ranks 'love' ahead of 'dog',
# whereas the freshly created tokenizer fitted on the same list later in this
# notebook ranks 'dog' ahead of 'love'. Confirm whether cumulative counts are
# intended; otherwise fit only the new sentence or recreate the tokenizer.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'my': 2, 'love': 3, 'dog': 4, "don't": 5, 'have': 6, 'a': 7, 'cat': 8, 'family': 9, 'think': 10, 'is': 11, 'the': 12, 'best': 13, 'for': 14, 'kids': 15, 'you': 16}


In [11]:
# Encode every sentence as a list of integer word indices taken from the
# fitted tokenizer's vocabulary.
sequences = tokenizer.texts_to_sequences(sentences)

In [12]:
# Show the encoded corpus: one list of word indices per input sentence.
print(sequences)

[[1, 3, 2, 4], [1, 5, 6, 7, 8], [1, 3, 2, 9], [1, 10, 2, 4, 11, 12, 13, 4, 14, 15, 5, 16]]


In [14]:
# Out-of-vocabulary (OOV) handling: rebuild the tokenizer with an explicit
# "<OOV>" placeholder token, fit it on the corpus from scratch, and refresh
# `word_index` from the new vocabulary.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

In [17]:
# Unseen sentences that contain words missing from the fitted vocabulary.
test_data = [
    "I love the beach",
    "Don't go to deep with the dog",
]

# Encode the training corpus and the unseen sentences with the same tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
test_seq = tokenizer.texts_to_sequences(test_data)

# Print the vocabulary first, then the encoded test sentences, so the indices
# in the second output can be looked up in the first.
print(tokenizer.word_index)
print(test_seq)

{'<OOV>': 1, 'i': 2, 'my': 3, 'dog': 4, 'love': 5, "don't": 6, 'have': 7, 'a': 8, 'cat': 9, 'family': 10, 'think': 11, 'is': 12, 'the': 13, 'best': 14, 'for': 15, 'kids': 16, 'you': 17}
[[2, 5, 13, 1], [6, 1, 1, 1, 1, 13, 4]]


In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Left-pad every sequence with zeros so that all rows end up as long as the
# longest sequence (padding="pre" is the default, spelled out here for clarity).
padded = pad_sequences(sequences, padding="pre")
print(padded)

[[ 0  0  0  0  0  0  0  0  2  5  3  4]
 [ 0  0  0  0  0  0  0  2  6  7  8  9]
 [ 0  0  0  0  0  0  0  0  2  5  3 10]
 [ 2 11  3  4 12 13 14  4 15 16  6 17]]


In [22]:
# Same padding, but with the zeros placed after the tokens instead of before
# them, which can be easier to read.
padded_post = pad_sequences(sequences, padding="post")
print(padded_post)

[[ 2  5  3  4  0  0  0  0  0  0  0  0]
 [ 2  6  7  8  9  0  0  0  0  0  0  0]
 [ 2  5  3 10  0  0  0  0  0  0  0  0]
 [ 2 11  3  4 12 13 14  4 15 16  6 17]]


In [24]:
# Cap every row at five tokens. Shorter rows are zero-padded at the end, while
# longer rows are cut from the front (truncating="pre", the default), so only
# the last five tokens of the longest sentence are kept.
# Pass truncating="post" to keep the first five tokens instead.
padded_post_max = pad_sequences(
    sequences,
    maxlen=5,
    padding="post",
    truncating="pre",
)
print(padded_post_max)

[[ 2  5  3  4  0]
 [ 2  6  7  8  9]
 [ 2  5  3 10  0]
 [ 4 15 16  6 17]]
