In [1]:
import tensorflow

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Toy corpus for the tokenizer demo: four short sentences that mix upper/lower
# case and trailing punctuation ("You", "dog!", "amazing?") and reuse most of
# their words.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [4]:
# Build the vocabulary from the toy corpus.
#   * num_words=9  -> when texts are later converted to sequences, only word
#     indices below 9 are emitted; every other word is replaced by the OOV
#     index (see the `sequences` output further down).
#   * oov_token    -> placeholder token; it is assigned index 1 in word_index.
tokenizer = Tokenizer(
    num_words=9,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(sentences)

# word_index is NOT truncated by num_words: it lists every word seen during
# fitting (lower-cased, punctuation stripped), most frequent first.
tokenizer.word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

In [5]:
# Raw occurrence count of each (lower-cased) word across the fitted texts,
# listed in the order the words were first seen. Like word_index, this is not
# limited by num_words, and it does not contain the OOV token.
tokenizer.word_counts

OrderedDict([('i', 2),
             ('love', 3),
             ('my', 4),
             ('dog', 3),
             ('cat', 1),
             ('you', 2),
             ('do', 1),
             ('think', 1),
             ('is', 1),
             ('amazing', 1)])

In [6]:
# Number of texts the tokenizer was fitted on (one per entry in `sentences`).
tokenizer.document_count

4

In [7]:
# Encode each sentence as a list of word indices. Because the tokenizer was
# built with num_words=9, words whose index is 9 or higher ('think', 'is',
# 'amazing') are emitted as 1, the <OOV> index, even though they were seen
# during fitting.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 1, 2, 4, 1, 1]]

In [8]:
# Default padding: every row is left-padded with 0 up to the length of the
# longest sequence (7 here), producing a rectangular array.
padded = pad_sequences(sequences)
padded

array([[0, 0, 0, 5, 3, 2, 4],
       [0, 0, 0, 5, 3, 2, 7],
       [0, 0, 0, 6, 3, 2, 4],
       [8, 6, 1, 2, 4, 1, 1]])

In [9]:
# Fixed width of 5: shorter rows are still left-padded with 0, and the longer
# row is truncated from the front, so only its last five tokens survive.
padded = pad_sequences(sequences, maxlen=5)
padded

array([[0, 5, 3, 2, 4],
       [0, 5, 3, 2, 7],
       [0, 6, 3, 2, 4],
       [1, 2, 4, 1, 1]])

In [10]:
# padding="post" moves the 0 padding to the end of each row. Truncation is
# unaffected: the long row still loses its leading tokens.
padded = pad_sequences(sequences, maxlen=5, padding="post")
padded

array([[5, 3, 2, 4, 0],
       [5, 3, 2, 7, 0],
       [6, 3, 2, 4, 0],
       [1, 2, 4, 1, 1]])

In [11]:
# truncating="post" drops tokens from the end instead, so the long row keeps
# its first five tokens; padding="post" still appends the 0s to short rows.
padded = pad_sequences(sequences, maxlen=5, padding="post", truncating="post")
padded

array([[5, 3, 2, 4, 0],
       [5, 3, 2, 7, 0],
       [6, 3, 2, 4, 0],
       [8, 6, 1, 2, 4]])