In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy training corpus: four short sentences with mixed casing and some
# trailing punctuation ("!" and "?"), used below to fit the tokenizer.
sentences = [
    "I love my family",
    "I love my laptop",
    "you love laptop!",
    "Do you think your laptop is nice?"
]

In [3]:
# Fit a word-level vocabulary on the corpus.
# - oov_token="<OOV>": words outside the usable vocabulary are encoded with
#   this placeholder, which takes index 1 (see the word_index output).
# - num_words=10: does NOT shrink word_index (all words are still listed);
#   it only limits which indices texts_to_sequences emits. In the outputs of
#   this notebook, words with index >= 10 ('your', 'is', 'nice') come out as
#   the OOV index 1, while 'think' (index 9) is kept.
tokenizer = Tokenizer(num_words=10, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

In [4]:
# Word -> integer index mapping learned during fitting. As the output shows,
# words are lowercased, punctuation is gone, "<OOV>" sits at index 1, and
# words with higher counts receive lower indices.
tokenizer.word_index

{'<OOV>': 1,
 'love': 2,
 'laptop': 3,
 'i': 4,
 'my': 5,
 'you': 6,
 'family': 7,
 'do': 8,
 'think': 9,
 'your': 10,
 'is': 11,
 'nice': 12}

In [5]:
# Total number of occurrences of each word across the corpus, listed in the
# order the words were first encountered.
tokenizer.word_counts

OrderedDict([('i', 2),
             ('love', 3),
             ('my', 2),
             ('family', 1),
             ('laptop', 3),
             ('you', 2),
             ('do', 1),
             ('think', 1),
             ('your', 1),
             ('is', 1),
             ('nice', 1)])

In [6]:
# Number of sentences (documents) in which each word appears. For this corpus
# the values match word_counts because no word repeats within one sentence.
tokenizer.word_docs

defaultdict(int,
            {'love': 3,
             'family': 1,
             'my': 2,
             'i': 2,
             'laptop': 3,
             'you': 2,
             'think': 1,
             'nice': 1,
             'is': 1,
             'do': 1,
             'your': 1})

In [7]:
# Encode every sentence as a list of word indices (lists have different
# lengths). In the last sentence 'your', 'is' and 'nice' show up as 1 (OOV)
# because their indices are not below num_words.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[4, 2, 5, 7], [4, 2, 5, 3], [6, 2, 3], [8, 6, 9, 1, 3, 1, 1]]

In [8]:
# Bring every sequence to exactly 5 tokens. The library defaults are spelled
# out so the behaviour is visible in the call: zeros are added at the FRONT of
# short sequences, and long sequences lose tokens from the FRONT (only the
# last 5 tokens of the 7-token sentence survive).
padded = pad_sequences(sequences, maxlen=5, padding="pre", truncating="pre")
padded

array([[0, 4, 2, 5, 7],
       [0, 4, 2, 5, 3],
       [0, 0, 6, 2, 3],
       [9, 1, 3, 1, 1]])

In [9]:
# Same target length, but zeros are now appended at the END of short
# sequences. Truncation is left at its default, written out explicitly here:
# over-long sequences are still cut from the FRONT.
padded = pad_sequences(sequences, maxlen=5, padding="post", truncating="pre")
padded

array([[4, 2, 5, 7, 0],
       [4, 2, 5, 3, 0],
       [6, 2, 3, 0, 0],
       [9, 1, 3, 1, 1]])

In [10]:
# Pad at the end AND truncate at the end: the 7-token sentence now keeps its
# first 5 tokens instead of its last 5.
padded = pad_sequences(sequences, maxlen=5, padding="post", truncating="post")
padded

array([[4, 2, 5, 7, 0],
       [4, 2, 5, 3, 0],
       [6, 2, 3, 0, 0],
       [8, 6, 9, 1, 3]])

In [11]:
# Recompute with default padding/truncating and check the result's shape:
# one row per sentence, maxlen columns -> (4, 5) for this corpus.
padded = pad_sequences(sequences, maxlen=5)
padded.shape

(4, 5)

In [12]:
# Unseen sentences for checking out-of-vocabulary handling: "really" and "Hi"
# do not occur in the training corpus above.
test_data = [
    "I really love my laptop",
    "Hi"
]

In [13]:
# Encode the unseen sentences with the already-fitted tokenizer (no refit).
# Words that were not in the training corpus are emitted as index 1 (<OOV>).
test_seq = tokenizer.texts_to_sequences(test_data)
test_seq

[[4, 1, 2, 5, 3], [1]]