In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus: four short sentences that share most of their words.
sentences = [
    'I love my cat',
    'I love my dog',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Build the vocabulary from the corpus. num_words=100 is well above the
# 10 distinct words that appear here, so the cap never comes into play.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)  # updates the tokenizer's internal vocabulary

# word_index maps each word to its integer token. As the printed dict shows,
# words are lowercased, punctuation is stripped, and the most frequent word
# ('my') receives the smallest index.
word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [2]:
# Re-fit the tokenizer with an out-of-vocabulary placeholder. Words that were
# never seen during fitting are then encoded as the '<OOV>' token instead of
# being left out, so every encoded sequence keeps one id per input word.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Unseen text: 'really', 'loves' and 'meat' are not in the fitted vocabulary.
test_data = [
    'I really love my dog!',
    'My dog loves meat',
]

# In the printed result the unknown words show up as id 1.
test_sequences = tokenizer.texts_to_sequences(test_data)
print(test_sequences)

[[5, 1, 3, 2, 4], [2, 4, 1, 1]]


In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode the original corpus as lists of integer tokens (lengths differ per sentence).
sequences = tokenizer.texts_to_sequences(sentences)

# Pad every sequence to a common length so the result is a rectangular array.
# With the default arguments the zeros go at the front and the width equals
# the longest sequence, as the printed matrix shows.
# TODO: look into RaggedTensor.
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  0  5  3  2  7]
 [ 0  0  0  5  3  2  4]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


In [4]:
# padding='post' moves the zero padding to the end of each sequence instead
# of the front.
#
# Variant that also fixes the length and cuts longer sequences at the end:
#   padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=5)
padded = pad_sequences(sequences, padding='post')
print(padded)

[[ 5  3  2  7  0  0  0]
 [ 5  3  2  4  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]
