In [None]:
# From Tensorflow Authors

# Simple sentence encoding
# Note: the tokenizer lowercases text and strips punctuation by default, so "I" is encoded as `i`.
# Do necessary imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [None]:
# Training corpus used to fit the tokenizer.
# Punctuation is filtered out before encoding, so "dog!" and "dog" end up as the same token.
sentences = [
    "I love my dog",
    "I love my cat",
    "you love my dog!",
    "Do you think my dog is amazing?",  # fixed typo: "si" -> "is"
]

In [None]:
# Create the tokenizer.
#   num_words : upper bound on how many of the most frequent words are used when
#               texts are later converted to sequences (a tunable hyperparameter).
#   oov_token : placeholder token substituted for any word that is not in the
#               learned vocabulary.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=100,
)

# Learn the vocabulary (word -> integer token) from the training sentences.
tokenizer.fit_on_texts(texts=sentences)

# Printing the tokenizer only shows the object's repr, not its vocabulary.
print(tokenizer)


In [None]:
# The tokenizer exposes its vocabulary as `word_index`: a dictionary whose keys
# are words and whose values are the integer tokens assigned to them.
word_index = tokenizer.word_index

# Show the whole vocabulary, then the token assigned to a single word.
print(word_index, word_index['my'], sep="\n")

In [None]:

# Encode every training sentence as a list of integer tokens.
# Each word is replaced by its token, e.g. "I love my dog" -> [x, y, z, w].
sequences = tokenizer.texts_to_sequences(texts=sentences)

# One integer list per input sentence.
print(sequences)

# Try the fitted tokenizer on sentences containing words it has never seen
# ("really", "loves", "manatee"). Without an OOV token such words would simply be
# dropped from the output; because an OOV token was configured, each unseen word
# is encoded with the OOV token's index instead.
test_data = [
    "I really love my dog",
    "my dog loves my manatee",
]
test_seq = tokenizer.texts_to_sequences(texts=test_data)
print(test_seq)

# Pad the encoded sentences into a matrix with one row per sentence.
#   padding='post'    : zeros are appended after the sentence (the default is 'pre').
#   maxlen=5          : every row is exactly 5 tokens long; without maxlen the row
#                       length is that of the longest sentence.
#   truncating='post' : sentences longer than maxlen lose words from the end.
#                       The default ('pre') would drop words from the beginning.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

# Summary of the vocabulary, the raw sequences, and the padded matrix.
print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:", padded, sep="\n")