# 1. Necessary Imports

In [1]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 2. Tokenization

In [2]:
# Toy training corpus: three short sentences used to build the vocabulary.
sentences = [
    "Life is so beautiful",
    "Hope keeps us going",
    "Let us celebrate life!",
]

# num_words=100 is well above the 11 entries this corpus yields, so no word
# is cut off here. Words not seen during fitting are encoded with the
# reserved "<OOV>" token instead of being silently dropped.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)

# word -> integer id mapping learned from the corpus. In the printed result
# the words are lower-cased, punctuation is stripped ("life!" -> "life") and
# "<OOV>" takes id 1.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'life': 2, 'us': 3, 'is': 4, 'so': 5, 'beautiful': 6, 'hope': 7, 'keeps': 8, 'going': 9, 'let': 10, 'celebrate': 11}


# 3. Converting text to Sequences

In [3]:
# Encode every training sentence as a list of word ids from the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)

# Bring all sequences to length 5. With the default settings the zeros are
# added at the front, as the printed array shows.
padded = pad_sequences(sequences, maxlen=5)

# Show the vocabulary, the raw id sequences, and the padded matrix.
print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:", padded, sep="\n")


Word Index =  {'<OOV>': 1, 'life': 2, 'us': 3, 'is': 4, 'so': 5, 'beautiful': 6, 'hope': 7, 'keeps': 8, 'going': 9, 'let': 10, 'celebrate': 11}

Sequences =  [[2, 4, 5, 6], [7, 8, 3, 9], [10, 3, 11, 2]]

Padded Sequences:
[[ 0  2  4  5  6]
 [ 0  7  8  3  9]
 [ 0 10  3 11  2]]


# 4. Trying out on test data

In [4]:
# Unseen sentences: most of these words never appeared in the training corpus.
test_data = [
    "Our life is to celebrate",
    "Hoping for the best!",
    "Let peace prevail everywhere",
]

# Reuse the already-fitted tokenizer (no re-fitting). Words outside the
# learned vocabulary come back as id 1, the "<OOV>" token; "Hoping" is not
# matched to "hope".
test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

# padding='post' appends the zeros after the tokens, up to length 10.
# NOTE(review): this rebinds `padded`, replacing the array built from the
# training sentences, and uses a different maxlen/padding side than that cell.
padded = pad_sequences(test_seq, padding="post", maxlen=10)
print("\nPadded Test Sequence: ", padded, sep="\n")


Test Sequence =  [[1, 2, 4, 1, 11], [1, 1, 1, 1], [10, 1, 1, 1]]

Padded Test Sequence: 
[[ 1  2  4  1 11  0  0  0  0  0]
 [ 1  1  1  1  0  0  0  0  0  0]
 [10  1  1  1  0  0  0  0  0  0]]
