In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
# Toy corpus: four short sentences that share most of their words.
sentences = [
    "I love my dog",
    "I love my cat",
    "I love my dog!",
    "Do you think my dog amazing?",
]

# The vocabulary cap (100) is far larger than this corpus needs.
tokenizer = Tokenizer(num_words=100)

# Build the word -> integer index from the corpus.
tokenizer.fit_on_texts(sentences)

# Keys come out lower-cased and without punctuation ('dog!' and 'dog'
# share one entry), as the displayed mapping shows.
word_index = tokenizer.word_index
word_index


{'my': 1,
 'i': 2,
 'love': 3,
 'dog': 4,
 'cat': 5,
 'do': 6,
 'you': 7,
 'think': 8,
 'amazing': 9}

In [6]:
# Encode every sentence as a list of the integer ids from the fitted
# word index; the lists have as many entries as the sentence has words.
sequences = tokenizer.texts_to_sequences(
    sentences
)
sequences

[[2, 3, 1, 4], [2, 3, 1, 5], [2, 3, 1, 4], [6, 7, 8, 1, 4, 9]]

### Handling unknown words
To handle words that were not seen when the tokenizer was fitted, we can give the tokenizer an 'out of vocabulary' (OOV) token. Unseen words are then mapped to that token instead of being dropped, so the tokenized sequence keeps the same length as the original sentence.

In [9]:
# Sentences containing words the tokenizer was never fitted on
# ('really', 'like', 'name', 'is', 'rocky').
test_data = [
    "I really like my dog",
    "My dog name is rocky",
]

# This tokenizer has no OOV token, so unseen words are silently dropped
# and the resulting sequences are shorter than the sentences.
test_sequences = tokenizer.texts_to_sequences(test_data)

print("BEFORE USING OUT OF VOCABULORY: ")
print(test_sequences)

BEFORE USING OUT OF VOCABULORY: 
[[2, 1, 4], [1, 4]]


In [12]:
# Rebuild the tokenizer with an explicit out-of-vocabulary token. '<OOV>'
# gets its own entry in the word index (id 1 in the output below), so every
# other word's id shifts up by one compared with the earlier tokenizer.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)
print()
# This cell shows the result AFTER enabling the OOV token; the label
# previously said "BEFORE", copied from the earlier cell.
print("AFTER USING OUT OF VOCABULORY: ")
print()
test_data = [
    'I really like my dog',
    'My dog name is rocky'
]

# Unseen words are now encoded as the OOV id instead of being dropped, so
# each sequence has one entry per word of its sentence.
test_sequences = tokenizer.texts_to_sequences(test_data)
print()
print(test_sequences)

{'<OOV>': 1, 'my': 2, 'i': 3, 'love': 4, 'dog': 5, 'cat': 6, 'do': 7, 'you': 8, 'think': 9, 'amazing': 10}

BEFORE USING OUT OF VOCABULORY: 


[[3, 1, 1, 2, 5], [2, 5, 1, 1, 1]]


### Handling sentences of different lengths
* A neural network expects all of its input sequences to have the same length
* Sentences have different lengths, so we pad the shorter sequences with zeros
* Padding can be added at the start of a sequence (the default) or at the end

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad every sequence with leading zeros up to the length of the longest one.
# 'pre' is the default padding side; it is spelled out here for contrast with
# the 'post' examples.
padded = pad_sequences(
    sequences,
    padding='pre',
)
padded

array([[0, 0, 2, 3, 1, 4],
       [0, 0, 2, 3, 1, 5],
       [0, 0, 2, 3, 1, 4],
       [6, 7, 8, 1, 4, 9]])

* Example: padding added after the sentence (`padding='post'`)

In [15]:
# Same padding as before, but the zeros go at the end of each sequence.
padded = pad_sequences(
    sequences,
    padding="post",
)
padded

array([[2, 3, 1, 4, 0, 0],
       [2, 3, 1, 5, 0, 0],
       [2, 3, 1, 4, 0, 0],
       [6, 7, 8, 1, 4, 9]])

In [16]:
# Force every sequence to exactly 5 entries: shorter ones get trailing zeros,
# and the 6-token sentence loses its last token (truncating='post').
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding="post",
    truncating="post",
)
padded

array([[2, 3, 1, 4, 0],
       [2, 3, 1, 5, 0],
       [2, 3, 1, 4, 0],
       [6, 7, 8, 1, 4]])