In [26]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [27]:
# Tiny training corpus used to build the tokenizer vocabulary.
train_sentences = [
    "Today is a sunny day",
    "Today I go fishing.",
    "Yes, I can",
]

In [28]:
# Create a word-level tokenizer with a vocabulary cap of 100 words.
# The training sentences contain only 10 distinct words, so the cap
# presumably has no effect here -- confirm against the Keras docs.
tokenizer = Tokenizer(num_words=100)

In [29]:
# Build the tokenizer's vocabulary from the training sentences.
# This updates `tokenizer` in place; nothing is assigned from the call.
tokenizer.fit_on_texts(train_sentences)

In [30]:
# Mapping from each (lower-cased) word to its integer index, starting at 1.
word_index = tokenizer.word_index

In [31]:
# Display the learned word -> index mapping.
word_index

{'today': 1,
 'i': 2,
 'is': 3,
 'a': 4,
 'sunny': 5,
 'day': 6,
 'go': 7,
 'fishing': 8,
 'yes': 9,
 'can': 10}

In [32]:
# Number of times each word occurred in the fitted sentences
# (e.g. 'today' and 'i' each appear twice).
word_count = tokenizer.word_counts

In [33]:
# Display the per-word occurrence counts.
word_count

OrderedDict([('today', 2),
             ('is', 1),
             ('a', 1),
             ('sunny', 1),
             ('day', 1),
             ('i', 2),
             ('go', 1),
             ('fishing', 1),
             ('yes', 1),
             ('can', 1)])

In [34]:
# Encode every training sentence as a list of integer word indices,
# one inner list per sentence.
sequences = tokenizer.texts_to_sequences(train_sentences)

In [35]:
# Show the vocabulary next to the encoded training sentences for comparison.
print(word_index, sequences, sep="\n")

{'today': 1, 'i': 2, 'is': 3, 'a': 4, 'sunny': 5, 'day': 6, 'go': 7, 'fishing': 8, 'yes': 9, 'can': 10}
[[1, 3, 4, 5, 6], [1, 2, 7, 8], [9, 2, 10]]


#### Tokenizing new data using the same tokenizer

In [38]:
# Sentences not used for fitting; several of their words are absent from
# the training vocabulary.
new_sentences = [
    "Will it be raining Today ?",
    "Today is a pleasant day",
]

In [39]:
# Encode the unseen sentences with the tokenizer fitted on the training data.
# Words that were not in the training vocabulary are silently dropped, so the
# resulting sequences can be shorter than the sentences (the first one keeps
# only 'today').
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [40]:
# Show the raw sentences next to their encodings to make the dropped words visible.
print(new_sentences, new_sequences, sep="\n")


['Will it be raining Today ?', 'Today is a pleasant day']
[[1], [1, 3, 4, 6]]


#### Replacing newly encountered words with special values.

In [41]:
# Recreate the tokenizer with an out-of-vocabulary placeholder token.
# NOTE: this rebinds `tokenizer`; the earlier tokenizer without an OOV token
# is no longer reachable, so re-running earlier cells after this one will
# produce different encodings.
tokenizer = Tokenizer(num_words=100, oov_token= "<oov>")

In [42]:
# Fit the new OOV-aware tokenizer on the same training sentences as before.
tokenizer.fit_on_texts(train_sentences)

In [43]:
# Word -> index mapping of the OOV-aware tokenizer. '<oov>' takes index 1,
# so every real word is shifted up by one compared with the earlier mapping.
# NOTE: this rebinds `word_index` from the first half of the notebook.
word_index = tokenizer.word_index

In [44]:
# Re-encode the unseen sentences. Unknown words are now kept and mapped to the
# '<oov>' index (1) instead of being dropped.
new_sequences = tokenizer.texts_to_sequences(new_sentences)

#### Words that were never encountered during fitting are assigned a special value (1 in this case)

In [47]:
# Show the OOV-aware vocabulary next to the re-encoded unseen sentences.
print(word_index, new_sequences, sep="\n")

{'<oov>': 1, 'today': 2, 'i': 3, 'is': 4, 'a': 5, 'sunny': 6, 'day': 7, 'go': 8, 'fishing': 9, 'yes': 10, 'can': 11}
[[1, 1, 1, 1, 2], [2, 4, 5, 1, 7]]
