In [3]:
#Importing the libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
# Minimal two-sentence corpus; the sentences differ only in their last word.
sentences = ["I love my cat", "I love my dog"]

In [11]:
# Fit a simple tokenizer on the corpus and print the word -> integer index
# it learned. num_words=100 is well above the handful of distinct words here.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'cat': 4, 'dog': 5}


In [14]:
# Corpus mixing letter case ('i' / 'I') and punctuation ('dog!') to show that
# the tokenizer treats such variants as the same word.
sentences = [
    "i love my cat",
    "I love my dog",
    "You love my dog!",
]

In [15]:
# Refit a fresh tokenizer on the new corpus. The printed index holds one
# lowercase entry per word, with the trailing '!' stripped from 'dog!'.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [16]:
# Extend the corpus with a longer question that introduces several new words.
sentences = [
    "i love my cat",
    "I love my dog",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [17]:
# Rebuild the tokenizer so that the word index covers the extended corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [18]:
# Generate integer sequences for the sentences by looking each word up in the
# fitted tokenizer's word index.
sequences = tokenizer.texts_to_sequences(texts=sentences)

In [19]:
# Show the encoded sentences (one list of word ids per sentence) as the cell output.
sequences

[[4, 2, 1, 6], [4, 2, 1, 3], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

In [20]:
# Test sentences containing words that never occur in the training corpus
# ('really', 'loves', 'manatee').
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

In [21]:
# Without an OOV token the tokenizer cannot represent unseen words: they are
# silently dropped, so the encoded sequences come out shorter than the inputs.
test_sequences = tokenizer.texts_to_sequences(texts=test_data)
test_sequences

[[4, 2, 1, 3], [1, 3, 1]]

In [22]:
# Training corpus for the OOV-aware tokenizer built in the next cell.
sentences = [
    "i love my cat",
    "I love my dog",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [23]:
# Add an out-of-vocabulary (OOV) token to the tokenizer. It gets its own entry
# in the word index, so unseen words can be encoded instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [24]:
# Re-encode the test sentences with the OOV-aware tokenizer: every unseen word
# now maps to the '<OOV>' id, so each sequence keeps the input's word count.
test_sequences = tokenizer.texts_to_sequences(texts=test_data)
test_sequences

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]