In [2]:
# Importing TensorFlow APIs
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

## Define training sentences

In [3]:
# Toy corpus the tokenizer is fitted on: five short weather sentences
# with mixed capitalisation and some punctuation.
train_sentences = [
    "It is a sunny day",
    "It is not so sunny, but rainy instead",
    "It is still raining, but not so sunny anymore.",
    "Will it rain today?",
    "It is a cloudy day",
]

## Configure the tokenizer

In [4]:
# Build the tokenizer; num_words caps how many of the most frequent words
# are used when texts are later converted to sequences
tokenizer = Tokenizer(num_words=200)

# Learn the vocabulary from the training sentences
tokenizer.fit_on_texts(texts=train_sentences)

# Keep the learned word -> integer index mapping (the word_index attribute)
word_index = tokenizer.word_index

In [5]:
# Preview the word index: every word learned from train_sentences mapped to its integer id
print(word_index)

{'it': 1, 'is': 2, 'sunny': 3, 'a': 4, 'day': 5, 'not': 6, 'so': 7, 'but': 8, 'rainy': 9, 'instead': 10, 'still': 11, 'raining': 12, 'anymore': 13, 'will': 14, 'rain': 15, 'today': 16, 'cloudy': 17}


Notice that the word index ignores punctuation and stores every word in lowercase.

## Create sequences

In [6]:
# Encode each training sentence as a list of word indices
# using the tokenizer's texts_to_sequences method
sequences = tokenizer.texts_to_sequences(texts=train_sentences)

In [7]:
# Show the vocabulary (word index) and the encoded training sentences, one per line
print(
    f"Word index -->{word_index}",
    f"Sequences of words -->{sequences}",
    sep="\n",
)

Word index -->{'it': 1, 'is': 2, 'sunny': 3, 'a': 4, 'day': 5, 'not': 6, 'so': 7, 'but': 8, 'rainy': 9, 'instead': 10, 'still': 11, 'raining': 12, 'anymore': 13, 'will': 14, 'rain': 15, 'today': 16, 'cloudy': 17}
Sequences of words -->[[1, 2, 4, 3, 5], [1, 2, 6, 7, 3, 8, 9, 10], [1, 2, 11, 12, 8, 6, 7, 3, 13], [14, 1, 15, 16], [1, 2, 4, 17, 5]]


Let's look at a sample sentence from the training data along with its corresponding sequence.

In [8]:
# Show the first training sentence followed by its encoded sequence
print(train_sentences[0], sequences[0], sep="\n")

It is a sunny day
[1, 2, 4, 3, 5]


## Tokenizing new data using the same tokenizer

In [9]:
# Sentences the tokenizer was not fitted on; "be" and "pleasant"
# do not occur anywhere in train_sentences
new_sentences = [
    "Will it be raining today?",
    "It is a pleasant day.",
]

In [10]:
# Encode the new sentences with the tokenizer fitted on train_sentences;
# words it has never seen are left out of the resulting sequences
new_sequences = tokenizer.texts_to_sequences(texts=new_sentences)

In [11]:
# Show the raw new sentences followed by their encoded sequences
print(new_sentences, new_sequences, sep="\n")

['Will it be raining today?', 'It is a pleasant day.']
[[14, 1, 12, 16], [1, 2, 4, 5]]


## Replacing newly encountered words with special values.

In [12]:
# Re-create the tokenizer, this time with an out-of-vocabulary token (oov_token).
# Note: this rebinds `tokenizer`, replacing the instance created earlier.
tokenizer = Tokenizer(num_words=100, oov_token="<oov>")

# Fit the new tokenizer on the same training sentences
tokenizer.fit_on_texts(texts=train_sentences)

# Rebind word_index to the new mapping, which now also contains the OOV token
word_index = tokenizer.word_index

In [13]:
# Encode the new sentences with the OOV-aware tokenizer, then show the
# word index followed by the resulting sequences
new_sequences = tokenizer.texts_to_sequences(texts=new_sentences)
print(word_index, new_sequences, sep="\n")

{'<oov>': 1, 'it': 2, 'is': 3, 'sunny': 4, 'a': 5, 'day': 6, 'not': 7, 'so': 8, 'but': 9, 'rainy': 10, 'instead': 11, 'still': 12, 'raining': 13, 'anymore': 14, 'will': 15, 'rain': 16, 'today': 17, 'cloudy': 18}
[[15, 2, 1, 13, 17], [2, 3, 5, 1, 6]]


###### ============ 

## Other practice

In [28]:
# Quick example from the tutorial: report how many GPUs TensorFlow can see,
# then list the local devices.
# NOTE(review): tensorflow.python.* is not part of TensorFlow's public API —
# consider the public tf.config.list_physical_devices() instead. This import
# would also normally live in the imports cell at the top of the notebook.
from tensorflow.python.client import device_lib
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Last expression in the cell, so the returned device list is shown as the cell output
device_lib.list_local_devices()

Num GPUs Available:  0


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 15200346050800325618
 xla_global_id: -1]