# Imports

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sentence

In [4]:
sentences = [
    'My favorite food is ice cream',
    'do you like ice cream too?',
    'My dog likes ice cream!',
    "your favorite flavor of icecream is chocolate",
    "chocolate isn't good for dogs",
    "your dog, your cat, and your parrot prefer broccoli"
]
print(sentences)

['My favorite food is ice cream', 'do you like ice cream too?', 'My dog likes ice cream!', 'your favorite flavor of icecream is chocolate', "chocolate isn't good for dogs", 'your dog, your cat, and your parrot prefer broccoli']


# Create the tokenizer and define an out of vocabulary token

In [5]:
tokenizer = Tokenizer(num_words=len(sentences),oov_token="<OOV>")

In [6]:
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'your': 2, 'ice': 3, 'cream': 4, 'my': 5, 'favorite': 6, 'is': 7, 'dog': 8, 'chocolate': 9, 'food': 10, 'do': 11, 'you': 12, 'like': 13, 'too': 14, 'likes': 15, 'flavor': 16, 'of': 17, 'icecream': 18, "isn't": 19, 'good': 20, 'for': 21, 'dogs': 22, 'cat': 23, 'and': 24, 'parrot': 25, 'prefer': 26, 'broccoli': 27}


In [8]:
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 1, 1, 1, 3, 4], [1, 1, 1, 3, 4, 1], [5, 1, 1, 3, 4], [2, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [2, 1, 2, 1, 1, 2, 1, 1, 1]]


# Add padding

In [10]:
padded = pad_sequences(sequences)
print("\n Word Index = ",word_index)
print("\n Sequences =",sequences)
print("\n Padded Sequences: \n",padded)


 Word Index =  {'<OOV>': 1, 'your': 2, 'ice': 3, 'cream': 4, 'my': 5, 'favorite': 6, 'is': 7, 'dog': 8, 'chocolate': 9, 'food': 10, 'do': 11, 'you': 12, 'like': 13, 'too': 14, 'likes': 15, 'flavor': 16, 'of': 17, 'icecream': 18, "isn't": 19, 'good': 20, 'for': 21, 'dogs': 22, 'cat': 23, 'and': 24, 'parrot': 25, 'prefer': 26, 'broccoli': 27}

 Sequences = [[5, 1, 1, 1, 3, 4], [1, 1, 1, 3, 4, 1], [5, 1, 1, 3, 4], [2, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [2, 1, 2, 1, 1, 2, 1, 1, 1]]

 Padded Sequences:
 [[0 0 0 5 1 1 1 3 4]
 [0 0 0 1 1 1 3 4 1]
 [0 0 0 0 5 1 1 3 4]
 [0 0 2 1 1 1 1 1 1]
 [0 0 0 0 1 1 1 1 1]
 [2 1 2 1 1 2 1 1 1]]


In [11]:
# Specify a max lenght for the padded sequene
padded = pad_sequences(sequences,maxlen=15)
print(padded)

[[0 0 0 0 0 0 0 0 0 5 1 1 1 3 4]
 [0 0 0 0 0 0 0 0 0 1 1 1 3 4 1]
 [0 0 0 0 0 0 0 0 0 0 5 1 1 3 4]
 [0 0 0 0 0 0 0 0 2 1 1 1 1 1 1]
 [0 0 0 0 0 0 0 0 0 0 1 1 1 1 1]
 [0 0 0 0 0 0 2 1 2 1 1 2 1 1 1]]


In [12]:
# Put the padding at the end of the sequences
padded = pad_sequences(sequences,maxlen=15,padding="post")
print(padded)

[[5 1 1 1 3 4 0 0 0 0 0 0 0 0 0]
 [1 1 1 3 4 1 0 0 0 0 0 0 0 0 0]
 [5 1 1 3 4 0 0 0 0 0 0 0 0 0 0]
 [2 1 1 1 1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [2 1 2 1 1 2 1 1 1 0 0 0 0 0 0]]


In [14]:
# Limit the lenght of the sequences , you will see some sequences get truncated
padded = pad_sequences(sequences,maxlen=3)
print(padded)

[[1 3 4]
 [3 4 1]
 [1 3 4]
 [1 1 1]
 [1 1 1]
 [1 1 1]]


In [15]:
# Try turning sentences that contain words that
# aren't in the word index into sequences.
# Add your own senteces to the test_data
test_data = [
    "my best friend's favorite ice cream flavor is strawberry",
    "my dog's best friend is a manatee"
]
print(test_data)
# Remind ourselves which number corresponds to the
# out of vocabulary token in the word index
print("<OOV> has the number", word_index['<OOV>'], "in the word index.")

# Convert the test sentences to sequences
test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

# Pad the new sequences
padded = pad_sequences(test_seq, maxlen=10)
print("\nPadded Test Sequence: ")

# Notice that "1" appears in the sequence wherever there's a word 
# that's not in the word index
print(padded)

["my best friend's favorite ice cream flavor is strawberry", "my dog's best friend is a manatee"]
<OOV> has the number 1 in the word index.

Test Sequence =  [[5, 1, 1, 1, 3, 4, 1, 1, 1], [5, 1, 1, 1, 1, 1, 1]]

Padded Test Sequence: 
[[0 5 1 1 1 3 4 1 1 1]
 [0 0 0 5 1 1 1 1 1 1]]
