## Getting started with tokenization

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: two sentences that differ in a single word.
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
]

# Fit the tokenizer on the corpus so it learns a vocabulary.
# num_words=100 is the vocabulary cap; this corpus has far fewer distinct words.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Mapping from each lower-cased word to its integer index.
word_index = tokenizer.word_index

In [3]:
word_index

{'today': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'rainy': 6}

## Turning sentences into sequences

In [5]:
# Extend the corpus with a third sentence that adds a new word ('it')
# and ends with punctuation.
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
]

# A fresh tokenizer is fitted so the vocabulary reflects the enlarged corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Encode each sentence as a list of word indices, one list per sentence.
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index

In [6]:
sequences

[[1, 2, 3, 4, 5], [1, 2, 3, 6, 5], [2, 7, 4, 1]]

##### Out-of-vocabulary tokens

In [7]:
# Held-out sentences containing words the tokenizer was never fitted on
# ('snowy', 'will', 'be', 'tomorrow').
test_data = [
    'Today is a snowy day',
    'Will it be rainy tomorrow?',
]

# This tokenizer has no OOV token, so unknown words are silently dropped
# and the encoded sequences come out shorter than the sentences.
test_sequences = tokenizer.texts_to_sequences(test_data)

In [8]:
test_sequences

[[1, 2, 3, 5], [7, 6]]

In [9]:
# Re-fit on the current `sentences`, this time reserving a placeholder token
# ('<oov>', which receives index 1) for words that were not seen during fitting.
tokenizer = Tokenizer(oov_token='<oov>', num_words=100)
tokenizer.fit_on_texts(sentences)

# Encode both the fitted sentences and the held-out test sentences;
# unseen words now map to the '<oov>' index instead of disappearing.
sequences = tokenizer.texts_to_sequences(sentences)
test_sequences = tokenizer.texts_to_sequences(test_data)

word_index = tokenizer.word_index

In [10]:
word_index

{'<oov>': 1,
 'today': 2,
 'is': 3,
 'a': 4,
 'sunny': 5,
 'day': 6,
 'rainy': 7,
 'it': 8}

In [11]:
test_sequences

[[2, 3, 4, 1, 6], [1, 8, 1, 7, 1]]

##### Padding

In [12]:
# Corpus with a noticeably longer fourth sentence, so the encoded
# sequences end up with unequal lengths (needed to demonstrate padding).
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today',
]

# Fit a tokenizer with an OOV placeholder on the enlarged corpus.
tokenizer = Tokenizer(oov_token='<oov>', num_words=100)
tokenizer.fit_on_texts(sentences)

# One list of word indices per sentence; lengths differ between sentences.
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index

In [13]:
sequences

[[2, 3, 4, 5, 6],
 [2, 3, 4, 7, 6],
 [3, 8, 5, 2],
 [9, 10, 11, 12, 13, 14, 15, 2]]

In [15]:
# Pad every sequence to the length of the longest one (8 tokens here).
# With no extra arguments the zeros are inserted at the front of each row.
padded = pad_sequences(sequences)

In [16]:
padded

array([[ 0,  0,  0,  2,  3,  4,  5,  6],
       [ 0,  0,  0,  2,  3,  4,  7,  6],
       [ 0,  0,  0,  0,  3,  8,  5,  2],
       [ 9, 10, 11, 12, 13, 14, 15,  2]])

In [17]:
# padding='post' appends the zeros after the tokens instead of before them.
padded = pad_sequences(sequences, padding='post')

In [18]:
padded

array([[ 2,  3,  4,  5,  6,  0,  0,  0],
       [ 2,  3,  4,  7,  6,  0,  0,  0],
       [ 3,  8,  5,  2,  0,  0,  0,  0],
       [ 9, 10, 11, 12, 13, 14, 15,  2]])

In [19]:
# maxlen=6 fixes the output width at 6 columns. Sequences longer than that are
# cut; with no `truncating` argument the cut removes tokens from the front,
# so the 8-token sentence keeps only its last 6 tokens.
padded = pad_sequences(sequences, padding='post', maxlen=6)

In [20]:
padded

array([[ 2,  3,  4,  5,  6,  0],
       [ 2,  3,  4,  7,  6,  0],
       [ 3,  8,  5,  2,  0,  0],
       [11, 12, 13, 14, 15,  2]])

In [21]:
# truncating='post' cuts over-long sequences at the end instead,
# so the 8-token sentence keeps its first 6 tokens.
padded = pad_sequences(sequences, padding='post', maxlen=6, truncating='post')

In [22]:
padded

array([[ 2,  3,  4,  5,  6,  0],
       [ 2,  3,  4,  7,  6,  0],
       [ 3,  8,  5,  2,  0,  0],
       [ 9, 10, 11, 12, 13, 14]])