### Encoding sentences into sequences of numbers

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# Toy training corpus: the tokenizer's vocabulary is built from these sentences.
train_sentences = [
    'It is a sunny day!',
    'It is a cloudy day',
    'Is it going to rain today?',
]

In [31]:
# Create a tokenizer; num_words=100 is the vocabulary-size limit passed to Keras
# (this toy corpus has far fewer distinct words than that).
tokenizer = Tokenizer(num_words=100)

In [32]:
# Build the word -> integer vocabulary from the training sentences.
tokenizer.fit_on_texts(train_sentences)

In [33]:
# Show the word -> integer mapping learned by the tokenizer
# (words are lower-cased and punctuation is stripped, as the output shows).
tokenizer.word_index

{'it': 1,
 'is': 2,
 'a': 3,
 'day': 4,
 'sunny': 5,
 'cloudy': 6,
 'going': 7,
 'to': 8,
 'rain': 9,
 'today': 10}

In [34]:
### The dictionary above is the word-to-integer encoding learned from the training sentences

In [36]:
# Convert each training sentence into a list of word indices.
seqs = tokenizer.texts_to_sequences(train_sentences)

In [37]:
# Display the encoded training sentences (one list of integers per sentence).
seqs

[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [2, 1, 7, 8, 9, 10]]

In [8]:
# Sentences containing words the tokenizer has not seen during fitting.
new_sentences = [
    'Hello, how are you?',
    'It is a pleasant day!',
    'I like to go for a walk while it rains!',
]

In [9]:
# Words that were not in the training sentences have no index, so they are
# silently dropped from the result (a sentence of only unknown words becomes []).
tokenizer.texts_to_sequences(new_sentences)

[[], [1, 2, 3, 4], [8, 3, 1]]

#### How to replace out-of-vocabulary (previously unseen) words with a special token

In [10]:
# With an oov_token, every word unseen during fitting is encoded as index 1
# instead of being dropped.
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<oov>',
)
tokenizer.fit_on_texts(train_sentences)

In [11]:
# Encode the same unseen sentences again; unknown words now show up as 1.
tokenizer.texts_to_sequences(new_sentences)

[[1, 1, 1, 1], [2, 3, 4, 1, 5], [1, 1, 9, 1, 1, 4, 1, 1, 2, 1]]

### Padding sentences 

In [21]:
# Sentences of different lengths, used to demonstrate padding.
more_sentences = [
    "Hello, how are you? I'm good!",
    'It is a pleasant day!',
    'I like to go for a walk while it rains!',
]

In [22]:
# Fit a fresh tokenizer on the new sentences; index 1 is reserved for '<oov>',
# which any word unseen during fitting would be encoded to.
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<oov>',
)
tokenizer.fit_on_texts(more_sentences)

# Keep a handle on the learned word -> integer mapping for inspection.
word_index = tokenizer.word_index

In [23]:
# Encode the sentences as lists of word indices (lengths differ per sentence).
more_sequences = tokenizer.texts_to_sequences(more_sentences)

In [24]:
# Print the vocabulary; '<oov>' occupies index 1, real words start at 2.
print(word_index)

{'<oov>': 1, 'it': 2, 'a': 3, 'hello': 4, 'how': 5, 'are': 6, 'you': 7, "i'm": 8, 'good': 9, 'is': 10, 'pleasant': 11, 'day': 12, 'i': 13, 'like': 14, 'to': 15, 'go': 16, 'for': 17, 'walk': 18, 'while': 19, 'rains': 20}


In [25]:
# With default arguments, shorter sequences are left-padded with zeros up to
# the length of the longest sequence (see the array displayed next).
padded_seq = pad_sequences(more_sequences)

In [26]:
# Display the padded result: one equal-length row per sentence.
padded_seq

array([[ 0,  0,  0,  0,  4,  5,  6,  7,  8,  9],
       [ 0,  0,  0,  0,  0,  2, 10,  3, 11, 12],
       [13, 14, 15, 16, 17,  3, 18, 19,  2, 20]])

In [27]:
# For comparison: the original, unpadded sequences.
more_sequences

[[4, 5, 6, 7, 8, 9],
 [2, 10, 3, 11, 12],
 [13, 14, 15, 16, 17, 3, 18, 19, 2, 20]]

#### Customize padded sequences

In [28]:
# Force every sequence to exactly 5 tokens: longer ones lose their tail
# (truncating='post'); padding='post' asks for any padding to go at the end
# (no sequence here is shorter than 5, so no zeros appear in the output).
padded_seqs = pad_sequences(
    more_sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)
                            

In [29]:
# Display the fixed-length (maxlen=5) sequences.
padded_seqs

array([[ 4,  5,  6,  7,  8],
       [ 2, 10,  3, 11, 12],
       [13, 14, 15, 16, 17]])