In [1]:
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Small training corpus; the tokenizer's vocabulary is fitted on these sentences.
sentences = [
    'Its a sunny day today',
    'My favourite food is biryani',
    'I like chocolate ice cream',
    'I like dark chocolate',
    'My car is jet black Tesla',
    'Your car is royal blue Tesla',
]

In [3]:
# Build the word -> index vocabulary from the training corpus.
# - oov_token: words not seen during fitting are encoded as '<OOV>'.
# - num_words: upper bound on how many of the most frequent words are used
#   when encoding text; 100 is well above this corpus' vocabulary size, so
#   nothing is cut off here.
# NOTE(review): recent TensorFlow docs mark Tokenizer as deprecated in favour
# of tf.keras.layers.TextVectorization — confirm before reusing in new code.
tokenizer = Tokenizer(
    oov_token='<OOV>',
    num_words=100,
)
tokenizer.fit_on_texts(sentences)

In [4]:
# Inspect the learned vocabulary: words are lowercased, index 1 is the OOV
# token, and more frequent words receive smaller indices.
tokenizer.word_index

{'<OOV>': 1,
 'is': 2,
 'my': 3,
 'i': 4,
 'like': 5,
 'chocolate': 6,
 'car': 7,
 'tesla': 8,
 'its': 9,
 'a': 10,
 'sunny': 11,
 'day': 12,
 'today': 13,
 'favourite': 14,
 'food': 15,
 'biryani': 16,
 'ice': 17,
 'cream': 18,
 'dark': 19,
 'jet': 20,
 'black': 21,
 'your': 22,
 'royal': 23,
 'blue': 24}

In [5]:
# Look up a single word's index; keys are stored lowercase ('tesla', not 'Tesla').
tokenizer.word_index['tesla']

8

In [6]:
## Creating Sequences

In [7]:
# Encode every sentence as a list of word indices taken from word_index
# (one list per sentence, so lengths differ between sentences).
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[9, 10, 11, 12, 13],
 [3, 14, 15, 2, 16],
 [4, 5, 6, 17, 18],
 [4, 5, 19, 6],
 [3, 7, 2, 20, 21, 8],
 [22, 7, 2, 23, 24, 8]]

In [8]:
## Out-of-vocabulary (OOV) words

In [9]:
# Unseen sentences: most of their words are absent from the fitted vocabulary.
sentences_2 = [
    'What is your name?',
    'I adore cats',
    'I love programming',
]

In [10]:
# Encode the unseen sentences with the already-fitted tokenizer; every word
# missing from the vocabulary is mapped to index 1 (the '<OOV>' token).
sequences_2 = tokenizer.texts_to_sequences(sentences_2)
sequences_2

[[1, 2, 22, 1], [4, 1, 1], [4, 1, 1]]

In [11]:
# Decode back to text: unknown words come back as '<OOV>', so the original
# words cannot be recovered from the encoded sequence.
tokenizer.sequences_to_texts(sequences_2)

['<OOV> is your <OOV>', 'i <OOV> <OOV>', 'i <OOV> <OOV>']

In [12]:
# Round-trip the training sentences: every word is restored, but lowercased.
tokenizer.sequences_to_texts(sequences)

['its a sunny day today',
 'my favourite food is biryani',
 'i like chocolate ice cream',
 'i like dark chocolate',
 'my car is jet black tesla',
 'your car is royal blue tesla']

In [13]:
## Padding sequences to a common length

In [14]:
# Pad all sequences to a common length. With the default arguments, zeros are
# added at the front and the target length is that of the longest sequence.
padded = pad_sequences(sequences)

In [15]:
# Show the padded result: one row per sentence, all rows the same width.
padded

array([[ 0,  9, 10, 11, 12, 13],
       [ 0,  3, 14, 15,  2, 16],
       [ 0,  4,  5,  6, 17, 18],
       [ 0,  0,  4,  5, 19,  6],
       [ 3,  7,  2, 20, 21,  8],
       [22,  7,  2, 23, 24,  8]], dtype=int32)

In [16]:
# The original list is unchanged: pad_sequences returned a new array rather
# than modifying `sequences` in place.
sequences

[[9, 10, 11, 12, 13],
 [3, 14, 15, 2, 16],
 [4, 5, 6, 17, 18],
 [4, 5, 19, 6],
 [3, 7, 2, 20, 21, 8],
 [22, 7, 2, 23, 24, 8]]

In [17]:
# Word indices start at 1, so 0 never refers to a word and can safely be used
# as the padding value.
tokenizer.word_index

{'<OOV>': 1,
 'is': 2,
 'my': 3,
 'i': 4,
 'like': 5,
 'chocolate': 6,
 'car': 7,
 'tesla': 8,
 'its': 9,
 'a': 10,
 'sunny': 11,
 'day': 12,
 'today': 13,
 'favourite': 14,
 'food': 15,
 'biryani': 16,
 'ice': 17,
 'cream': 18,
 'dark': 19,
 'jet': 20,
 'black': 21,
 'your': 22,
 'royal': 23,
 'blue': 24}

In [None]:
## Specify the maximum length of the padded sequences

In [19]:
# maxlen=3 is shorter than every sequence, so each one is truncated. By default
# the leading tokens are dropped and only the last 3 are kept; pass
# truncating='post' to keep the first 3 instead.
pad_sequences(sequences, maxlen=3)

array([[11, 12, 13],
       [15,  2, 16],
       [ 6, 17, 18],
       [ 5, 19,  6],
       [20, 21,  8],
       [23, 24,  8]], dtype=int32)

In [20]:
# maxlen=15 is longer than every sequence, so each one is left-padded with
# zeros up to a width of 15.
pad_sequences(sequences, maxlen=15)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 10, 11, 12, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 14, 15,  2, 16],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  6, 17, 18],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5, 19,  6],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  7,  2, 20, 21,  8],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 22,  7,  2, 23, 24,  8]],
      dtype=int32)

In [None]:
## Specify padding at the end

In [21]:
# padding='post' appends the zeros after the tokens instead of before them.
pad_sequences(sequences, maxlen=15, padding='post')

array([[ 9, 10, 11, 12, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 14, 15,  2, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 4,  5,  6, 17, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 4,  5, 19,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  7,  2, 20, 21,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [22,  7,  2, 23, 24,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [None]:
## Padding sequences that contain OOV tokens

In [23]:
# Pad the OOV-containing sequences; the OOV index (1) stays distinct from the
# padding value (0), so unknown words and padding remain distinguishable.
pad_sequences(sequences_2, maxlen=10, padding='post')

array([[ 1,  2, 22,  1,  0,  0,  0,  0,  0,  0],
       [ 4,  1,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 4,  1,  1,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)