### Import the libraries

In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
# Toy corpus: four short English sentences; each string is treated as one text (document).
lines = [
    'It was a nice rainy day.',
    'The things are so beautiful in his point.',
    'When your focus is clear, you won.',
    'Many many happy returns of the day',
]

In [19]:
tokenizer = Tokenizer() # Keras Tokenizer object, created with its default settings
tokenizer.fit_on_texts(lines) # builds the vocabulary and the per-word statistics from `lines`

In [20]:
# word -> number of texts (documents) that contain the word.
# This is a document count, not an occurrence count: 'many' appears twice in the
# last line yet maps to 1. (tokenizer.word_counts holds the occurrence totals.)
tokenizer.word_docs

defaultdict(int,
            {'nice': 1,
             'was': 1,
             'rainy': 1,
             'it': 1,
             'day': 2,
             'a': 1,
             'in': 1,
             'so': 1,
             'his': 1,
             'beautiful': 1,
             'things': 1,
             'the': 2,
             'point': 1,
             'are': 1,
             'is': 1,
             'won': 1,
             'your': 1,
             'you': 1,
             'when': 1,
             'clear': 1,
             'focus': 1,
             'returns': 1,
             'of': 1,
             'happy': 1,
             'many': 1})

In [21]:
tokenizer.index_word # index -> word lookup; indices start at 1 (0 is not assigned to any word)

{1: 'day',
 2: 'the',
 3: 'many',
 4: 'it',
 5: 'was',
 6: 'a',
 7: 'nice',
 8: 'rainy',
 9: 'things',
 10: 'are',
 11: 'so',
 12: 'beautiful',
 13: 'in',
 14: 'his',
 15: 'point',
 16: 'when',
 17: 'your',
 18: 'focus',
 19: 'is',
 20: 'clear',
 21: 'you',
 22: 'won',
 23: 'happy',
 24: 'returns',
 25: 'of'}

In [22]:
tokenizer.index_docs # word index -> number of texts containing that word (word_docs keyed by index instead of by word)

defaultdict(int,
            {7: 1,
             5: 1,
             8: 1,
             4: 1,
             1: 2,
             6: 1,
             13: 1,
             11: 1,
             14: 1,
             12: 1,
             9: 1,
             2: 2,
             15: 1,
             10: 1,
             19: 1,
             22: 1,
             17: 1,
             21: 1,
             16: 1,
             20: 1,
             18: 1,
             24: 1,
             25: 1,
             23: 1,
             3: 1})

In [23]:
tokenizer.word_index # word -> index lookup (the inverse of index_word)

{'day': 1,
 'the': 2,
 'many': 3,
 'it': 4,
 'was': 5,
 'a': 6,
 'nice': 7,
 'rainy': 8,
 'things': 9,
 'are': 10,
 'so': 11,
 'beautiful': 12,
 'in': 13,
 'his': 14,
 'point': 15,
 'when': 16,
 'your': 17,
 'focus': 18,
 'is': 19,
 'clear': 20,
 'you': 21,
 'won': 22,
 'happy': 23,
 'returns': 24,
 'of': 25}

In [28]:
# One row per text, one column per word index; a 1 marks that the word occurs in that text.
# Comparable to one-hot (multi-hot) encoding of categorical data in a tabular dataset.
# Column 0 corresponds to no word, so it stays 0 in every row.
mats = tokenizer.texts_to_matrix(lines)
mats

array([[0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [29]:
mats.shape # 4 texts x 26 columns: the 25 vocabulary words (indices 1-25) plus the unused index 0

(4, 26)

In [30]:
# Sequences: each text becomes the list of its words' indices, in the original word order
seq = tokenizer.texts_to_sequences(lines)
seq

[[4, 5, 6, 7, 8, 1],
 [2, 9, 10, 11, 12, 13, 14, 15],
 [16, 17, 18, 19, 20, 21, 22],
 [3, 3, 23, 24, 25, 2, 1]]

In [31]:
# Pre-padding: brings every sequence to the same length (maxlen=10) by adding 0s at the start
padded = pad_sequences(seq, maxlen=10, padding='pre')
padded

array([[ 0,  0,  0,  0,  4,  5,  6,  7,  8,  1],
       [ 0,  0,  2,  9, 10, 11, 12, 13, 14, 15],
       [ 0,  0,  0, 16, 17, 18, 19, 20, 21, 22],
       [ 0,  0,  0,  3,  3, 23, 24, 25,  2,  1]], dtype=int32)

In [32]:
# Post-padding: brings every sequence to the same length (maxlen=10) by adding 0s at the end
# Note: this reassigns `padded`, replacing the value from the previous assignment.
padded = pad_sequences(seq, maxlen=10, padding='post')
padded

array([[ 4,  5,  6,  7,  8,  1,  0,  0,  0,  0],
       [ 2,  9, 10, 11, 12, 13, 14, 15,  0,  0],
       [16, 17, 18, 19, 20, 21, 22,  0,  0,  0],
       [ 3,  3, 23, 24, 25,  2,  1,  0,  0,  0]], dtype=int32)


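Both of the padding processes above produce equal-length sequences that still preserve word order; these fixed-length sequences can then be used as input when building a Continuous Bag of Words model.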