In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Small corpus that the tokenizer vocabulary is fitted on.
sentences = [
    'I love my dog',
    'I love my cat',
    'I am going to move to SF',
]

In [None]:
# oov_token is the placeholder for out-of-vocabulary words: any word that was
# not seen by fit_on_texts is encoded with the '<OOV>' index instead of its own.
# It is not about special characters (per the Keras docs, punctuation is
# stripped by the Tokenizer's separate `filters` argument).
# num_words = 100 caps how many of the most frequent words are used when
# converting text to sequences (see the Keras Tokenizer documentation).
tokenizer = Tokenizer(num_words = 100, oov_token = '<OOV>')

#### Words are represented as numbers called tokens

In [None]:
# Build the vocabulary from the training sentences.
tokenizer.fit_on_texts(sentences)
# word_index maps each (lower-cased) word to its integer token; the '<OOV>'
# placeholder takes index 1, as the printed dictionary shows.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'i': 2, 'love': 3, 'my': 4, 'to': 5, 'dog': 6, 'cat': 7, 'am': 8, 'going': 9, 'move': 10, 'sf': 11}


#### Sequences of numbers from sentences

In [None]:
# Encode every sentence as the list of integer tokens of its words,
# looked up in the vocabulary fitted above.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[2, 3, 4, 6], [2, 3, 4, 7], [2, 8, 9, 5, 10, 5, 11]]


#### Test data may contain words that were not seen while fitting the tokenizer. The `oov_token` (out-of-vocabulary token) handles this by mapping every unknown word to a single placeholder index.

In [None]:
# "Test" sentences: the first one contains words ('for', 'a', 'walk', 'with')
# that are absent from the vocabulary fitted on `sentences`.
sentences1 = [
    'I love going for a walk with my dog',
    'I love my cat',
    'I am going to move to SF',
]

In [None]:
# Encode the new sentences with the already-fitted tokenizer (no refit).
# Words missing from the vocabulary come out as 1, the '<OOV>' index.
sequences1 = tokenizer.texts_to_sequences(sentences1)
print(sequences1)

[[2, 3, 9, 1, 1, 1, 1, 4, 6], [2, 3, 4, 7], [2, 8, 9, 5, 10, 5, 11]]


#### To handle sentences of different lengths, use pad_sequences (it pads the sequences to the same length by adding 0s at the beginning or the end)

In [None]:
# padding = 'post' appends the zeros after the tokens. With no maxlen given,
# every row is padded to the length of the longest sequence (9 here).
padded = pad_sequences(sequences1,padding = 'post')
print(padded)

[[ 2  3  9  1  1  1  1  4  6]
 [ 2  3  4  7  0  0  0  0  0]
 [ 2  8  9  5 10  5 11  0  0]]


In [None]:
# maxlen = 5 forces every row to exactly 5 tokens. Because `truncating` is not
# set here, longer sequences lose tokens from the FRONT (only the last 5 are
# kept), while shorter ones are still zero-padded at the end.
padded2 = pad_sequences(sequences1,padding = 'post',maxlen = 5)
print(padded2)

[[ 1  1  1  4  6]
 [ 2  3  4  7  0]
 [ 9  5 10  5 11]]


In [None]:
# truncating='post' cuts over-long sequences at the END instead, so the first
# 5 tokens of each sentence are kept; padding still goes at the end.
padded3 = pad_sequences(sequences1,padding = 'post',maxlen = 5, truncating='post')
print(padded3)

[[ 2  3  9  1  1]
 [ 2  3  4  7  0]
 [ 2  8  9  5 10]]


#### Classifier to identify sentiment in the text