<a href="https://colab.research.google.com/github/ankitrgupta1/Deep-Learning/blob/master/NLP_Tensorflow_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Natural Language Processing With TensorFlow**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Small toy corpus: mixed casing and punctuation on purpose, so the
# tokenizer's normalisation is visible in the cells below.
data = [
    'i love my india',
    'I love USA',
    'I, love both India and usa!',
]

In [None]:
# Instantiate a Tokenizer with num_words=100
tokenizer = Tokenizer(num_words=100)

# Fit the tokenizer on the text data

tokenizer.fit_on_texts(data)

# Retrieve the word index built from the data

word_index = tokenizer.word_index

# Print the word index to see the index assigned to each word of the text processed by the Tokenizer.
# Note: look closely and you will find that the word index does not contain the punctuation.
# Also, the tokenizer lowercased all the words before indexing them. Hence 'I' and 'i' are both indexed as 'i'.
print(word_index)

{'i': 1, 'love': 2, 'india': 3, 'usa': 4, 'my': 5, 'both': 6, 'and': 7}


In [None]:
# Generate the sequences corresponding to the text sentences in data

sequences = tokenizer.texts_to_sequences(data)

# Print the sequences formed

# Notice that the words are now replaced with their corresponding indexes in the sequence
print(sequences)


[[1, 2, 5, 3], [1, 2, 4], [1, 2, 6, 3, 7, 4]]


In [None]:
# Let's check what happens if we generate sequences for our test data using the tokenizer fitted on our data

test_data = ['I really love India', 'Everyone love usa and india as well']

# Note that these sentences contain words which were not present earlier in our data

test_sequence = tokenizer.texts_to_sequences(test_data)

# Print the test sequences

print(test_sequence)

# Notice that the new sequences are missing the indexes corresponding to the new words which were not present in our original data

[[1, 2, 3], [2, 4, 7, 3]]


In [None]:
# Using the 'out of vocabulary' (OOV) token for the missing words

# Instantiate a Tokenizer with num_words=100 and 'oov' as the out-of-vocabulary token
oov_tokenizer = Tokenizer(num_words=100, oov_token='oov')

# Fit the OOV-aware tokenizer on the text data

oov_tokenizer.fit_on_texts(data)

# Retrieve the word index built from the data (this rebinds word_index from the earlier cell)

word_index = oov_tokenizer.word_index

# Print the word index to see the index assigned to each word of the text processed by the Tokenizer.
# Note: look closely and you will find that the word index does not contain the punctuation.
# Also, the tokenizer lowercased all the words before indexing them. Hence 'I' and 'i' are both indexed as 'i'.
print(word_index)

oov_sequences = oov_tokenizer.texts_to_sequences(data)

# Print the sequences formed

# Notice that the words are now replaced with their corresponding indexes in the sequence
print(oov_sequences)

# Let's check what happens if we generate sequences for our test data using the tokenizer fitted on our data

test_data = ['I really love India', 'Everyone love usa and india as well']

# Note that these sentences contain words which were not present earlier in our data

oov_test_sequence = oov_tokenizer.texts_to_sequences(test_data)

# Print the test sequences

print(oov_test_sequence)

# Notice that the oov token is assigned index 1,
# and that the missing words are now represented by the oov index (i.e., 1) instead of being dropped

{'oov': 1, 'i': 2, 'love': 3, 'india': 4, 'usa': 5, 'my': 6, 'both': 7, 'and': 8}
[[2, 3, 6, 4], [2, 3, 5], [2, 3, 7, 4, 8, 5]]
[[2, 1, 3, 4], [1, 3, 5, 8, 4, 1, 1]]


In [None]:
# Pad the sequences so that they all have the same length for our network

padded_sequence = pad_sequences(oov_sequences)

print('padded_sequences = \n',padded_sequence)

# If you want to pad the sequences at the end, pass padding='post' as shown below.
# If you want to control the length of the sequences, use the maxlen parameter.
# Note that a sequence longer than maxlen has its values dropped from the start.
# You can add the parameter truncating='post' to drop values from the back instead.
print('==============================')
post_pad_sequences = pad_sequences(oov_sequences, padding='post', maxlen=5)
print('post_pad_sequences = \n',post_pad_sequences)

padded_sequences = 
 [[0 0 2 3 6 4]
 [0 0 0 2 3 5]
 [2 3 7 4 8 5]]
post_pad_sequences = 
 [[2 3 6 4 0]
 [2 3 5 0 0]
 [3 7 4 8 5]]
