# Data prepocessing

## Tokenize words using tensorflow's tools

In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [25]:
sentences = [
    'I love my dog',
    'I love my cat',
    'Do you think my dog is amazing ?'
]

In [26]:
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

print(word_index)

{'my': 1, 'i': 2, 'love': 3, 'dog': 4, 'cat': 5, 'do': 6, 'you': 7, 'think': 8, 'is': 9, 'amazing': 10}


## Sequencing - Turning sentences into data

In [27]:
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[2, 3, 1, 4], [2, 3, 1, 5], [6, 7, 8, 1, 4, 9, 10]]


In [28]:
test_data = [ # contains some new words to tokenizer
    'I really love my dog',
    'My dog likes my manatee'
]
test_seq = tokenizer.texts_to_sequences(test_data)

print(test_seq)

[[2, 3, 1, 4], [1, 4, 1]]


### Using an Out Of Vocabulary token in order to not to lose the length of the sequence

In [29]:
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
word_index = tokenizer.fit_on_texts(sentences)
test_seq = tokenizer.texts_to_sequences(test_data)

print(word_index)
print(test_seq)

None
[[3, 1, 4, 2, 5], [2, 5, 1, 2, 1]]


### Redefining data, make all sequences in an equal length

#### There is an advanced solution called "Ragged Tensor", I will keep it for later. For now we will use padding

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [31]:
padded = pad_sequences(sequences) # params :: padding='pre/post', truncate='pre/post', maxlen=int
print(padded)

[[ 0  0  0  2  3  1  4]
 [ 0  0  0  2  3  1  5]
 [ 6  7  8  1  4  9 10]]
