<a href="https://colab.research.google.com/github/bhandary/tensorflow2/blob/main/nlp/first_sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Initial Tokenization

In [4]:
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny day?'
]

In [5]:
# Initial tokenization of the corpus, no OOV used
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
print(word_index)
print(sequences)

{'is': 1, 'day': 2, 'today': 3, 'a': 4, 'sunny': 5, 'rainy': 6, 'it': 7}
[[3, 1, 4, 5, 2], [3, 1, 4, 6, 2], [1, 7, 5, 2]]


# Exploring Test Data with unseen words

In [7]:
test_data = [
    'Today is a snowy day',
    'Will it be rainy tomorrow?'
]

In [8]:
test_sequences = tokenizer.texts_to_sequences(test_data)
print(word_index)
print(test_sequences)

{'is': 1, 'day': 2, 'today': 3, 'a': 4, 'sunny': 5, 'rainy': 6, 'it': 7}
[[3, 1, 4, 2], [7, 6]]


# Adding OOV to improve test data sequences

In [9]:
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

test_sequences = tokenizer.texts_to_sequences(test_data)
print(word_index)
print(test_sequences)

{'<OOV>': 1, 'is': 2, 'day': 3, 'today': 4, 'a': 5, 'sunny': 6, 'rainy': 7, 'it': 8}
[[4, 2, 5, 1, 3], [1, 8, 1, 7, 1]]


# Exploring Padding

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?'
    'I really enjoyed walking in the snow today'
]

In [12]:
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[2, 3, 4, 5, 6], [2, 3, 4, 7, 6], [3, 8, 5, 2, 9, 10, 11, 12, 13, 14, 15, 2]]


In [13]:
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  0  0  0  0  0  2  3  4  5  6]
 [ 0  0  0  0  0  0  0  2  3  4  7  6]
 [ 3  8  5  2  9 10 11 12 13 14 15  2]]


In [14]:
padded = pad_sequences(sequences, padding='post')
print(padded)

[[ 2  3  4  5  6  0  0  0  0  0  0  0]
 [ 2  3  4  7  6  0  0  0  0  0  0  0]
 [ 3  8  5  2  9 10 11 12 13 14 15  2]]


In [15]:
padded = pad_sequences(sequences, padding='post', maxlen=6)
print(padded)

[[ 2  3  4  5  6  0]
 [ 2  3  4  7  6  0]
 [11 12 13 14 15  2]]


In [16]:
padded = pad_sequences(sequences, padding='post', maxlen=6, truncating='post')
print(padded)



[[ 2  3  4  5  6  0]
 [ 2  3  4  7  6  0]
 [ 3  8  5  2  9 10]]
