In [1]:
'''
This code will walk through the core steps: taking a small text corpus, automatically building a vocabulary from it,
and then using that vocabulary to convert text to numbers and back again.

How This Code Relates to the Tokenization Guide provided to you as a pdf

    Corpus: The corpus list is our small-scale version of the text data mentioned in Step 1.

    Strategy & Training: tf.keras.layers.TextVectorization acts as our word-based tokenizer. The .adapt(corpus) method performs Step 2 and 3,
     automatically learning the vocabulary from the data.

    Special Tokens: When you inspect the vocabulary, you'll see it automatically includes '' (for padding, which maps to 0) and [UNK]
    (for unknown words, which maps to 1). This corresponds to Step 4.

    Final Tokenizer: The text_vectorizer object is our finalized tokenizer, ready to perform the encoding and decoding tasks described in Step 5.

'''

"\nThis code will walk through the core steps: taking a small text corpus, automatically building a vocabulary from it,\nand then using that vocabulary to convert text to numbers and back again.\n\nHow This Code Relates to the Tokenization Guide provided to you as a pdf\n\n    Corpus: The corpus list is our small-scale version of the text data mentioned in Step 1.\n\n    Strategy & Training: tf.keras.layers.TextVectorization acts as our word-based tokenizer. The .adapt(corpus) method performs Step 2 and 3,\n     automatically learning the vocabulary from the data.\n\n    Special Tokens: When you inspect the vocabulary, you'll see it automatically includes '' (for padding, which maps to 0) and [UNK]\n    (for unknown words, which maps to 1). This corresponds to Step 4.\n\n    Final Tokenizer: The text_vectorizer object is our finalized tokenizer, ready to perform the encoding and decoding tasks described in Step 5.\n\n"

In [2]:
import tensorflow as tf
import numpy as np

# --- Step 1: Gather and Prepare a Corpus ---
# Three short sentences stand in for the thousands or millions of sentences
# a real dataset would provide.
corpus = [
    "the cat sat on the mat",
    "the dog ate my homework",
    "the cat and the dog are friends"
]

# --- Step 2 & 3: Choose a Strategy and Build the Vocabulary ---
# Strategy: simple word-based tokenization. The TextVectorization layer takes
# care of the whole vocabulary-building process for us.

# Upper bound on how many entries the vocabulary may hold; when the corpus has
# more distinct words than this, the layer keeps the most frequent ones.
vocab_size = 15

# The tokenizer itself. It normalizes the text (e.g. lowercasing), splits it
# into words, and pads or truncates every sentence to `output_sequence_length`.
text_vectorizer = tf.keras.layers.TextVectorization(
    output_sequence_length=8,
    max_tokens=vocab_size,
)

# "Training" the tokenizer: .adapt() scans the corpus, counts word frequencies
# and builds the word -> integer ID mapping.
print("--- Training the tokenizer... ---")
text_vectorizer.adapt(corpus)
print("Vocabulary built successfully!")
print(30 * "-")


# --- Step 4: Inspect the Vocabulary (and Special Tokens) ---
# get_vocabulary() returns the learned tokens as a list; a token's position in
# that list is its integer ID. Two special entries are added automatically:
# the empty string '' at ID 0 (padding) and [UNK] at ID 1 (out-of-vocabulary
# words).
vocabulary = text_vectorizer.get_vocabulary()
print(f"Vocabulary Size: {len(vocabulary)}")
print("Learned Vocabulary (Word -> ID):")
# Show at most the first 10 entries, one "ID: word" pair per line.
for token_id, token in zip(range(10), vocabulary):
    print(f"{token_id}: {token}")
print(30 * "-")


# --- Step 5: Finalize and Use the Tokenizer ---
# Our tokenizer is now ready to use!

# --- Demonstration: Encoding (Text to Numbers) ---
# Every sentence below is converted into a sequence of integer IDs.
#   * The first sentence contains 'good', which is not in our original corpus,
#     so that one word is mapped to the [UNK] token (ID 1).
#   * The other three are the corpus sentences themselves, so every word in
#     them has its own ID.
# Sentences shorter than the tokenizer's output length are padded with 0.
sentences_to_encode = [
    "the dog and cat are good friends",
    "the cat sat on the mat",
    "the dog ate my homework",
    "the cat and the dog are friends",
]

# NOTE: `sentence_to_encode` and `encoded_sentence` are deliberately top-level
# names. After the loop they hold the LAST sentence and its encoding, which is
# what the decoding demonstration further down operates on.
for sentence_to_encode in sentences_to_encode:
    print(f"Original sentence: '{sentence_to_encode}'")
    # Wrap the sentence in a list so the result is a batch containing one
    # sequence (a 2-D tensor).
    encoded_sentence = text_vectorizer([sentence_to_encode])
    print(f"Encoded sequence: {encoded_sentence.numpy()}")
    print("-" * 30)



# --- Demonstration: Decoding (Numbers to Text) ---
# Decoding is the reverse lookup, ID -> word. The vocabulary list is already
# ordered by ID, so enumerating it gives the mapping directly.
id_to_word_map = dict(enumerate(vocabulary))

# `encoded_sentence` is a batch holding a single sequence; pull that one out.
encoded_sequence_to_decode = encoded_sentence.numpy()[0]
print(f"Sequence to decode: {encoded_sequence_to_decode}")

# Translate each ID back into its word, skipping padding tokens (ID 0).
decoded_words = []
for token_id in encoded_sequence_to_decode:
    if token_id > 0:
        decoded_words.append(id_to_word_map[token_id])
decoded_sentence = ' '.join(decoded_words)
print(f"Decoded sentence: '{decoded_sentence}'")



--- Training the tokenizer... ---
Vocabulary built successfully!
------------------------------
Vocabulary Size: 14
Learned Vocabulary (Word -> ID):
0: 
1: [UNK]
2: the
3: dog
4: cat
5: sat
6: on
7: my
8: mat
9: homework
------------------------------
Original sentence: 'the dog and cat are good friends'
Encoded sequence: [[ 2  3 13  4 12  1 10  0]]
------------------------------
Original sentence: 'the cat sat on the mat'
Encoded sequence: [[2 4 5 6 2 8 0 0]]
------------------------------
Original sentence: 'the dog ate my homework'
Encoded sequence: [[ 2  3 11  7  9  0  0  0]]
------------------------------
Original sentence: 'the cat and the dog are friends'
Encoded sequence: [[ 2  4 13  2  3 12 10  0]]
------------------------------
Sequence to decode: [ 2  4 13  2  3 12 10  0]
Decoded sentence: 'the cat and the dog are friends'


In [3]:
'''
Obtaining word embeddings using the Keras `Embedding` layer is like using a **smart dictionary**.
Instead of looking up a word to get its definition, look up a word's unique ID number to get a dense vector of
numbers that represents its meaning.

The `Embedding` layer is essentially a **lookup table** that you create and train.
It stores one vector for every word in your vocabulary.

The Three-Step Process

1. Step 1: Prepare Your Data (Text to Integers)

First, convert raw text into sequences of integer IDs. Each unique word in your entire dataset is assigned
a unique integer.

For example, the sentence:
`"the cat sat on the mat"`

Becomes a sequence of integers:
`[2 4 5 6 2 8 0 0]`

2. Step 2: Define the `Embedding` Layer 📖

When you create the layer, you define two key parameters:

1.  input_dim: This is the size of your vocabulary (the total number of distinct token IDs, special tokens included). In our example, it would be 14 (12 words + 1 for the empty-string padding token at ID 0 + 1 for [UNK] at ID 1).
2.  output_dim: This is the size of the dense vector you want for each word. This is a hyperparameter you choose. A common size is 128, 256, or 512.


```python
# Let's say we want a 20-dimensional vector for each word.
embedding_layer = tf.keras.layers.Embedding(input_dim=14, output_dim=20)
```

Behind the scenes, Keras creates a simple but powerful weight matrix (our "lookup table") of shape `(input_dim, output_dim)`.
For our example, this would be a (14, 20) matrix. Initially, this matrix is filled with small random numbers, then gets adjusted during training.

3. Step 3: The Lookup Operation 🔍

When you pass your integer sequence [2 4 5 6 2 8 0 0] into the layer, it performs a direct lookup.

  * For the integer 2, it grabs row 2 of the matrix (the vector for "the").
  * For the integer 4, it grabs row 4 of the matrix (the vector for "cat").
  * And so on...

The output is a new sequence where each integer ID has been replaced by its corresponding dense vector from the lookup table.

The Magic: The most important part is that these vectors are LEARNED during training.
Through backpropagation, the model adjusts the values in these vectors. As a result, words that are used in similar contexts
(e.g., "cat" and "dog," or "king" and "queen") will end up having similar-looking vectors.
This is how the model captures the semantic meaning of words.


'''

'\nObtaining word embeddings using the Keras `Embedding` layer is like using a **smart dictionary**.\nInstead of looking up a word to get its definition, look up a word\'s unique ID number to get a dense vector of\nnumbers that represents its meaning.\n\nThe `Embedding` layer is essentially a **lookup table** that you create and train.\nIt stores one vector for every word in your vocabulary.\n\nThe Three-Step Process\n\n1. Step 1: Prepare Your Data (Text to Integers)\n\nFirst, convert raw text into sequences of integer IDs. Each unique word in your entire dataset is assigned\na unique integer.\n\nFor example, the sentence:\n`"the cat sat on the mat"`\n\nBecomes a sequence of integers:\n`[2 4 5 6 2 8 0 0]`\n\n2. Step 2: Define the `Embedding` Layer 📖\n\nWhen you create the layer, you define two key parameters:\n\n1.  input_dim: This is the size of your vocabulary (the total number of unique words). In our example, it would be 14 (12 words + 1 for a \'0\' padding token + 1 for [UNK]).\n2

In [8]:
import tensorflow as tf
import numpy as np

# --- Step 1: Reuse the Tokenizer Developed Previously ---
# Same corpus and same TextVectorization settings as in the previous example.
# Both are re-created here, so this cell does not rely on objects left over
# from the earlier cell.

vocab_size = 15

corpus = [
    "the cat sat on the mat",
    "the dog ate my homework",
    "the cat and the dog are friends"
]

text_vectorizer = tf.keras.layers.TextVectorization(
    output_sequence_length=8,
    max_tokens=vocab_size,
)
# Learn the vocabulary from the corpus.
text_vectorizer.adapt(corpus)


# --- Step 2: Build a Model to Learn Embeddings ---
# We'll create a simple Keras model. The key is that the first layer is our
# tokenizer, and the second is the Embedding layer.

# Fix the random seeds (Python, NumPy and TensorFlow) before any layer is
# created, so the randomly initialized weights -- and therefore the "initial
# vector" printed later in this cell -- are identical on every
# Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Define the dimensionality of the word embeddings we want to learn.
embedding_dim = 20

model = tf.keras.Sequential([
    # 1. The TextVectorization layer: This layer takes raw text strings as input
    #    and outputs integer sequences. It's the bridge from text to numbers.
    text_vectorizer,

    # 2. The Embedding layer: This layer takes the integer sequences and looks up
    #    the corresponding embedding vector for each token. The `input_dim` must
    #    match the vocabulary size from our tokenizer (special tokens included),
    #    otherwise some token IDs would have no row in the lookup table.
    tf.keras.layers.Embedding(
        input_dim=len(text_vectorizer.get_vocabulary()),
        output_dim=embedding_dim,
        name="embedding_layer" # Give the layer a name to easily access it later
    ),

    # 3. A Pooling layer: To get a single vector representation for the whole
    #    sentence, we average the embeddings of all words in the sequence.
    tf.keras.layers.GlobalAveragePooling1D(),

    # 4. A final Dense layer: To make this a trainable model, we add a simple
    #    output layer. For a real task, this would be your classification or
    #    regression output.
    tf.keras.layers.Dense(1, activation='sigmoid') # Example for binary classification
])

# Compile the model to prepare it for training
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Explicitly build the model so that its weights exist and can be inspected
# even though no data has been passed through it yet.
# The input_shape (None,) indicates a batch of strings.
model.build(input_shape=(None,))

# Print the model summary to see the architecture
print("--- Model Architecture ---")
model.summary()
print("-" * 30)


# --- Step 3: Inspect the Learned Embeddings ---
# The model has not been trained on anything yet, so what we look at here is
# the randomly initialized weight matrix of the embedding layer. Training
# would turn its rows into meaningful learned embeddings.

# Token list of the tokenizer; a word's position in this list is its ID.
vocabulary = text_vectorizer.get_vocabulary()

# Fetch the embedding layer through the name it was given in the model.
embedding_layer = model.get_layer('embedding_layer')

print(embedding_layer)

# get_weights() returns a list; its first entry is the embedding matrix.
embedding_weights = embedding_layer.get_weights()[0]

print(
    "\n--- Initial Embedding Weights (before training) ---",
    f"Shape of embedding matrix: {embedding_weights.shape}",
    "(Rows = Vocabulary Size, Columns = Embedding Dimension)",
    "\nEmbedding vector for a specific word:",
    sep="\n",
)

# Look up the ID of the word 'dog', then read that row of the weight matrix:
# row `word_id` is the embedding vector of the word with that ID.
word_to_find = 'dog'
word_id = vocabulary.index(word_to_find)
word_vector = embedding_weights[word_id]

print(
    f"Word: '{word_to_find}' (ID: {word_id})",
    f"Initial Vector: {word_vector}",
    "\nAfter training on a real task, this vector would capture the 'meaning' of the word 'dog'.",
    sep="\n",
)



--- Model Architecture ---


------------------------------
<Embedding name=embedding_layer, built=True>

--- Initial Embedding Weights (before training) ---
Shape of embedding matrix: (14, 20)
(Rows = Vocabulary Size, Columns = Embedding Dimension)

Embedding vector for a specific word:
Word: 'dog' (ID: 3)
Initial Vector: [-0.0102998  -0.04280004 -0.02198629 -0.03258574 -0.0298155   0.01752689
  0.02663679 -0.00136243 -0.04051492 -0.01391686 -0.01206525  0.01693368
  0.02741503  0.00681318  0.00703589 -0.03886237 -0.03923808 -0.04380064
 -0.04333847  0.03581171]

After training on a real task, this vector would capture the 'meaning' of the word 'dog'.
