<a href="https://colab.research.google.com/github/bhargav23/AIML-DL-Lab/blob/main/8_One_Hot_Encoding_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**8. Implement one-hot encoding of words or characters.**

In [1]:
import numpy as np

def one_hot_encode_words(text):
    """
    Performs one-hot encoding on the whitespace-separated words of a text.

    The text is lowercased and split on whitespace. Punctuation is not
    stripped, so "mat" and "mat." count as different words. The vocabulary is
    the sorted list of unique words, which makes the word-to-index mapping
    deterministic.

    Side effect: prints the vocabulary size and the word-to-index mapping.

    Args:
        text (str): The input string to be encoded.

    Returns:
        tuple: A tuple containing:
            - dict: A dictionary mapping each unique word to its one-hot
              encoded vector, keyed in order of first appearance in the text.
            - dict: The vocabulary mapping each word to its index.
            - np.ndarray: A 2-D integer array with one one-hot row per word
              of the text, in order; its shape is
              (number of words, vocabulary size). Text without any words
              yields an empty array of shape (0, 0).
    """
    # 1. Tokenize the text into words (and convert to lowercase)
    words = text.lower().split()

    # 2. Create a vocabulary of unique words
    # The set removes duplicates; sorting gives a stable, reproducible order
    unique_words = sorted(set(words))

    # 3. Create a mapping from word to integer index
    word_to_int = {word: i for i, word in enumerate(unique_words)}

    # 4. Get the size of the vocabulary
    vocab_size = len(unique_words)
    print(f"Vocabulary Size (Words): {vocab_size}")
    print(f"Word to Index Mapping: {word_to_int}\n")

    # 5. Generate one-hot vectors
    # Row i of the identity matrix is exactly the one-hot vector for index i
    identity = np.eye(vocab_size, dtype=int)

    # dict.fromkeys keeps first-appearance order; .copy() gives every entry
    # its own array so callers can modify one vector without touching others
    one_hot_encoded_vectors = {
        word: identity[word_to_int[word]].copy() for word in dict.fromkeys(words)
    }

    # Selecting rows by index keeps the result 2-D even when there are no
    # words (an explicit integer dtype is needed for the empty index array)
    word_indices = np.array([word_to_int[word] for word in words], dtype=np.intp)
    encoded_sequence = identity[word_indices]

    return one_hot_encoded_vectors, word_to_int, encoded_sequence

In [2]:
# --- 1. One-Hot Encoding for Words ---
# Demo: encode a short sentence at the word level. The banner is printed first
# because one_hot_encode_words() itself prints the vocabulary size and mapping.
print("--- Word One-Hot Encoding Example ---")
# "The" and "the" both occur; the encoder lowercases its input, so they share
# a single vocabulary entry.
sentence = "The cat sat on the mat"
# Unpack the three results: word -> vector dict, word -> index vocabulary,
# and the array holding one one-hot row per word of the sentence.
word_vectors, word_vocab, word_array = one_hot_encode_words(sentence)


--- Word One-Hot Encoding Example ---
Vocabulary Size (Words): 5
Word to Index Mapping: {'cat': 0, 'mat': 1, 'on': 2, 'sat': 3, 'the': 4}



In [3]:
# Show the input sentence, then each distinct word next to its one-hot vector
print("Original Sentence:", sentence)
for word in word_vectors:
    vector = word_vectors[word]
    print(f"'{word}': {vector}")

Original Sentence: The cat sat on the mat
'the': [0 0 0 0 1]
'cat': [1 0 0 0 0]
'sat': [0 0 0 1 0]
'on': [0 0 1 0 0]
'mat': [0 1 0 0 0]


In [4]:
# Print the heading, the full per-word matrix (one row per word of the
# sentence) and a closing separator, each on its own line
print(
    "\nEncoded sequence of vectors for the sentence:",
    word_array,
    "-" * 40 + "\n",
    sep="\n",
)


Encoded sequence of vectors for the sentence:
[[0 0 0 0 1]
 [1 0 0 0 0]
 [0 0 0 1 0]
 [0 0 1 0 0]
 [0 0 0 0 1]
 [0 1 0 0 0]]
----------------------------------------



In [5]:
def one_hot_encode_characters(text):
    """
    Performs one-hot encoding on characters in a given text.

    Every character of the text is encoded as-is: there is no case folding,
    and whitespace or punctuation characters get vocabulary entries of their
    own. The vocabulary is the sorted list of unique characters, which makes
    the character-to-index mapping deterministic.

    Side effect: prints the vocabulary size and the character-to-index
    mapping.

    Args:
        text (str): The input string to be encoded.

    Returns:
        tuple: A tuple containing:
            - dict: A dictionary mapping each unique character to its one-hot
              encoded vector, keyed in order of first appearance in the text.
            - dict: The vocabulary mapping each character to its index.
            - np.ndarray: A 2-D integer array with one one-hot row per
              character of the text, in order; its shape is
              (number of characters, vocabulary size). An empty string
              yields an empty array of shape (0, 0).
    """
    # 1. Get the unique characters from the text
    # The set removes duplicates; sorting gives a stable, reproducible order
    unique_chars = sorted(set(text))

    # 2. Create a mapping from character to integer index
    char_to_int = {char: i for i, char in enumerate(unique_chars)}

    # 3. Get the size of the vocabulary
    vocab_size = len(unique_chars)
    print(f"Vocabulary Size (Characters): {vocab_size}")
    print(f"Character to Index Mapping: {char_to_int}\n")

    # 4. Generate one-hot vectors
    # Row i of the identity matrix is exactly the one-hot vector for index i
    identity = np.eye(vocab_size, dtype=int)

    # dict.fromkeys keeps first-appearance order; .copy() gives every entry
    # its own array so callers can modify one vector without touching others
    one_hot_encoded_vectors = {
        char: identity[char_to_int[char]].copy() for char in dict.fromkeys(text)
    }

    # Selecting rows by index keeps the result 2-D even for an empty string
    # (an explicit integer dtype is needed for the empty index array)
    char_indices = np.array([char_to_int[char] for char in text], dtype=np.intp)
    encoded_sequence = identity[char_indices]

    return one_hot_encoded_vectors, char_to_int, encoded_sequence

In [6]:
# --- 2. One-Hot Encoding for Characters ---
# Demo: encode a single word at the character level. The banner is printed
# first because one_hot_encode_characters() itself prints the vocabulary size
# and mapping.
print("--- Character One-Hot Encoding Example ---")
# "hello" repeats the letter 'l', so its 5 characters map onto 4 vocabulary entries.
simple_word = "hello"
# Unpack the three results: character -> vector dict, character -> index
# vocabulary, and the array holding one one-hot row per character.
char_vectors, char_vocab, char_array = one_hot_encode_characters(simple_word)

--- Character One-Hot Encoding Example ---
Vocabulary Size (Characters): 4
Character to Index Mapping: {'e': 0, 'h': 1, 'l': 2, 'o': 3}



In [7]:
# Show the input word first
print("Original Word:", simple_word)
# Then list every distinct character once, in sorted order, with its vector
unique_chars_in_word = sorted(set(simple_word))
for char in unique_chars_in_word:
    print(f"'{char}': {char_vectors[char]}")

Original Word: hello
'e': [1 0 0 0]
'h': [0 1 0 0]
'l': [0 0 1 0]
'o': [0 0 0 1]


In [8]:
# Print the heading, the full per-character matrix (one row per character of
# the word) and a closing separator, each on its own line
print(
    "\nEncoded sequence of vectors for the word:",
    char_array,
    "-" * 40,
    sep="\n",
)


Encoded sequence of vectors for the word:
[[0 1 0 0]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 0 1]]
----------------------------------------
