# One-Hot Encoding (OHE) in NLP

One-hot encoding is a simple way to represent words as numbers so that computers can work with them. In NLP (Natural Language Processing), each unique word in your text (the vocabulary) is given its own position in a list. To represent a word, you create a list (or vector) as long as the vocabulary in which every position is 0 except the one that matches your word, which is set to 1.

For example, if you have three words: "cat", "dog", and "fish", you can represent "dog" as [0, 1, 0]. This helps computers work with words using math, but it doesn't capture any meaning or relationship between words—just their identity.


In [None]:

# Use sklearn's OneHotEncoder to encode words in each sentence
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Example dataset: list of sentences
sentences = [
    "I love NLP",
    "NLP is fun",
    "I love coding",
    "Coding is creative",
]

# Vocabulary: every distinct whitespace-separated token, in sorted order.
# Tokens are compared case-sensitively, so "Coding" and "coding" end up as
# two separate vocabulary entries.
all_words = sorted({word for sentence in sentences for word in sentence.split()})
print("Vocabulary:", all_words)

# OneHotEncoder expects 2-D input (n_samples, n_features), so each word
# becomes one sample with a single feature.
words_array = np.array(all_words).reshape(-1, 1)
# The encoder is created with its default (sparse) output rather than
# `sparse=False`: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing it fails on current
# releases. The result is densified with `.toarray()` below instead, which
# works regardless of the installed version.
ohe = OneHotEncoder()
ohe.fit(words_array)

# Create a mapping from word to index for display
word2idx = {word: idx for idx, word in enumerate(ohe.categories_[0])}
print("Word to Index mapping:", word2idx)

# Encode each sentence as one multi-hot vector: 1 if the word is present in the sentence
for sentence in sentences:
    words = sentence.split()
    # One one-hot row per word, as a dense (n_words, vocab_size) array
    word_vectors = ohe.transform(np.array(words).reshape(-1, 1)).toarray()
    # Collapse the rows: a position is 1 if any word of the sentence set it
    sentence_vector = word_vectors.any(axis=0).astype(int)
    print(f"Sentence: '{sentence}'")
    print("OHE Vector:", sentence_vector)


In [1]:
# Toy corpus: four short sentences that the following cells tokenize and encode
sentences = [
    "I love NLP",
    "NLP is fun",
    "I love coding",
    "Coding is creative",
]


In [2]:
# Use sklearn's OneHotEncoder to encode words in each sentence
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Build the vocabulary: every distinct whitespace-separated token, sorted.
# Matching is case-sensitive, so "Coding" and "coding" are separate entries.
unique_words = {word for sentence in sentences for word in sentence.split()}
all_words = sorted(unique_words)
print("Vocabulary:", all_words)

Vocabulary: ['Coding', 'I', 'NLP', 'coding', 'creative', 'fun', 'is', 'love']


In [4]:
# OneHotEncoder takes 2-D input, so lay the vocabulary out as a single
# column: one word per row, one feature.
words_array = np.array(all_words)[:, np.newaxis]
# Encoder with default settings, fitted on the vocabulary column
ohe = OneHotEncoder()
ohe.fit(words_array)

In [5]:
# Pair each learned category (word) with its column position in the
# encoder's output, purely for display.
vocabulary = ohe.categories_[0]
word2idx = dict(zip(vocabulary, range(len(vocabulary))))
print("Word to Index mapping:", word2idx)

Word to Index mapping: {'Coding': 0, 'I': 1, 'NLP': 2, 'coding': 3, 'creative': 4, 'fun': 5, 'is': 6, 'love': 7}


In [6]:
# Turn every sentence into one multi-hot vector over the vocabulary
for sentence in sentences:
    words = sentence.split()
    # One one-hot row per word; the words are passed as a single column
    word_vectors = ohe.transform(np.array(words)[:, np.newaxis])
    # Column-wise sum counts occurrences; "> 0" reduces that to presence.
    # The summed result is 2-D (1 x vocabulary size), which is why the
    # vector prints with double brackets.
    occurrences = word_vectors.sum(axis=0)
    sentence_vector = (occurrences > 0).astype(int)
    print(f"Sentence: '{sentence}'")
    print("OHE Vector:", sentence_vector)

Sentence: 'I love NLP'
OHE Vector: [[0 1 1 0 0 0 0 1]]
Sentence: 'NLP is fun'
OHE Vector: [[0 0 1 0 0 1 1 0]]
Sentence: 'I love coding'
OHE Vector: [[0 1 0 1 0 0 0 1]]
Sentence: 'Coding is creative'
OHE Vector: [[1 0 0 0 1 0 1 0]]
