
# Hands-On Exercise: Word Embedding using One-Hot Encoding

In [1]:


# Step 1: Define the corpus
corpus = [
    'I love natural language processing',
    'Natural language processing is amazing',
    'I love learning new techniques',
    'Techniques in natural language processing are evolving',
    'NIC Experts are learning Natural Language Processing'
]



In [2]:
# Step 2: Create a Vocabulary
# Convert to lowercase and split each sentence into words
vocab = set([word.lower() for sentence in corpus for word in sentence.split()])

# Assign an index to each word
word_to_index = {word: index for index, word in enumerate(vocab)}
print("Vocabulary:", word_to_index)



Vocabulary: {'techniques': 0, 'evolving': 1, 'i': 2, 'natural': 3, 'amazing': 4, 'processing': 5, 'love': 6, 'learning': 7, 'is': 8, 'new': 9, 'in': 10, 'are': 11, 'experts': 12, 'language': 13, 'nic': 14}


In [3]:
# Step 3: Implement One-Hot Encoding
import numpy as np

# Function to create a one-hot vector for a given word
def one_hot_encode(word, word_to_index, vocab_size):
    one_hot_vector = np.zeros(vocab_size)
    one_hot_vector[word_to_index[word]] = 1
    return one_hot_vector



In [4]:
# One-hot encode the first sentence
sentence = 'I love natural language processing'.lower().split()
vocab_size = len(vocab)

for word in sentence:
    one_hot_vector = one_hot_encode(word, word_to_index, vocab_size)
    print(f"Word: {word}, One-Hot: {one_hot_vector}")




Word: i, One-Hot: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Word: love, One-Hot: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
Word: natural, One-Hot: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Word: language, One-Hot: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
Word: processing, One-Hot: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [5]:
# Step 4: Encode Entire Corpus
def encode_sentence(sentence, word_to_index, vocab_size):
    encoded_sentence = []
    for word in sentence.lower().split():
        encoded_sentence.append(one_hot_encode(word, word_to_index, vocab_size))
    return np.array(encoded_sentence)



In [6]:
print (vocab)

{'techniques', 'evolving', 'i', 'natural', 'amazing', 'processing', 'love', 'learning', 'is', 'new', 'in', 'are', 'experts', 'language', 'nic'}


In [7]:
# One-hot encode each sentence in the corpus
for sentence in corpus:
    encoded_matrix = encode_sentence(sentence, word_to_index, vocab_size)
    print(f"Sentence: {sentence}")
    print("One-Hot Encoded Matrix:")
    print(encoded_matrix)



Sentence: I love natural language processing
One-Hot Encoded Matrix:
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Sentence: Natural language processing is amazing
One-Hot Encoded Matrix:
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Sentence: I love learning new techniques
One-Hot Encoded Matrix:
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Sentence: Techniques in natural language processing are evolving
One-H

## Limitations of One-Hot Encoding:
### 1. Dimensionality: Each word in a large corpus will have a very sparse vector, leading to inefficient storage and computation.
### 2. Semantic Information: One-hot encoding does not capture any information about the relationship between words. Words with similar meanings have orthogonal vectors, which does not reflect their semantic similarity.
