<h1 style="color:DodgerBlue; text-align:center; font-weight:bold; font-size:50px; background-color:lightblue; padding:20px 20px">One-Hot Encoding Representation of Text</h1>

In [1]:
# import requirements

import numpy as np
import pandas as pd

import string
from nltk.tokenize import word_tokenize

In [2]:
# load data or list of string
sentences = [
    "The cat sat on the mat.",
    "The dog barked at the cat.",
    "The cat and the dog are friends.",
    "Birds can fly high in the sky."
]

sentences

['The cat sat on the mat.',
 'The dog barked at the cat.',
 'The cat and the dog are friends.',
 'Birds can fly high in the sky.']

```python

# Tokenize, normalize to lower case, and remove punctuation
import string
tokens = [
    [word for word in word_tokenize(sentence.lower()) if word not in string.punctuation]
    for sentence in sentences
]
tokens
```

In [3]:
# convert the sentence list into tokens
sentence_tokens = [
    [word for word in word_tokenize(sentence.lower()) if word not in string.punctuation]
    for sentence in sentences
]

sentence_tokens

[['the', 'cat', 'sat', 'on', 'the', 'mat'],
 ['the', 'dog', 'barked', 'at', 'the', 'cat'],
 ['the', 'cat', 'and', 'the', 'dog', 'are', 'friends'],
 ['birds', 'can', 'fly', 'high', 'in', 'the', 'sky']]

In [4]:
# get all unique word list

unique_words = sorted(set([word for sent in sentence_tokens for word in sent]))
print(unique_words)

['and', 'are', 'at', 'barked', 'birds', 'can', 'cat', 'dog', 'fly', 'friends', 'high', 'in', 'mat', 'on', 'sat', 'sky', 'the']


In [5]:
# unique word to an index
word_to_index = {}
for i, word in enumerate(unique_words):
    word_to_index[word] = i

word_to_index

{'and': 0,
 'are': 1,
 'at': 2,
 'barked': 3,
 'birds': 4,
 'can': 5,
 'cat': 6,
 'dog': 7,
 'fly': 8,
 'friends': 9,
 'high': 10,
 'in': 11,
 'mat': 12,
 'on': 13,
 'sat': 14,
 'sky': 15,
 'the': 16}

In [6]:
# Create the one-hot encoded representation
one_hot_encoded = []
for sent in sentence_tokens:
    one_hot_vector = np.zeros(len(unique_words) ,dtype='int')
    for word in sent:
        if word in word_to_index:
            one_hot_vector[word_to_index[word]] = 1
    one_hot_encoded.append(one_hot_vector)

one_hot_encoded

[array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1]),
 array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 array([1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]),
 array([0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1])]

In [7]:
# print a dataframe of the one-hot encode

pd.DataFrame(one_hot_encoded, columns=unique_words ,index=sentences)

Unnamed: 0,and,are,at,barked,birds,can,cat,dog,fly,friends,high,in,mat,on,sat,sky,the
The cat sat on the mat.,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,1
The dog barked at the cat.,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1
The cat and the dog are friends.,1,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1
Birds can fly high in the sky.,0,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1,1


        _____________________________________________ End _____________________________________________