In [16]:
# Based on code at: https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/

In [17]:
# Integer (index) encoding of characters
def do_integer_encoding(data, alphabet='abcdefghijklmnopqrstuvwxyz .'):
    """Map each character of ``data`` to its position in ``alphabet``.

    Args:
        data: iterable of single characters (typically a ``str``).
        alphabet: ordered universe of allowed characters; a character's
            index in this sequence is its integer code. Defaults to
            lowercase a-z followed by space and period.

    Returns:
        list of int, one entry per character of ``data`` (``[]`` for
        empty input).

    Raises:
        KeyError: if ``data`` contains a character that is not in
            ``alphabet`` (e.g. an uppercase letter with the default
            alphabet) -- lowercase the text before calling if needed.
    """
    # define a mapping of chars to integers
    char_to_int = {c: i for i, c in enumerate(alphabet)}
    # integer encoded input data
    return [char_to_int[char] for char in data]

In [18]:
# Encode a short, all-lowercase sample and show it next to its integer codes
small_data = "hello world"
enc = do_integer_encoding(small_data)
print("data = ", small_data, ", enc = ", str(enc), sep="")

data = hello world, enc = [7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]


In [19]:
# Mixed-case input (lowercase and uppercase): the text is lowercased before
# encoding, while the print shows the original, mixed-case text.
medium_data = (
    "This is an important document. It contains the contract governing "
    "your deposit relationship with the Bank and required legal "
    "disclosures. Please have it translated. "
)
enc = do_integer_encoding(medium_data.lower())
print("data = ", medium_data, ", enc = ", str(enc), sep="")

data = This is an important document. It contains the contract governing your deposit relationship with the Bank and required legal disclosures. Please have it translated. , enc = [19, 7, 8, 18, 26, 8, 18, 26, 0, 13, 26, 8, 12, 15, 14, 17, 19, 0, 13, 19, 26, 3, 14, 2, 20, 12, 4, 13, 19, 27, 26, 8, 19, 26, 2, 14, 13, 19, 0, 8, 13, 18, 26, 19, 7, 4, 26, 2, 14, 13, 19, 17, 0, 2, 19, 26, 6, 14, 21, 4, 17, 13, 8, 13, 6, 26, 24, 14, 20, 17, 26, 3, 4, 15, 14, 18, 8, 19, 26, 17, 4, 11, 0, 19, 8, 14, 13, 18, 7, 8, 15, 26, 22, 8, 19, 7, 26, 19, 7, 4, 26, 1, 0, 13, 10, 26, 0, 13, 3, 26, 17, 4, 16, 20, 8, 17, 4, 3, 26, 11, 4, 6, 0, 11, 26, 3, 8, 18, 2, 11, 14, 18, 20, 17, 4, 18, 27, 26, 15, 11, 4, 0, 18, 4, 26, 7, 0, 21, 4, 26, 8, 19, 26, 19, 17, 0, 13, 18, 11, 0, 19, 4, 3, 27, 26]


In [22]:
# Tokenize larger data for ease
# `array` is imported in this cell because it runs before the scikit/numpy
# import cell; without it a fresh-kernel "Run All" fails with NameError.
import spacy
from numpy import array

nlp = spacy.load('en_core_web_sm')
doc = nlp(medium_data)
# one string per spaCy token, collected into a NumPy array
values = array([token.text for token in doc])
print(values)

['This' 'is' 'an' 'important' 'document' '.' 'It' 'contains' 'the'
 'contract' 'governing' 'your' 'deposit' 'relationship' 'with' 'the'
 'Bank' 'and' 'required' 'legal' 'disclosures' '.' 'Please' 'have' 'it'
 'translated' '.']


In [23]:
# Scikit encoding
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [24]:
# integer encode: learn the set of distinct tokens, then replace every token
# by the integer label assigned to it
label_encoder = LabelEncoder()
label_encoder.fit(values)
integer_encoded = label_encoder.transform(values)
print(integer_encoded)

[ 4 15  5 14 11  0  2  7 20  8 12 23  9 18 22 20  1  6 19 17 10  0  3 13
 16 21  0]


In [25]:
# integer decode: map the integer labels back to the original token strings
inverted = label_encoder.inverse_transform(y=integer_encoded)
print(inverted)

['This' 'is' 'an' 'important' 'document' '.' 'It' 'contains' 'the'
 'contract' 'governing' 'your' 'deposit' 'relationship' 'with' 'the'
 'Bank' 'and' 'required' 'legal' 'disclosures' '.' 'Please' 'have' 'it'
 'translated' '.']


In [26]:
# binary encode .. OneHot
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed, so try the new keyword
# first and fall back to the old one on older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
# reshape the labels into a single column: one row per token
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.

In [29]:
# invert OneHot: recover the integer label behind each one-hot row
# (this rebinds `inverted`, which previously held the decoded token strings)
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
inverted = onehot_encoder.inverse_transform(X=onehot_encoded)
print(inverted)

[[ 4]
 [15]
 [ 5]
 [14]
 [11]
 [ 0]
 [ 2]
 [ 7]
 [20]
 [ 8]
 [12]
 [23]
 [ 9]
 [18]
 [22]
 [20]
 [ 1]
 [ 6]
 [19]
 [17]
 [10]
 [ 0]
 [ 3]
 [13]
 [16]
 [21]
 [ 0]]
