In [151]:
# Preprocess a text file

import string
import re

# Read the whole corpus. The context manager guarantees the file handle is
# closed even if read() raises (the original left the handle open).
with open("shakespeare.txt", encoding="utf8") as f:
    text = f.read()

# Normalise the text:
#   1. lower-case everything,
#   2. split on runs of non-word characters (drops whitespace and most punctuation),
#   3. strip remaining ASCII punctuation such as "_" (a word character, so \W keeps it),
#   4. re-join the tokens with single spaces.
text = text.lower()
words = re.split(r'\W+', text)
tbl = str.maketrans('', '', string.punctuation)
clean = [w.translate(tbl) for w in words]
text = " ".join(clean)

print(text[500:1000]) # inspect

ander helena in love with demetrius lysander in love with hermia demetrius in love with hermia philostrate master of the revels to theseus quince the carpenter snug the joiner bottom the weaver flute the bellows mender snout the tinker starveling the tailor oberon king of the fairies titania queen of the fairies puck or robin goodfellow a fairy peaseblossom fairy cobweb fairy moth fairy mustardseed fairy pyramus thisbe wall moonshine lion characters in the interlude performed by the clowns other


In [152]:
# Build the character vocabulary and its two lookup tables

chars = tuple(sorted(set(text)))  # unique characters of the text, in sorted order
char_int = {ch: idx for idx, ch in enumerate(chars)}  # character -> integer id
int_char = {idx: ch for idx, ch in enumerate(chars)}  # integer id -> character (inverse map)
print(chars) # inspect

(' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'è')


In [153]:
# 1. One-hot encoding with no dependencies

n = len(chars)  # width of every one-hot row (vocabulary size)

# Translate each character of the text into its integer id.
int_text = [char_int[char] for char in text]

# One row per character: zeros everywhere except a single 1 at the id's position.
one_hot_text = [[0] * idx + [1] + [0] * (n - idx - 1) for idx in int_text]

print(one_hot_text[:5]) # inspect
print(text[:5])

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
 act 


In [154]:
# 2. One-hot encoding with Sklearn and numpy

from sklearn.preprocessing import OneHotEncoder as encoder
from numpy import asarray

# The encoder expects a 2-D (n_samples, n_features) input, so every character
# is wrapped in its own single-element list.
listchars = [[ch] for ch in text]

# `sparse=False` was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed
# to `sparse_output`). Taking the default sparse result and densifying it
# explicitly yields the same dense float array on both old and new releases.
one_hot_text = encoder().fit_transform(listchars).toarray()

print(one_hot_text[:5]) # inspect

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]]


In [155]:
# 3. One-hot encoding with Pytorch one_hot

import torch.nn.functional

# NOTE(review): `x` is not used by any cell visible here; kept in case a later cell relies on it.
x=torch.tensor(list(char_int.values()))

# Pass num_classes explicitly so the row width always equals the vocabulary
# size instead of being inferred from the largest id present in int_text.
one_hot_text = torch.nn.functional.one_hot(torch.tensor(int_text), num_classes=len(chars))

# Slice first, then convert: only the five previewed rows are turned into a
# NumPy array, and no separately imported `np` name is required.
print(one_hot_text[:5].cpu().numpy()) # inspect


[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [203]:
# 4. One-hot encoding with PyTorch scatter

batch_size = len(int_text)  # one row per character of the text

# Column vector of class ids, shape (batch_size, 1), used as the scatter index.
int_text_tensor = torch.tensor(int_text).view(batch_size,1)

one_hot_text = torch.zeros(batch_size,len(chars))   # preallocate an all-zero float matrix
one_hot_text.scatter_(1,int_text_tensor,1)          # in place: write 1 at [row, id] along dim 1

# Slice first, then convert: only the five previewed rows are turned into a
# NumPy array, and no separately imported `np` name is required.
print(one_hot_text[:5].cpu().numpy()) # inspect

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]]
