In [1]:
# One Hot Encoding

## With NLTK

In [26]:
import nltk
import string

def tokenize(text):
    """
    Lowercase ``text``, split it with NLTK's word tokenizer, and lazily yield
    the Snowball (English) stem of every token that is not punctuation.

    A token counts as punctuation when every one of its characters is in
    ``string.punctuation``, so multi-character tokens such as ``...`` or
    ``--`` are dropped along with single punctuation marks.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # Test character by character: `token in string.punctuation` is a
        # substring check and lets multi-character punctuation through.
        if all(char in string.punctuation for char in token): continue
        yield stem.stem(token)

def vectorize(doc):
    """One-hot encode ``doc``: map each distinct token from ``tokenize`` to ``True``."""
    return dict.fromkeys(tokenize(doc), True)

corpus = [
    'The elephant sneezed at the sight of potatoes.',
    'Bats can see via echolocation. See the bat sight sneeze!',
    'Wondering, she opened the door to the studio.'
]

# Build a real list rather than a lazy `map` object: an iterator is used up
# by the first pass over it, so `vectors` would be empty after being displayed.
vectors = [vectorize(doc) for doc in corpus]
vectors

[{'the': True,
  'eleph': True,
  'sneez': True,
  'at': True,
  'sight': True,
  'of': True,
  'potato': True},
 {'bat': True,
  'can': True,
  'see': True,
  'via': True,
  'echoloc': True,
  'the': True,
  'sight': True,
  'sneez': True},
 {'wonder': True,
  'she': True,
  'open': True,
  'the': True,
  'door': True,
  'to': True,
  'studio': True}]

## With scikit-learn

In [27]:
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import CountVectorizer

# Count how often each vocabulary term occurs in each document.
# The result gets its own name so `corpus` keeps the raw strings and this
# cell can be re-run without failing on an already-transformed array.
freq = CountVectorizer()
counts = freq.fit_transform(corpus)

# Binarizer maps every value above its threshold (0 by default) to 1,
# turning the counts into presence/absence flags.
onehot = Binarizer()
onehot_matrix = onehot.fit_transform(counts.toarray())
onehot_matrix

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]])

## With Gensim

In [None]:
import sys

# Install gensim with the pip of the interpreter running this kernel
# (`sys.executable`), so the package lands in the kernel's own environment.
!{sys.executable} -m pip install gensim

In [40]:
import gensim 

# Tokenize and stem each raw sentence into a list of tokens.
corpus = [
    list(tokenize(sentence))
    for sentence in (
        'The elephant sneezed at the sight of potatoes.',
        'Bats can see via echolocation. See the bat sight sneeze!',
        'Wondering, she opened the door to the studio.',
    )
]

# Build the token <-> integer id mapping from the tokenized documents.
id2word = gensim.corpora.Dictionary(corpus)

# doc2bow gives (token_id, count) pairs; keep the id and replace the count with 1.
vectors = [
    [(token_id, 1) for token_id, _count in id2word.doc2bow(tokens)]
    for tokens in corpus
]
vectors

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)],
 [(6, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]