### Sklearn OneHotEncoder

In [1]:
from sklearn.preprocessing import OneHotEncoder
import itertools

In [2]:
# four example documents, each consisting of a single word
docs = [
    "cat",
    "dog",
    "bat",
    "ate",
]

In [5]:
# tokenize: split every document on single spaces into a list of tokens
tokens_docs = [document.split(" ") for document in docs]
print(tokens_docs)

[['cat'], ['dog'], ['bat'], ['ate']]


In [6]:
# flatten the list of token lists into one stream of tokens, then
# create a dictionary that maps each distinct word to an integer id.
# The vocabulary is sorted before numbering so the ids are identical on
# every run: iterating a set of strings directly yields an order that can
# change between interpreter sessions (string hashes are randomized).
all_tokens = itertools.chain.from_iterable(tokens_docs)
word_to_id = {token: idx for idx, token in enumerate(sorted(set(all_tokens)))}

In [7]:
# display the word -> id mapping
word_to_id

{'ate': 0, 'bat': 1, 'cat': 2, 'dog': 3}

In [11]:
# replace every token by its integer id, keeping the per-document nesting
token_ids = [
    [word_to_id[tok] for tok in doc_tokens]
    for doc_tokens in tokens_docs
]
print(token_ids)

[[2], [3], [1], [0]]


In [14]:
# convert the list of token-id lists to a one-hot representation
vec = OneHotEncoder()
vec.fit(token_ids)
x = vec.transform(token_ids)

# sparse view: one "(row, column)  value" entry per document
print(x)
# dense view of the same matrix
print(x.toarray())

  (0, 2)	1.0
  (1, 3)	1.0
  (2, 1)	1.0
  (3, 0)	1.0
[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]


### Sklearn CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# a single example document; the word "nlp" occurs twice in it
text = ["i love nlp. nlp is so cool"]

# CountVectorizer created with its default settings
vectorizer = CountVectorizer()

In [17]:
# Tokenize and build the vocabulary (a mapping from token to column index)
# NOTE: "i" is absent from the printed vocabulary; CountVectorizer's default
# token pattern only keeps tokens of two or more characters.

vectorizer.fit(text)
print(vectorizer.vocabulary_)

{'love': 2, 'nlp': 3, 'is': 1, 'so': 4, 'cool': 0}


In [18]:
# encode the document with the fitted vectorizer

vector = vectorizer.transform(text)

In [20]:
# Summarize the encoded vector: its shape, then the dense counts.
# Columns follow the indices in vocabulary_, so "nlp" (index 3) has count 2.

print(vector.shape)
print(vector.toarray())

(1, 5)
[[1 1 1 2 1]]


### TfidfVectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# small corpus of four short documents; "nlp" occurs in every one of them
text1 = [
    'i love nlp',
    'nlp is so cool',
    'nlp is all about helping machines process language',
    'this tutorial is on baisc nlp technique',
]

In [24]:
# fit a TF-IDF vectorizer on the corpus, then transform the same documents
# with the object returned by fit()
tf = TfidfVectorizer()
txt_fitted = tf.fit(text1)
txt_transformed = txt_fitted.transform(text1)
print("The text: ", text1)

The text:  ['i love nlp', 'nlp is so cool', 'nlp is all about helping machines process language', 'this tutorial is on baisc nlp technique']


In [25]:
# IDF weight learned for each vocabulary term
idf = tf.idf_
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the terms in the same column order as idf_.
print(dict(zip(txt_fitted.get_feature_names_out(), idf)))

{'about': 1.916290731874155, 'all': 1.916290731874155, 'baisc': 1.916290731874155, 'cool': 1.916290731874155, 'helping': 1.916290731874155, 'is': 1.2231435513142097, 'language': 1.916290731874155, 'love': 1.916290731874155, 'machines': 1.916290731874155, 'nlp': 1.0, 'on': 1.916290731874155, 'process': 1.916290731874155, 'so': 1.916290731874155, 'technique': 1.916290731874155, 'this': 1.916290731874155, 'tutorial': 1.916290731874155}


