In [1]:
# Toy corpus: two sentences that differ in a single word ('best' vs 'worst')
# and in the capitalisation of the leading 'It'.
doc = [
    'It was the best of times',
    'it was the worst of times',
]

# CountVectorizer

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [3]:
# Learn the vocabulary and build the sparse document-term count matrix in a
# single pass; fit_transform is equivalent to calling fit and then transform.
bow_vec = CountVectorizer(lowercase=True)
word_counts = bow_vec.fit_transform(doc)

In [4]:
# Show the fitted token -> column-index mapping; the indices follow alphabetical token order.
bow_vec.vocabulary_

{'it': 1, 'was': 5, 'the': 3, 'best': 0, 'of': 2, 'times': 4, 'worst': 6}

In [5]:
# Densify the count matrix for display: one row per sentence, one column per vocabulary index.
word_counts.toarray()

array([[1, 1, 1, 1, 1, 1, 0],
       [0, 1, 1, 1, 1, 1, 1]])

In [6]:
# Label every column of the count matrix with its token.
# vocabulary_ maps token -> column index, so the tokens are ordered by that
# index rather than alphabetically: the labels then stay aligned with the
# matrix columns even if the vectorizer is given a custom vocabulary whose
# order is not alphabetical. For the vocabulary fitted here both orders agree.
df = pd.DataFrame(
    word_counts.toarray(),
    columns=sorted(bow_vec.vocabulary_, key=bow_vec.vocabulary_.get),
)

In [7]:
# Render the labelled bag-of-words count table built in the previous cell.
df

Unnamed: 0,best,it,of,the,times,was,worst
0,1,1,1,1,1,1,0
1,0,1,1,1,1,1,1


# HashingVectorizer

In [8]:
from sklearn.feature_extraction.text import HashingVectorizer

In [11]:
# HashingVectorizer keeps no vocabulary: fit() learns nothing from the data and
# just returns the vectorizer, and transform() hashes each token straight into
# one of the 15 feature buckets.
# NOTE(review): despite its name, `word_counts` now holds signed, normalised
# values rather than integer counts (see the array printed further down).
hash_vec = HashingVectorizer(n_features=15).fit(doc)
word_counts = hash_vec.transform(doc)

In [12]:
# Wrap the dense hashed matrix in a DataFrame. No column labels are passed:
# hashed buckets have no token names, so the default integer labels are used.
df = pd.DataFrame(data=word_counts.toarray())

In [87]:
# Dense view of the hashed features: 2 rows x 15 buckets holding signed, normalised values, not integer counts.
word_counts.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , -0.70710678,  0.        ,  0.        ,  0.        ,
         0.        ,  0.70710678,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        , -0.70710678,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.70710678,  0.        ,  0.        ,  0.        ]])

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Fit the TF-IDF vectorizer on the corpus (fit returns the vectorizer itself),
# then project the same sentences into TF-IDF space.
# NOTE(review): `counts` holds TF-IDF weights (floats), not raw term counts.
tfidf_vec = TfidfVectorizer().fit(doc)
counts = tfidf_vec.transform(doc)

In [20]:
# List the fitted vocabulary tokens in alphabetical order; the next cell uses the same list as column labels.
sorted(tfidf_vec.vocabulary_.keys())

['best', 'it', 'of', 'the', 'times', 'was', 'worst']

In [21]:
# Label every column of the TF-IDF matrix with its token.
# vocabulary_ maps token -> column index, so the tokens are ordered by that
# index rather than alphabetically: the labels then stay aligned with the
# matrix columns even if the vectorizer is given a custom vocabulary whose
# order is not alphabetical. For the vocabulary fitted here both orders agree.
df = pd.DataFrame(
    counts.toarray(),
    columns=sorted(tfidf_vec.vocabulary_, key=tfidf_vec.vocabulary_.get),
)

In [22]:
# Render the labelled TF-IDF table built in the previous cell.
df

Unnamed: 0,best,it,of,the,times,was,worst
0,0.532154,0.378632,0.378632,0.378632,0.378632,0.378632,0.0
1,0.0,0.378632,0.378632,0.378632,0.378632,0.378632,0.532154


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
# Two-step pipeline: hash tokens into 15 buckets, then re-weight the hashed
# matrix with TF-IDF. The step names are kept exactly as they are because a
# later cell looks up the 'tfidf' step through named_steps.
# NOTE(review): the hashing step runs with its default settings, so the TF-IDF
# step appears to receive signed, already-normalised values rather than raw
# counts (compare the HashingVectorizer output earlier in the notebook).
# Consider norm=None / alternate_sign=False if count-based TF-IDF is intended
# -- TODO confirm.
tf_hash = Pipeline(
    steps=[
        ('hash vec', HashingVectorizer(n_features=15)),
        ('tfidf', TfidfTransformer()),
    ],
)

In [29]:
# Fit both pipeline steps on the corpus and keep the result as a dense array.
output = (
    tf_hash
    .fit_transform(doc)  # run hashing + TF-IDF end to end
    .toarray()           # densify for display
)

In [31]:
# Wrap the dense pipeline output in a DataFrame with default integer column labels.
df = pd.DataFrame(data=output)

In [32]:
# Render the hashing + TF-IDF pipeline output built in the previous cell.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.0,0.0,0.0,0.0,0.0,-0.707107,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,-0.814802,0.0,0.0,0.0,0.0,0.0,0.0,0.579739,0.0,0.0,0.0


In [13]:
# Show the HashingVectorizer-only table (no TF-IDF re-weighting).
# The table is rebuilt from `word_counts` instead of displaying `df`: by this
# point in the notebook `df` has been rebound to the pipeline output, so on a
# top-to-bottom run a bare `df` would repeat the pipeline table instead of the
# plain hashed features this cell recorded. `word_counts` still holds the
# HashingVectorizer matrix here.
pd.DataFrame(word_counts.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.0,0.0,0.0,0.0,0.0,-0.707107,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,-0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0


In [35]:
# Inspect the inverse-document-frequency weights learned by the pipeline's 'tfidf' step (one per hashed bucket).
tf_hash.named_steps['tfidf'].idf_

array([2.09861229, 2.09861229, 2.09861229, 1.        , 1.40546511,
       2.09861229, 1.        , 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 1.        , 2.09861229, 2.09861229, 2.09861229])