In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [2]:
corpus = [
    'This is the first first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    "This is a document"
]

In [3]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out()) # a is ignored
print(X) # sparse
print(X.toarray()) # dense
print(vectorizer.stop_words_) # a is not in stop words
print(vectorizer.vocabulary_)

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	2
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1
  (4, 8)	1
  (4, 3)	1
  (4, 1)	1
[[0 1 2 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 0 0 0 1]]
set()
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}


In [4]:
# n-grams
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out()) # a is ignored
print(X) # sparse
print(X.toarray()) # dense
print(vectorizer.stop_words_) # a is not in stop words
print(vectorizer.vocabulary_)

['and this' 'document is' 'first document' 'first first' 'is document'
 'is the' 'is this' 'second document' 'the first' 'the second' 'the third'
 'third one' 'this document' 'this is' 'this the']
  (0, 13)	1
  (0, 5)	1
  (0, 8)	1
  (0, 3)	1
  (0, 2)	1
  (1, 5)	1
  (1, 12)	1
  (1, 1)	1
  (1, 9)	1
  (1, 7)	1
  (2, 13)	1
  (2, 5)	1
  (2, 0)	1
  (2, 10)	1
  (2, 11)	1
  (3, 8)	1
  (3, 2)	1
  (3, 6)	1
  (3, 14)	1
  (4, 13)	1
  (4, 4)	1
[[0 0 1 1 0 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 0 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 1 0]]
set()
{'this is': 13, 'is the': 5, 'the first': 8, 'first first': 3, 'first document': 2, 'this document': 12, 'document is': 1, 'the second': 9, 'second document': 7, 'and this': 0, 'the third': 10, 'third one': 11, 'is this': 6, 'this the': 14, 'is document': 4}


In [5]:
# CountVectorizer with predefined vocabulary
# n-grams
vectorizer = CountVectorizer(vocabulary=["document", "first", "second", "third"])
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out()) # a is ignored
print(X) # sparse
print(X.toarray()) # dense
# print(vectorizer.stop_words_) # a is not in stop words
print(vectorizer.vocabulary_)

['document' 'first' 'second' 'third']
  (0, 0)	1
  (0, 1)	2
  (1, 0)	2
  (1, 2)	1
  (2, 3)	1
  (3, 0)	1
  (3, 1)	1
  (4, 0)	1
[[1 2 0 0]
 [2 0 1 0]
 [0 0 0 1]
 [1 1 0 0]
 [1 0 0 0]]
{'document': 0, 'first': 1, 'second': 2, 'third': 3}


In [6]:
# TFidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names_out()) # a is ignored
# print(X) # sparse
print(X.toarray()) # dense
# print(vectorizer.stop_words_) # a is not in stop words
print(vectorizer.vocabulary_)


[[0.         0.29318281 0.83970668 0.24797215 0.         0.
  0.29318281 0.         0.24797215]
 [0.         0.64612571 0.         0.2732445  0.         0.57343426
  0.32306286 0.         0.2732445 ]
 [0.51492278 0.         0.         0.24536346 0.51492278 0.
  0.29009851 0.51492278 0.24536346]
 [0.         0.42712001 0.6116585  0.36125537 0.         0.
  0.42712001 0.         0.36125537]
 [0.         0.64140349 0.         0.54249496 0.         0.
  0.         0.         0.54249496]]
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}


In [7]:
type(print(vectorizer.vocabulary_))

{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}


NoneType

In [8]:
for k, v in vectorizer.vocabulary_:
    print(k)

ValueError: too many values to unpack (expected 2)

In [None]:
# TODO, do csr_matrix manupulation to get top