## CountVectorizer

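+ CountVectorizer converts a collection of text documents into a matrix of token counts (one row per document, one column per vocabulary word) so the text can be used with models.
+ Each entry is the number of times a particular word appears in that document.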

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Three short example documents; each becomes one row of the count matrix.
docs = ['Brendan is a nice boy', 'Brendan is the man!', 'Brendan is a python expert']
cv = CountVectorizer()
# Learn the vocabulary from the documents and build the sparse count matrix in one step.
X = cv.fit_transform(docs)
# toarray() yields a plain ndarray; todense() would return the discouraged
# numpy.matrix type. The printed text is the same either way.
print(X.toarray())
# vocabulary_ maps each token to its column index in X. As the printed mapping
# shows, tokens are lower-cased and the one-letter word 'a' is not kept.
print(cv.vocabulary_)

[[1 1 0 1 0 1 0 0]
 [0 1 0 1 1 0 0 1]
 [0 1 1 1 0 0 1 0]]
{'brendan': 1, 'is': 3, 'nice': 5, 'boy': 0, 'the': 7, 'man': 4, 'python': 6, 'expert': 2}


## DictVectorizer
+ DictVectorizer converts a list of mappings (dicts of feature name → value) into a matrix with one row per mapping and one column per feature name.


In [6]:
from sklearn.feature_extraction import DictVectorizer

# Two feature dicts (feature name -> numeric value); each becomes one row of X.
docs = [{"Brendan": 1, "is":1, "awesome": 2},{"No": 1, "I": 1, "don't":2, "wanna":3, "fall":1, "in":2, "love":3}]
dv = DictVectorizer()
# Learn the set of feature names and build the sparse feature matrix.
X = dv.fit_transform(docs)
# toarray() yields a plain ndarray; todense() would return the discouraged
# numpy.matrix type. The printed text is the same either way.
# Columns follow the sorted feature names (DictVectorizer's default sort=True):
# Brendan, I, No, awesome, don't, fall, in, is, love, wanna.
# A feature missing from a dict shows up as 0 in that row.
print(X.toarray())

[[1. 0. 0. 2. 0. 0. 0. 1. 0. 0.]
 [0. 1. 1. 0. 2. 1. 2. 0. 3. 3.]]


## TfidfVectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Fit a tf-idf vectorizer and a plain count vectorizer on the same documents
# so their outputs can be compared side by side.
tfidf_vectorizer = TfidfVectorizer()
cv_vectorizer = CountVectorizer()
docs = ['Brendan is Guitarist', "Brendan is Musician", 'Brendan is also a programmer']
X_idf = tfidf_vectorizer.fit_transform(docs)
X_cv = cv_vectorizer.fit_transform(docs)
# toarray() yields a plain ndarray; todense() would return the discouraged
# numpy.matrix type. The printed text is the same either way.
# In the tf-idf matrix, words found in every document ('brendan', 'is') get a
# lower weight than words unique to one document.
print(X_idf.toarray())
# Token -> column index mapping for the columns of the tf-idf matrix.
print(tfidf_vectorizer.vocabulary_)
# Raw occurrence counts for the same documents, for comparison.
print(X_cv.toarray())

[[0.         0.45329466 0.76749457 0.45329466 0.         0.        ]
 [0.         0.45329466 0.         0.45329466 0.76749457 0.        ]
 [0.6088451  0.35959372 0.         0.35959372 0.         0.6088451 ]]
{'brendan': 1, 'is': 3, 'guitarist': 2, 'musician': 4, 'also': 0, 'programmer': 5}
[[0 1 1 1 0 0]
 [0 1 0 1 1 0]
 [1 1 0 1 0 1]]
