# Count Vectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
# Two toy sentences that make up the entire corpus for this demo.
document1, document2 = (
    'Dog hates a cat It loves to go out and play',
    'Cat loves to play with a ball',
)

In [2]:
# Lower-case both sentences so the text handed to the vectorizer is uniformly cased.
document1, document2 = document1.lower(), document2.lower()

In [3]:
# Gather the two documents into the corpus the bag-of-words model is fitted on.
corpus = [document1, document2]
# Initialize the bag-of-words (CountVectorizer) model.
count_vect = CountVectorizer()
# fit() stays the last expression so the cell still displays the fitted estimator.
count_vect.fit(corpus)

CountVectorizer()

In [4]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so .tolist() keeps the printed value a plain list.
print("feature names ", count_vect.get_feature_names_out().tolist())

feature names  ['and', 'ball', 'cat', 'dog', 'go', 'hates', 'it', 'loves', 'out', 'play', 'to', 'with']


In [5]:
# Encode document1 as term counts over the fitted vocabulary.
bow1 = count_vect.transform([document1])
# Convert the result with toarray() so every count is visible when printed.
dense_bow1 = bow1.toarray()
print("Representation of document1: ", dense_bow1)

Representation of document1:  [[1 0 1 1 1 1 1 1 1 1 1 0]]


In [6]:
# Encode document2 as term counts over the fitted vocabulary.
bow2 = count_vect.transform([document2])
# Convert the result with toarray() so every count is visible when printed.
dense_bow2 = bow2.toarray()
print("Representation of document2: ", dense_bow2)

Representation of document2:  [[0 1 1 0 0 0 0 1 0 1 1 1]]


# TF-IDF Vectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Re-create the two original (mixed-case) sentences for the TF-IDF section.
document1, document2 = (
    'Dog hates a cat It loves to go out and play',
    'Cat loves to play with a ball',
)

In [8]:
# Lower-case both sentences so the text handed to the vectorizer is uniformly cased.
document1, document2 = document1.lower(), document2.lower()

In [9]:
# Gather the two documents into the corpus the TF-IDF model is fitted on.
corpus = [document1, document2]
# Initialize the TfidfVectorizer.
tfidf_vect = TfidfVectorizer()
# fit() stays the last expression so the cell still displays the fitted estimator.
tfidf_vect.fit(corpus)

TfidfVectorizer()

In [10]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so .tolist() keeps the printed value a plain list.
print("feature names ", tfidf_vect.get_feature_names_out().tolist())

feature names  ['and', 'ball', 'cat', 'dog', 'go', 'hates', 'it', 'loves', 'out', 'play', 'to', 'with']


In [11]:
# Encode document1 as tf-idf weights over the fitted vocabulary.
tfidf1 = tfidf_vect.transform([document1])
# Convert the result with toarray() so every weight is visible when printed.
dense_tfidf1 = tfidf1.toarray()
print("Representation of document1: ", dense_tfidf1)

Representation of document1:  [[0.35300279 0.         0.25116439 0.35300279 0.35300279 0.35300279
  0.35300279 0.25116439 0.35300279 0.25116439 0.25116439 0.        ]]


In [12]:
# Encode document2 as tf-idf weights over the fitted vocabulary.
tfidf2 = tfidf_vect.transform([document2])
# Convert the result with toarray() so every weight is visible when printed.
dense_tfidf2 = tfidf2.toarray()
print("Representation of document2: ", dense_tfidf2)

Representation of document2:  [[0.         0.49844628 0.35464863 0.         0.         0.
  0.         0.35464863 0.         0.35464863 0.35464863 0.49844628]]


# N-gram Vectorizer (unigrams + bi-grams)

In [13]:
# Gather the two documents into the corpus the n-gram model is fitted on.
corpus = [document1, document2]
# Initialize a bag-of-words model with ngram_range=(1, 2), i.e. unigrams AND bigrams.
# NOTE: this rebinds count_vect, replacing the vectorizer created earlier in the notebook.
count_vect = CountVectorizer(ngram_range=(1, 2))
# fit() stays the last expression so the cell still displays the fitted estimator.
count_vect.fit(corpus)

CountVectorizer(ngram_range=(1, 2))

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so .tolist() keeps the printed value a plain list.
print("feature names ", count_vect.get_feature_names_out().tolist())

feature names  ['and', 'and play', 'ball', 'cat', 'cat it', 'cat loves', 'dog', 'dog hates', 'go', 'go out', 'hates', 'hates cat', 'it', 'it loves', 'loves', 'loves to', 'out', 'out and', 'play', 'play with', 'to', 'to go', 'to play', 'with', 'with ball']


In [15]:
# Encode document1 as counts over the fitted unigram + bigram vocabulary.
bow1 = count_vect.transform([document1])
# Convert the result with toarray() so every count is visible when printed.
dense_bow1 = bow1.toarray()
print("Representation of document1: ", dense_bow1)

Representation of document1:  [[1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0]]


In [16]:
# Encode document2 as counts over the fitted unigram + bigram vocabulary.
bow2 = count_vect.transform([document2])
# Convert the result with toarray() so every count is visible when printed.
dense_bow2 = bow2.toarray()
print("Representation of document2: ", dense_bow2)

Representation of document2:  [[0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1]]
