# Frequency Based Extraction

## Using CountVectorizer

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short documents that the vectorizers below are fitted on.
corpus = [
    "This is the first document",
    "This is the second document",
    "Third document. Document number three",
    "Number four, to repeat number four",
]

In [3]:
# Bag-of-words model: a CountVectorizer with default settings.
vectorize = CountVectorizer()

# Fit on the corpus and keep the resulting document-term matrix.
bag_words = vectorize.fit_transform(corpus)

# Show the matrix summary; the recorded output reports a 4x12 sparse integer
# matrix (4 documents, 12 vocabulary terms) with 18 stored entries.
bag_words

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [4]:
# Printing the sparse matrix lists each stored entry as "(row, column) value":
# row = document index, column = term index, value = count in that document.
print(bag_words)

  (0, 9)	1
  (0, 3)	1
  (0, 7)	1
  (0, 1)	1
  (0, 0)	1
  (1, 9)	1
  (1, 3)	1
  (1, 7)	1
  (1, 0)	1
  (1, 6)	1
  (2, 0)	2
  (2, 8)	1
  (2, 4)	1
  (2, 10)	1
  (3, 4)	2
  (3, 2)	2
  (3, 11)	1
  (3, 5)	1


### How to find the id of words in the corpus

In [5]:
# Look up the column index assigned to the token "document" (0 in the recorded output).
vectorize.vocabulary_.get("document")

0

In [6]:
# Full token -> column-index mapping learned from the corpus (tokens appear lowercased).
vectorize.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 1,
 'document': 0,
 'second': 6,
 'third': 8,
 'number': 4,
 'three': 10,
 'four': 2,
 'to': 11,
 'repeat': 5}

### Convert the bag of words into a DataFrame

In [7]:
import pandas as pd

In [8]:
# Densify the sparse count matrix and label each column with its vocabulary
# term so the counts can be read as a table (one row per document).
pd.DataFrame(
    data=bag_words.toarray(),
    columns=vectorize.get_feature_names_out(),
)

Unnamed: 0,document,first,four,is,number,repeat,second,the,third,this,three,to
0,1,1,0,1,0,0,0,1,0,1,0,0
1,1,0,0,1,0,0,1,1,0,1,0,0
2,2,0,0,0,1,0,0,0,1,0,1,0
3,0,0,2,0,2,1,0,0,0,0,0,1


## Using TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [11]:
# TF-IDF model with default settings, fitted on the same corpus as above.
tf_vectorize = TfidfVectorizer()

# Document-term matrix holding TF-IDF weights instead of raw counts.
bag_word_tf = tf_vectorize.fit_transform(corpus)

In [12]:
# Show the matrix summary; the recorded output reports a 4x12 sparse float
# matrix with 18 stored entries.
bag_word_tf

<4x12 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [13]:
# List each stored entry as "(row, column) weight". In the recorded output,
# column 0 ("document") carries the lowest weight in rows 0 and 1.
print(bag_word_tf)

  (0, 0)	0.3528554929793508
  (0, 1)	0.5528163151092931
  (0, 7)	0.43584673254990375
  (0, 3)	0.43584673254990375
  (0, 9)	0.43584673254990375
  (1, 6)	0.5528163151092931
  (1, 0)	0.3528554929793508
  (1, 7)	0.43584673254990375
  (1, 3)	0.43584673254990375
  (1, 9)	0.43584673254990375
  (2, 10)	0.4850008395708102
  (2, 4)	0.3823802326982809
  (2, 8)	0.4850008395708102
  (2, 0)	0.6191395067937654
  (3, 5)	0.3432724906138499
  (3, 11)	0.3432724906138499
  (3, 2)	0.6865449812276998
  (3, 4)	0.5412799489419371


### Find the corresponding ID for each word in the corpus

In [14]:
# Column index assigned to the token "document" by the TF-IDF vectorizer (0 in the recorded output).
tf_vectorize.vocabulary_.get("document")

0

In [15]:
# Full token -> column-index mapping; the recorded output matches the one
# produced by the CountVectorizer on the same corpus.
tf_vectorize.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 1,
 'document': 0,
 'second': 6,
 'third': 8,
 'number': 4,
 'three': 10,
 'four': 2,
 'to': 11,
 'repeat': 5}

In [16]:
# Densify the sparse TF-IDF matrix and label each column with its vocabulary
# term so the weights can be read as a table (one row per document).
pd.DataFrame(
    data=bag_word_tf.toarray(),
    columns=tf_vectorize.get_feature_names_out(),
)

Unnamed: 0,document,first,four,is,number,repeat,second,the,third,this,three,to
0,0.352855,0.552816,0.0,0.435847,0.0,0.0,0.0,0.435847,0.0,0.435847,0.0,0.0
1,0.352855,0.0,0.0,0.435847,0.0,0.0,0.552816,0.435847,0.0,0.435847,0.0,0.0
2,0.61914,0.0,0.0,0.0,0.38238,0.0,0.0,0.0,0.485001,0.0,0.485001,0.0
3,0.0,0.0,0.686545,0.0,0.54128,0.343272,0.0,0.0,0.0,0.0,0.0,0.343272


## Using HashingVectorizer

In [17]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash tokens into a fixed number of columns. n_features=7 is fewer than the
# 12 distinct tokens the other vectorizers found in this corpus, so some
# tokens necessarily share a column.
hs_vectorize = HashingVectorizer(n_features=7)

# Document-term matrix with 7 hashed feature columns (indices 0-6 in the output).
bag_words_hs = hs_vectorize.fit_transform(corpus)

In [18]:
# List each stored entry as "(row, hashed column) value". The recorded output
# contains negative values.
# NOTE(review): presumably from the default alternate_sign=True; confirm in the docs.
print(bag_words_hs)

  (0, 0)	0.4472135954999579
  (0, 1)	0.4472135954999579
  (0, 2)	-0.4472135954999579
  (0, 5)	-0.4472135954999579
  (0, 6)	-0.4472135954999579
  (1, 0)	0.3779644730092272
  (1, 1)	0.7559289460184544
  (1, 2)	-0.3779644730092272
  (1, 5)	-0.3779644730092272
  (2, 2)	-0.6666666666666666
  (2, 3)	0.3333333333333333
  (2, 5)	0.6666666666666666
  (3, 0)	0.31622776601683794
  (3, 3)	0.31622776601683794
  (3, 5)	0.6324555320336759
  (3, 6)	0.6324555320336759
