In [2]:
import pandas as pd

In [3]:
# Toy labelled corpus: four short sentences ("text") with a binary label ("output").
data = pd.DataFrame(
    {
        "text": [
            "people watch cricket",
            "cricket watch cricket",
            "people give comment",
            "cricket give comment",
        ],
        "output": [1, 1, 0, 0],
    }
)

In [4]:
# Plain-text view of the toy dataset (sentences and their labels).
print(data)

                    text  output
0   people watch cricket       1
1  cricket watch cricket       1
2    people give comment       0
3   cricket give comment       0


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# The "text" column holds the sentences that the vectorizer is fitted on.
data["text"]

0     people watch cricket
1    cricket watch cricket
2      people give comment
3     cricket give comment
Name: text, dtype: object

In [7]:
# Bag-of-words vectorizer created with default settings.
BOW = CountVectorizer()

In [8]:
# Learn the vocabulary from the sentences and encode each one as a row of word counts.
document_matrix = BOW.fit_transform(data["text"])

In [9]:
# Display the sparse matrix summary (dtype, stored elements, shape).
document_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (4, 5)>

In [10]:
# Mapping of each learned word to its column index in the count matrix.
BOW.vocabulary_ # the values 0, 1, 2, ... are column indexes
# comment, cricket, give, people, watch --> the indexes follow alphabetical order

{'people': 3, 'watch': 4, 'cricket': 1, 'give': 2, 'comment': 0}

In [11]:
# Dense count vector of the first sentence ("people watch cricket").
document_matrix[0].toarray()

array([[0, 1, 0, 1, 1]])

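### Each column corresponds to a vocabulary index: the words are arranged in alphabetical order (comment, cricket, give, people, watch), not by frequency, and each value is the number of times that word occurs in the document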

In [12]:
# Dense count vector of the second sentence ("cricket watch cricket"); "cricket" occurs twice.
document_matrix[1].toarray()

array([[0, 2, 0, 0, 1]])

In [13]:
# Print the dense count vector of every document, one document per line.
# The row count is read from the matrix itself, so the loop stays correct
# if the corpus grows or shrinks (previously hard-coded as 4).
for i in range(document_matrix.shape[0]):
    print(i, "--", document_matrix[i].toarray())

0 -- [[0 1 0 1 1]]
1 -- [[0 2 0 0 1]]
2 -- [[1 0 1 1 0]]
3 -- [[1 1 1 0 0]]


In [14]:
# ngram_range=(2,2): minimum and maximum n are both 2, so only two-word phrases are counted.
bigram = CountVectorizer(ngram_range=(2,2))

In [15]:
# Fit on the same sentences; the resulting sparse matrix is displayed but not stored.
bigram.fit_transform(data["text"])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8 stored elements and shape (4, 6)>

In [16]:
# Learned two-word phrases and their column indexes.
bigram.vocabulary_

{'people watch': 4,
 'watch cricket': 5,
 'cricket watch': 1,
 'people give': 3,
 'give comment': 2,
 'cricket give': 0}

In [17]:
# ngram_range=(3,3): minimum and maximum n are both 3, so only three-word phrases are counted.
trigram = CountVectorizer(ngram_range=(3,3))

In [18]:
# Fit on the same sentences; the resulting sparse matrix is displayed but not stored.
trigram.fit_transform(data["text"])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4 stored elements and shape (4, 4)>

In [19]:
# Learned three-word phrases and their column indexes (each sentence here is exactly three words).
trigram.vocabulary_

{'people watch cricket': 3,
 'cricket watch cricket': 1,
 'people give comment': 2,
 'cricket give comment': 0}

#### CountVectorizer builds n-grams only from consecutive words in their original order; it never skips words (for example, it will not pair the first word with the third)

In [20]:
# Four short example documents for a second, slightly larger corpus.
doc1, doc2, doc3, doc4 = (
    "This is the first document",
    "This is the second document",
    "Third document",
    "Fourth Document is written",
)

In [21]:
# Collect the four documents into a list, preserving their order.
corpus = [doc1,doc2,doc3,doc4]

In [22]:
# Display the corpus.
corpus

['This is the first document',
 'This is the second document',
 'Third document',
 'Fourth Document is written']

In [23]:
# Wrap the corpus in a single-column DataFrame; no column name is given, so the column label is 0.
df = pd.DataFrame(corpus)

In [24]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,0
0,This is the first document
1,This is the second document
2,Third document
3,Fourth Document is written


In [25]:
# CountVectorizer with default settings for the second corpus.
cv = CountVectorizer()

In [26]:
# Column 0 holds the documents; fit on it and display the sparse count matrix.
cv.fit_transform(df[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 16 stored elements and shape (4, 9)>

In [27]:
# Word -> column index mapping; the keys are lowercase even though the input contained capitals.
cv.vocabulary_

{'this': 7,
 'is': 3,
 'the': 5,
 'first': 1,
 'document': 0,
 'second': 4,
 'third': 6,
 'fourth': 2,
 'written': 8}

In [28]:
cv1 = CountVectorizer(ngram_range=(1,3)) # mixed vocabulary: one-, two- and three-word phrases together

In [29]:
# Fit the mixed n-gram vectorizer on the documents and display the sparse count matrix.
cv1.fit_transform(df[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 36 stored elements and shape (4, 26)>

In [30]:
# Vocabulary of one-, two- and three-word phrases with their column indexes.
cv1.vocabulary_

{'this': 22,
 'is': 8,
 'the': 15,
 'first': 3,
 'document': 0,
 'this is': 23,
 'is the': 9,
 'the first': 16,
 'first document': 4,
 'this is the': 24,
 'is the first': 10,
 'the first document': 17,
 'second': 13,
 'the second': 18,
 'second document': 14,
 'is the second': 11,
 'the second document': 19,
 'third': 20,
 'third document': 21,
 'fourth': 5,
 'written': 25,
 'fourth document': 6,
 'document is': 1,
 'is written': 12,
 'fourth document is': 7,
 'document is written': 2}

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
# Redisplay the toy dataset before applying TF-IDF to it.
data

Unnamed: 0,text,output
0,people watch cricket,1
1,cricket watch cricket,1
2,people give comment,0
3,cricket give comment,0


In [33]:
# Text column that the TF-IDF vectorizer is fitted on.
data["text"]

0     people watch cricket
1    cricket watch cricket
2      people give comment
3     cricket give comment
Name: text, dtype: object

In [34]:
# TF-IDF vectorizer created with default arguments.
tfidf = TfidfVectorizer()

In [None]:
# Dense TF-IDF matrix: one row per sentence, one column per vocabulary word.
tfidf.fit_transform(data["text"]).toarray()
# column order: comment -- cricket -- give -- people -- watch

array([[0.        , 0.49681612, 0.        , 0.61366674, 0.61366674],
       [0.        , 0.8508161 , 0.        , 0.        , 0.52546357],
       [0.57735027, 0.        , 0.57735027, 0.57735027, 0.        ],
       [0.61366674, 0.49681612, 0.61366674, 0.        , 0.        ]])

In [38]:
# Column labels of the TF-IDF matrix, i.e. the vocabulary in alphabetical order.
feature_names = tfidf.get_feature_names_out()
print(f"Feature Names : {feature_names}")

Feature Names : ['comment' 'cricket' 'give' 'people' 'watch']


In [39]:
# Learned inverse-document-frequency weight per vocabulary word.
# "cricket" (index 1) appears in the most sentences and gets the smallest weight.
tfidf.idf_

array([1.51082562, 1.22314355, 1.51082562, 1.51082562, 1.51082562])

In [43]:
# Stop-word list configured on the vectorizer; `tfidf` was created with default arguments.
# NOTE(review): the name `tf` is easy to misread as "term frequency" — it holds the stop words.
tf = tfidf.get_stop_words()

In [45]:
# Show the value returned by get_stop_words().
print(tf)

None


In [51]:
import numpy as np

# Hand-computed textbook TF-IDF for "cricket" in a sentence where it occurs once
# among three words (e.g. "people watch cricket"):
#   tf  = occurrences of the term / words in the sentence      = 1 / 3
#   idf = ln(number of sentences / sentences containing term)  = ln(4 / 3)
# This does NOT match the TfidfVectorizer output for the same corpus: the idf_
# values shown there equal ln((1 + 4) / (1 + df)) + 1 (smoothed idf), and each
# row of the resulting matrix is scaled to unit L2 norm.
n_docs = 4           # sentences in the corpus
docs_with_term = 3   # sentences that contain "cricket"
term_count = 1       # occurrences of "cricket" in the chosen sentence
doc_length = 3       # words in the chosen sentence

# Same evaluation order as the original expression: (ln(4/3) * 1) / 3.
manual_tfidf = np.log(n_docs / docs_with_term) * term_count / doc_length
manual_tfidf

np.float64(0.09589402415059362)