In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import DataFrame

In [0]:
def create_document_term_matrix(message_list, vectorizer):
    """Fit ``vectorizer`` on ``message_list`` and return the result as a DataFrame.

    Parameters
    ----------
    message_list : iterable of str
        The documents to vectorize; one row is produced per document.
    vectorizer : object
        A scikit-learn style text vectorizer exposing ``fit_transform`` and
        either ``get_feature_names_out`` or the legacy ``get_feature_names``.
        The vectorizer is (re)fit in place on every call.

    Returns
    -------
    pandas.DataFrame
        Dense document-term matrix with one column per learned feature name.
    """
    doc_term_matrix = vectorizer.fit_transform(message_list)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only for older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return DataFrame(doc_term_matrix.toarray(), columns=feature_names)

In [0]:
# Two-document toy corpus for the bag-of-words example.
msg_1 = [
    "My name is Bhavesh",
    "Please subscribe to my YouTube channel",
]

In [0]:
# Bag-of-words vectorizer constructed with its default settings.
count_vect = CountVectorizer()

In [0]:
# Fit count_vect on msg_1 and display the resulting document-term matrix.
create_document_term_matrix(msg_1, count_vect)

Unnamed: 0,bhavesh,channel,is,my,name,please,subscribe,to,youtube
0,1,0,1,1,1,0,0,0,0
1,0,1,0,1,0,1,1,1,1


# TF-IDF

## Term Frequency (tf)



$$
\mathrm{tf}_{i, j}=\frac{n_{i, j}}{\sum_{k} n_{k, j}}
$$

## Inverse Document Frequency (idf)

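$$
\mathrm{idf}(w)=\log \left(\frac{N}{\mathrm{df}_{w}}\right)
$$

Here $N$ is the number of documents and $\mathrm{df}_{w}$ is the number of documents containing the term $w$. Note that scikit-learn's `TfidfVectorizer` defaults to a smoothed variant, $\mathrm{idf}(w)=\ln\left(\frac{1+N}{1+\mathrm{df}_{w}}\right)+1$, and L2-normalises each row, so the values below differ from the textbook formula.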

In [0]:
# Toy corpus in which "Bhavesh" appears once in each document.
msg_2 = [
    "Bhavesh is my name",
    "Bhavesh likes Python programming language",
]

In [0]:
# TF-IDF vectorizer constructed with its default settings; the cells below reuse it.
tfidf_vect = TfidfVectorizer()

In [0]:
# Fit tfidf_vect on msg_2 and display the resulting TF-IDF matrix.
create_document_term_matrix(msg_2, tfidf_vect)

Unnamed: 0,bhavesh,is,language,likes,my,name,programming,python
0,0.379978,0.534046,0.0,0.0,0.534046,0.534046,0.0,0.0
1,0.335176,0.0,0.471078,0.471078,0.0,0.0,0.471078,0.471078


In [0]:
# Same as msg_2, except "Bhavesh" is repeated three times in the first document.
msg_3 = [
    "Bhavesh Bhavesh Bhavesh is my name",
    "Bhavesh likes Python programming language",
]

In [0]:
# The helper calls fit_transform, so the shared tfidf_vect is fit again on msg_3.
# Repeating "Bhavesh" in document 0 raises its weight there relative to msg_2.
create_document_term_matrix(msg_3, tfidf_vect)

Unnamed: 0,bhavesh,is,language,likes,my,name,programming,python
0,0.776515,0.363788,0.0,0.0,0.363788,0.363788,0.0,0.0
1,0.335176,0.0,0.471078,0.471078,0.0,0.0,0.471078,0.471078


In [0]:
# Corpus in which "Bhavesh" occurs only in the first document.
msg_4 = [
    "Bhavesh Bhavesh Bhavesh is my name",
    "I like Python programming language",
]

In [0]:
# The helper calls fit_transform, so the shared tfidf_vect is fit again on msg_4.
# The two documents now share no terms, so each row is non-zero only in its own columns.
create_document_term_matrix(msg_4, tfidf_vect)

Unnamed: 0,bhavesh,is,language,like,my,name,programming,python
0,0.866025,0.288675,0.0,0.0,0.288675,0.288675,0.0,0.0
1,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5
