In [1]:
from nltk.tokenize import word_tokenize
import pandas as pd

In [2]:
sent1 = 'It is a good practice for us.'
sent2 = 'It was also good to know about it.'

In [3]:
bow = {}

# Tokenize the sentences

tokens1 = word_tokenize(sent1.lower())
tokens2 = word_tokenize(sent2.lower())

In [4]:
# tokens1 = list(map(str.lower, tokens1))
# tokens2 = list(map(str.lower, tokens2))

In [5]:
tokens = tokens1 + tokens2
tokens

['it',
 'is',
 'a',
 'good',
 'practice',
 'for',
 'us',
 '.',
 'it',
 'was',
 'also',
 'good',
 'to',
 'know',
 'about',
 'it',
 '.']

In [7]:
bow = list(set(tokens))
bow

['also',
 'about',
 'a',
 'us',
 'is',
 'for',
 'practice',
 'it',
 'to',
 'know',
 'was',
 '.',
 'good']

In [8]:
bow_df = pd.DataFrame(index=[1,2], columns=bow)
bow_df

Unnamed: 0,also,about,a,us,is,for,practice,it,to,know,was,.,good
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,


In [9]:
counts1 = [tokens1.count(word) for word in bow_df.columns] # count words in sentence 1
counts2 = [tokens2.count(word) for word in bow_df.columns] # count word for sentence 2

In [10]:
bow_df.iloc[0,:] = counts1 # filling the first row with counts1
bow_df.iloc[1,:] = counts2 # filling the second row with count2

In [11]:
bow_df

Unnamed: 0,also,about,a,us,is,for,practice,it,to,know,was,.,good
1,0,0,1,1,1,1,1,1,0,0,0,1,1
2,1,1,0,0,0,0,0,2,1,1,1,1,1


### Second method

In [12]:
from collections import Counter

In [13]:
bow_df1 = pd.DataFrame(index=[1,2], columns=bow)
bow_df1

Unnamed: 0,also,about,a,us,is,for,practice,it,to,know,was,.,good
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,


In [14]:
bow_df1.fillna(0, inplace=True)
bow_df1

  bow_df1.fillna(0, inplace=True)


Unnamed: 0,also,about,a,us,is,for,practice,it,to,know,was,.,good
1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0


### OR

In [16]:
counted1 = Counter(tokens1)
counted1

Counter({'it': 1,
         'is': 1,
         'a': 1,
         'good': 1,
         'practice': 1,
         'for': 1,
         'us': 1,
         '.': 1})

In [17]:
counted2 = Counter(tokens2)
counted2

Counter({'it': 2,
         'was': 1,
         'also': 1,
         'good': 1,
         'to': 1,
         'know': 1,
         'about': 1,
         '.': 1})

In [18]:
for k in counted1:
    bow_df1.loc[1,k] += counted1[k]

bow_df1

Unnamed: 0,also,about,a,us,is,for,practice,it,to,know,was,.,good
1,0,0,1,1,1,1,1,1,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
for k in counted2:
    bow_df1.loc[2,k] += counted2[k]

bow_df1

Unnamed: 0,also,about,a,us,is,for,practice,it,to,know,was,.,good
1,0,0,1,1,1,1,1,1,0,0,0,1,1
2,1,1,0,0,0,0,0,2,1,1,1,1,1


### Third method

### Using built in library

In [20]:
# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# instantiate CountVectorizer
cvt = CountVectorizer()

In [28]:
x_new = cvt.fit_transform([sent1, sent2])

In [30]:
x_new

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [148]:
x_new.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]], dtype=int64)

In [172]:
cvt.get_feature_names_out()

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [None]:
# making a dataframe with the data from cvt.fit_transform and columns as unique names
df = pd.DataFrame(data=x_new.toarray(), columns=cvt.get_feature_names_out())
df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


In [None]:
new = 'It was good for us.'

In [179]:
new_features = cvt.transform([new])
new_features.toarray()

array([[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]], dtype=int64)

In [182]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |
 |  Convert a collection of text documents to a matrix of token counts.
 |
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |
 |  For an efficiency comparison of the different feature extractors, see
 |  :ref:`sphx_glr_auto_examp

### N-gram df using CountVectorizer for counting tokens with the given range of words

In [None]:
ngram = CountVectorizer(ngram_range=(1,2)) # one word frequency or 2 word frequency per token

In [None]:
new_ng = ngram.fit_transform([sent1, sent2])

In [None]:
ndf = pd.DataFrame(data=new_ng.toarray(), columns=ngram.get_feature_names_out())

In [188]:
ndf

Unnamed: 0,about,about it,also,also good,for,for us,good,good practice,good to,is,...,it was,know,know about,practice,practice for,to,to know,us,was,was also
0,0,0,0,0,1,1,1,1,0,1,...,0,0,0,1,1,0,0,1,0,0
1,1,1,1,1,0,0,1,0,1,0,...,1,1,1,0,0,1,1,0,1,1


In [189]:
ngram = CountVectorizer(ngram_range=(2,2))
new_ng = ngram.fit_transform([sent1, sent2])


In [190]:
ndf = pd.DataFrame(data=new_ng.toarray(), columns=ngram.get_feature_names_out())
ndf

Unnamed: 0,about it,also good,for us,good practice,good to,is good,it is,it was,know about,practice for,to know,was also
0,0,0,1,1,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,1,0,1,1
