In [21]:
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [22]:
stop_words= set(stopwords. words("english"))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

To extract words and remove stopwords from a text document

In [23]:
def extract_words(sentence):
    words=nltk.word_tokenize(sentence)
    cleaned_text= [w.lower() for w in words if w not in stop_words]
    return cleaned_text

To build a vocabulary, remove duplicates and return sorted list of words

In [24]:
def tokenize_sentence(sentences):
    words= []
    for sentence in sentences:
        w= extract_words(sentence)
        words.extend(w)
    words= sorted(list(set(words)))
    return words

To create a numpy array of frequent counts

In [25]:
def bag_of_words(sentence, words):
    sentence_words= extract_words(sentence)
    # frequent word count
    bag=np.zeros(len(words))
    for sw in sentence_words:
        for i, word in enumerate(words):
            if word == sw:
                bag[i] +=1
    return np.array(bag)

To concert sentences to vectors using bag_of_words

In [26]:
# To generate a vocabulary from text document
corpus= (["John saw the train",
        "The train was late",
        "Max and Rob took the bus",
        "I looked for Max and Rob at the bus station",
        "Max and Rob arrived at the bus station early but waited until noon for the bus"])
vocabulary= tokenize_sentence(corpus)
print(vocabulary)

['arrived', 'bus', 'early', 'i', 'john', 'late', 'looked', 'max', 'noon', 'rob', 'saw', 'station', 'the', 'took', 'train', 'waited']


In [27]:
bag_of_words("Max and Rob took the bus", vocabulary)

array([0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.])

In [28]:
bag_of_words("Max and Rob arrived at the bus station early but waited until noon for the bus", vocabulary)

array([1., 2., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1.])

In [29]:
bag_of_words("John saw the train", vocabulary)

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.])

# To create a bag of word matrix

In [15]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [30]:
text_data=np.array(["John saw the train",
        "The train was late",
        "Max and Rob took the bus",
        "I looked for Max and Rob at the bus station",
        "Max and Rob arrived at the bus station early but waited until noon for the bus"])
count= CountVectorizer()

In [32]:
bag_of_words= count.fit_transform(text_data)
bag_of_words

<5x21 sparse matrix of type '<class 'numpy.int64'>'
	with 37 stored elements in Compressed Sparse Row format>

Show feature matrix

In [33]:
bag_of_words.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 0, 1, 1, 0]],
      dtype=int64)

To print the feature name

In [35]:
#Get the feature name
feature_name= count.get_feature_names()
feature_name

['and',
 'arrived',
 'at',
 'bus',
 'but',
 'early',
 'for',
 'john',
 'late',
 'looked',
 'max',
 'noon',
 'rob',
 'saw',
 'station',
 'the',
 'took',
 'train',
 'until',
 'waited',
 'was']

To view as a dataframe

In [39]:
pd.DataFrame(bag_of_words.toarray(), columns= feature_name)

Unnamed: 0,and,arrived,at,bus,but,early,for,john,late,looked,...,noon,rob,saw,station,the,took,train,until,waited,was
0,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,1
2,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
3,1,0,1,1,0,0,1,0,0,1,...,0,1,0,1,1,0,0,0,0,0
4,1,1,1,2,1,1,1,0,0,0,...,1,1,0,1,2,0,0,1,1,0
