In [1]:
#Implement Bag Of Words

In [18]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from tqdm import tqdm
import os
from collections import Counter
from scipy.sparse import csr_matrix

In [19]:
#write fit function to find the unique words in the sentences and assign each word a dimension; store them in a
#dictionary with the words as keys and their column indexes (dimensions) as values

#for ease the input to fit function is a list of sentences (strings)

def fit(dataset):
    """Build a bag-of-words vocabulary from a collection of sentences.

    Every sentence is split on runs of whitespace (the same tokenisation
    ``transform`` applies via ``row.split()``), tokens shorter than two
    characters are discarded, and the remaining unique tokens are sorted
    and numbered starting from 0.

    Parameters
    ----------
    dataset : iterable of str
        Sentences to scan for words.

    Returns
    -------
    dict
        Maps each unique token to its column index (position in sorted order).
    """
    # split() with no argument breaks on any whitespace run (spaces, tabs,
    # newlines); split(" ") would leave tokens such as "hello\tworld" fused,
    # and transform() could then never find them in the vocabulary.
    word_unique = {
        word
        for row in dataset
        for word in row.split()
        if len(word) >= 2          # avoids punctuation and single characters as words
    }
    return {word: index for index, word in enumerate(sorted(word_unique))}
        
    
    

In [20]:
#test fit function
vocab = fit(["test bag of words", "this is to test fit function of bag of words"])
print(vocab)
# Make the check self-verifying so a regression in fit() fails on Run All
# instead of relying on someone eyeballing the printed dictionary.
assert vocab == {'bag': 0, 'fit': 1, 'function': 2, 'is': 3, 'of': 4,
                 'test': 5, 'this': 6, 'to': 7, 'words': 8}, "fit() returned an unexpected vocabulary"

{'bag': 0, 'fit': 1, 'function': 2, 'is': 3, 'of': 4, 'test': 5, 'this': 6, 'to': 7, 'words': 8}


In [42]:
#create a sparse matrix containing frequency of the words
def transform(text, vocab):
    """Convert a list of sentences into a sparse matrix of word counts.

    Row ``i`` of the result corresponds to ``text[i]``; column ``vocab[word]``
    holds the number of times ``word`` occurs in that sentence. Words shorter
    than two characters, or missing from ``vocab``, are skipped.

    Parameters
    ----------
    text : list of str
        Sentences to vectorise.
    vocab : dict
        Word -> column index mapping.

    Returns
    -------
    scipy.sparse.csr_matrix or None
        Matrix of shape ``(len(text), len(vocab))``. If ``text`` is not a
        list, a message is printed and ``None`` is returned.
    """
    # Guard clause: anything other than a list is rejected with a message.
    if not isinstance(text, list):
        print("you need to pass list of strings")
        return None

    row_ids, col_ids, counts = [], [], []
    for doc_idx, sentence in enumerate(tqdm(text)):
        for token, count in Counter(sentence.split()).items():
            if len(token) < 2:
                continue
            column = vocab.get(token, -1)   # -1 marks a token absent from the vocabulary
            if column == -1:
                continue
            row_ids.append(doc_idx)
            col_ids.append(column)
            counts.append(count)

    return csr_matrix((counts, (row_ids, col_ids)), shape=(len(text), len(vocab)))
        
    
    
    

In [43]:
# Two example sentences used as a tiny corpus.
strings = [
    "the method of lagrange multipliers is the economists workhorse for solving optimization problems",
    "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly",
]
vocab = fit(strings)
print(list(vocab))                            # vocabulary words, in column order
print(transform(strings, vocab).toarray())    # dense view of the sparse count matrix

100%|██████████| 2/2 [00:00<00:00, 6875.91it/s]

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']
[[0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 2 0 0 0 1]
 [1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0]]





In [46]:
#Comparing results with count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer='word')

vec.fit(strings)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vec, "get_feature_names_out"):
    # returns an ndarray; convert to a plain list so the printout keeps the list format
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
print(feature_names)
feature_matrix_2 = vec.transform(strings)
print(feature_matrix_2.toarray())

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']
[[0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 2 0 0 0 1]
 [1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0]]
