### Bag of Words
Bag of words is a technique to convert text documents into numeric format where we represent each document as a numeric vector which is equal to the length of the vocabulary. We count the word frequency of each word in a sentence and replace that count with the corresponding word index in that vector.

The resultant document term matrix will contain rows equal to the number of documents in the matrix and columns equal to the vocabulary size.

Advantages: Very simple and efficient way to represent text in vector format.

Drawbacks: We lose word ordering because while we count word frequency, we do not take word ordering into account, which might affect the meaning of a sentence.

In [1]:
import re
import numpy as np

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prasa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [4]:
class BagOfWordsVectorizer:
    def __init__(self, to_lower=True, regEx=None, remove_stopwords=False, preprocessor=None, tokenizer=nltk.tokenize.TreebankWordTokenizer()):
        self._tokenizer = tokenizer
        self._to_lower = to_lower
        self._remove_stopwords = remove_stopwords
        self._PUNCTUATION_RE = '[,.?!;:]'
        if regEx is None:
            self._regEx = self._PUNCTUATION_RE
        else:
            self.regEx = regEx
        self.vocab_ = []
        self._preprocessor = preprocessor             
        
    """ preprocess given text, learns vocabulary """        
    def fit(self, X):
        X_processed = []
        if self._preprocessor is not None:
            X_processed = [self._preprocessor(doc) for doc in X]
        else:
            X_processed = [self._preprocess(doc) for doc in X]
        self.vocab_ = self._get_vocab(X_processed)        

    """ transforms given text to document term matrix, 
        based on the vocab learned from fit method """        
    def transform(self, X):
        result = []
        for doc in X:
            vect = [0]*len(self.vocab_)
            if type(doc)==np.ndarray:
                doc = doc[0]
            words = doc.split()
            for i, v in enumerate(self.vocab_):
                freq = words.count(v)
                vect[i] = freq
            result.append(vect)
        return np.array(result)  

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)
    
    # convert text to lower case, removes punctuaions, tokenizes, remove stop words
    def _preprocess(self, doc):        
        result = []
        if self._to_lower:
            if type(doc)==np.ndarray:
                doc = doc[0]
            doc = doc.lower()
        doc = re.sub(self._regEx, '', doc)
        tokens = self._tokenizer.tokenize(doc)
        for token in tokens:
            if self._remove_stopwords:
                if token not in stop_words:
                    result.append(token)
            else:
                result.append(token)
        return " ".join(result) 
    
    # creates vocabulary from the processed text 
    def _get_vocab(self, X):
        result = []
        for doc in X:           
            tokens = str(doc).split()
            for token in tokens:
                if token not in result:
                    result.append(token)
        return sorted(result) 

In [5]:
# Let's take a sample corpus and analyse bag of words model

In [6]:
corpus = [['This is movie is a nice movie', 1],
          ['Movie is very good', 1],
          ['a very bad movie, really bad', 0],
          ['this is not a good movie', 0],
          ['do not watch this movie', 0]]

In [7]:
import pandas as pd

In [8]:
#load the data into pandas
data = pd.DataFrame(corpus, columns=['review', 'label'])

In [9]:
data.head()

Unnamed: 0,review,label
0,This is movie is a nice movie,1
1,Movie is very good,1
2,"a very bad movie, really bad",0
3,this is not a good movie,0
4,do not watch this movie,0


In [10]:
# get the reivews from dataframe

In [11]:
X = data.iloc[:,0:1].values

In [12]:
X.shape

(5, 1)

In [13]:
#get the labels from dataframe
y = data.iloc[:,1].values

In [14]:
y.shape

(5,)

In [15]:
#Proprocess raw text data

In [16]:
#instantiate, fit, transform
bowVect = BagOfWordsVectorizer(to_lower=True, remove_stopwords=True)

In [17]:
X_processed = bowVect.fit_transform(X)

In [18]:
X_processed

array([[0, 0, 2, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [2, 0, 0, 0, 1, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 1]])

In [19]:
# lets visualzie the document term matrix

In [20]:
dtm = pd.DataFrame(X_processed, columns=bowVect.vocab_)

In [21]:
dtm.head()

Unnamed: 0,bad,good,movie,nice,really,watch
0,0,0,2,1,0,0
1,0,1,0,0,0,0
2,2,0,0,0,1,0
3,0,1,1,0,0,0
4,0,0,1,0,0,1


#### Note:  This document term matrix can be used to train a classifer and make predictions

In [22]:
corpus = ['This movie is a nice movie', 'Movie is very good', 'a very bad movie really bad']          

In [23]:
bowVect = BagOfWordsVectorizer()

In [24]:
bowVect.fit_transform(corpus2)

array([[1, 0, 0, 1, 2, 1, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 2, 0, 0, 1, 0, 1, 0, 1]])

In [25]:
bowVect.vocab_

['a', 'bad', 'good', 'is', 'movie', 'nice', 'really', 'this', 'very']