# Naive Bayes Implementation from Scratch

Naive Bayes is most commonly used for Text classification

In [30]:
import numpy as np
from collections import defaultdict
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import sqlite3
import nltk
import string

In [31]:
class NaiveBayes:
    """
    This class contains four methods:
    1. def addSentToBow(self,example_sentence,bow_dict_index)
    2. def train(self,X,y):
    3. def getPosteriorProba(self,test_sent)
    4. def test(self,test_data)
    
    Through these 4 methods we can train a Naive Bayes Classifier
    """
    
    def __init__(self,unique_labels):
        self.classes = unique_labels
    
    def addSentToBow(self,example_sentence,bow_dict_index):
        """
        Parameters:
        -----------
        
        -> example_sentance : is the sentence we pass to test/train
        -> bow_dict_index : or class is a dict to which our example_sent belongs to
        i.e if we have two classes then we will maintain 2 bow dicts
        
        
        Functionality:
        --------------
        It will tokenize the sentence and adds every token to its respective
        BOW dict(label parameter in here)
        
        """
        
        #checking if example sent is of type nd array i.e array([]) and getting the sentence
        if(isinstance(example_sentence,np.ndarray)):
            example_sentence = example_sentence[0]
        
        
        for word in example_sentence.split():
            '''
                Creating BOW dict:
                {
                    class1:{
                        word1:count,
                        word2:count
                    }
                    class2:{
                        word1:count,
                        word2:count
                    }
                }
            '''
            self.bow_label_dict[bow_dict_index][word]+=1
            
    
    def train(self,X,y):
        """
        
        Parameters:
        -----------
        
        1. X 
        2. y i.e target labels
        """
        
        self.data = X
        self.labels=y
        # Creating dict of n dicts, where n is no.of classes
        self.bow_label_dict = np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])])
        
        # converting data to numpy arrays
        if not isinstance(self.data,np.ndarray): 
            self.data=np.array(self.data)
        if not isinstance(self.labels,np.ndarray): 
            self.labels=np.array(self.labels)
        
        #Creating BOW for each Class
        
        #enumerate method will return tuple (counter,element in collection)
        # here if we have 3 classes it will return (0,1),(1,2),(2,3)
        for index,label in enumerate(self.classes):
            
            sentOfParticularLabel = self.data[self.labels==label]
            
            cleaned_sent = [preprocess_string(sent) for sent in sentOfParticularLabel]
            cleaned_sent=pd.DataFrame(data=cleaned_sent)
            
            #now costruct BoW of this particular label
            '''
            -> addSentToBow is method
            -> 1 -> indicates axis i.e Column
            -> cleaned_sent - is a Data frame where column values are sentences
            -> index - param accepted by addSentToBow method
            
            here we have only two cols one is index col and other is Sentences column
            now operation is performed on sentence column
            '''
            
            np.apply_along_axis(self.addSentToBow,1,cleaned_sent,index)
            
            '''
            Done with constructing BOW of each Label/category.
            
            Now we need calculate the terms of our formula:
            
            -> Prior Probability of each class - P(c) -> (no.of sentences beloning to class c)/(total no.of sent)
            -> Word Corpus/ Vocabulary |V|
            -> count(c) + |v| + 1 i.e denominator val of each label/class
            
            'c' here is Class/label
            '''
            
            
        # Creating a array to store the probability of 
        # of sentence for each class
        proba_label = np.empty(self.classes.shape[0])
            
        all_words = []
            
        count_of_words_in_each_class = np.empty(self.classes.shape[0])
            
        for index, label in enumerate(self.classes):
            
            #Prior probability P(c) for each class
            '''
            labels-> target column values i.e y
            label -> one of the selected label for this iteration
            '''
            proba_label[index] = np.sum(self.labels==label)/float(self.labels.shape[0])
            
            # getting total counts of all words in each class
            count = list(self.bow_label_dict[index].values())
            
            count_of_words_in_each_class[index] = np.sum(np.array(count))+1
            
            all_words+=self.bow_label_dict[index].keys()
            
        # now combining all the words of all the classes to get |V|
        self.vocab = np.unique(np.array(all_words))
        
        self.lenOfVocab = self.vocab.shape[0]
        
        #Computing denominators of each class i.e (count(c) + |v| + 1) i.e
        # count of words of that class + Vocab|v|(adding this to avoide 0 probability 
        # when word is unknown)+1
        denom = np.array([count_of_words_in_each_class[index]+self.lenOfVocab+1 for index,label in enumerate(self.classes)])
        
        
        # putting pieces of formula in organized way
        # in this way (bow dict of each class,prior probability,denominator)
        self.classes_info = [(self.bow_label_dict[index],proba_label[index],denom[index]) for index,labels in enumerate(self.classes)]
        self.classes_info = np.array(self.classes_info)
    
    
    def getPosteriorProba(self,test_sent):
        '''
        Parameter:
        ----------
        test_example -> Example sentence 
        
        -> This function returns Posterior probability of given sentence
        
        Returns:
        --------
        Probability that belongs to each class
        
        '''
        # To store probabilities of each class
        likelihood_prob = np.zeros(self.classes.shape[0])
        
        # Calculating Probabilities w.r.t to each label/class
        # Formula -> (count of word w.r.t class c +1)/(count wof words in that class + vocab_len + 1)
        for index,labels in enumerate(self.classes):
            
            for word in test_sent.split():
                
                # numerator in the above formula
                # adding +1 to get rid of zero probability i.e even when the word is not present in our training vocab
                # then also probability will not be zero as we are adding 1 it will be 1/something.
                word_count = self.classes_info[index][0].get(word,0)+1
                
                
                word_proba = word_count/float(self.classes_info[index][2])
                
                # Applying Log so that it enhances the minute values,
                # our actual way is to multiply each words probability
                # but we are applying log so we need to sum all these 
                # log applied probabilities of each word
                likelihood_prob[index]+=np.log(word_proba)
                
        post_proba = np.empty(self.classes.shape[0])
        for index,labels in enumerate(self.classes):
            post_proba[index] = likelihood_prob[index]+np.log(self.classes_info[index][1])
        
        return post_proba;
    
    
    
    def test(self,test_data):
        '''
        
        Parameters:
        -----------
        test_data -> test sentences of which we need to predict the probabilities
        
        
        Functionality:
        --------------
        Gets the proba of each sentence w.r.t each class and gets final prediction
        
        Returns:
        --------
        Predicted Class labels of each test Sentence
        '''
        
        pred = []
        
        for sent in test_data:
            
            cleaned_sent = preprocess_string(sent)
            
            post_proba = self.getPosteriorProba(cleaned_sent)
            
            pred.append(self.classes[np.argmax(post_proba)])
        return np.array(pred)

In [32]:
def preprocess_string(str_arg):
    
    """"
        Parameters:
        ----------
        str_arg: example string to be preprocessed
        
        What the function does?
        -----------------------
        Preprocess the string argument - str_arg - such that :
        1. everything apart from letters is excluded
        2. multiple spaces are replaced by single space
        3. str_arg is converted to lower case 
        
        Example:
        --------
        Input :  Menu is absolutely perfect,loved it!
        Output:  ['menu', 'is', 'absolutely', 'perfect', 'loved', 'it']
        
        Returns:
        ---------
        Preprocessed string 
        
    """
    
    cleaned_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replaced
    cleaned_str=re.sub('(\s+)',' ',cleaned_str) #multiple spaces are replaced by single space
    cleaned_str=cleaned_str.lower() #converting the cleaned string to lower case
    
    return cleaned_str

## Amazon Review Classification

In [6]:
con = sqlite3.connect("database.sqlite")

In [7]:
filtered_data = pd.read_sql_query("""select * from Reviews where
score != 3
""",con)

In [8]:
final = filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},keep="first",inplace=False)
final.shape

(364173, 10)

In [9]:
def partition(x):
    if x<3:
        return 0
    return 1


actualScore = final['Score']
positiveNegative = actualScore.map(partition)
final['Score'] = positiveNegative

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [10]:
del filtered_data

In [25]:
X = list(final['Text'][0:80000])
y = list(final['Score'][0:80000])

In [26]:

train_data,test_data,train_labels,test_labels=train_test_split(X,y,shuffle=True,test_size=0.25,random_state=42,stratify=y)
classes=np.unique(train_labels)

In [27]:
nb1 = NaiveBayes(np.array([0,1]))

In [28]:
nb1.train(train_data,train_labels)

In [29]:
proba_sent1=nb1.test(test_data)

#accuracy
test_acc=np.sum(proba_sent1==test_labels)/float(len(test_labels)) 

print ("Test Set Accuracy: ",test_acc*100,"%") 

Test Set Accuracy:  90.31 %


# Summary:
-  We have created our Own Naive Bayes Classifier from Scratch using only 4 Methods
-  Used __Amazon Reviews DataSet__ to demonstrate our Classifier
-  Achieved __90.31%__ Accuracy with __80000__ data points