# Naive Bayes

In [92]:
import numpy as np

class BernoulliNaiveBayesClassifier(object):
    
    def __init__(self):
        self.labels = None
        self.priors = None
        self.P = None
        
        
    def fit(self, X, y, a=0):
        """ Fit a naive Bayes model 
            Input:
                X:  2-dim binary Array of shape (m,n), 
                    where m is the number of documents and n is the size of the vocaulary.
                    Rows correspond to documents and columns correspond to words.
                    X[i,j] = 1  if document i cointains word j
                    X[i,j] = 0  if document i does not cointain word j
                y:  1-dim Array of length m containing the class label of each document in X
                a: LaPlace Smoothing Parameter
        """
        X = np.array(X)
        y = np.array(y)
        m,n = X.shape
        self.labels, counts = np.unique(y, return_counts=True)
        self.priors = counts/m
        # likelihoods
        self.P = np.zeros((len(self.labels),n))
        for k in range(len(self.labels)):
            self.P[k] = (np.sum(X[np.where(y==self.labels[k])], axis=0) + a) / (counts[k] + a*n)
        
        
    def predict(self, X):
        X = np.array(X)
        H = self.priors * np.prod(np.power(self.P,X)*np.power(1-self.P,1-X), axis=1)
        print(H)
        prediction = self.labels[np.argmax(H)]
        return prediction
        
        
    def evaluate(self, X, y):
        """ Evaluates the fitted Naive Bayes model on test data X,y
            Input:
                X: Test data, format as described in fit()
                y: Test labels, format as described in fit()
        """
        X = np.array(X)
        y = np.array(y)
        yp = self.predict(X)
        
        a = y+(10*yp)
        b,counts = np.unique(a,return_counts=True)
        tp = counts[0]
        fn = counts[1] 
        fp = counts[2]
        tn = counts[3]
        
        acc = (tp *tn)/(tp+tn+fp+fn)
        pre = tp/(tp+fp)
        rec = tp/(tp+fn)
        
        print('Accuracy:', acc)
        print('Precision:', pre)
        print('Recall:', rec)

        
        
class MultinomialNaiveBayesClassifier(object):
    
    def __init__(self):
        self.labels = None
        self.priors = None
        self.P = None
        
        
    def fit(self, X, y, a=0):
        """ Fit a naive Bayes model 
            Input:
                X:  2-dim Array of shape (m,n), 
                    where m is the number of documents and n is the size of the vocaulary.
                    Rows correspond to documents and columns correspond to words.
                    X[i,j] = k  if document i cointains k times the word j
                y:  1-dim Array of length m containing the class label of each document in X
        """
        X = np.array(X)
        y = np.array(y)
        m,n = X.shape
        self.labels, counts = np.unique(y, return_counts=True)
        self.priors = counts/m
        # likelihoods
        self.P = np.zeros((len(self.labels),n))
        for k in range(len(self.labels)):
            temp = X[np.where(y==self.labels[k])]
            self.P[k] = (np.sum(temp, axis=0) + a) / (np.sum(temp) + a*n)
        
        
    def predict(self, X):
        X = np.array(X)
        H = self.priors * np.prod(np.power(self.P,X), axis=1)
        print(H)
        prediction = self.labels[np.argmax(H)]
        return prediction
        
        
    def evaluate(self, X, y):
        """ Evaluates the fitted Naive Bayes model on test data X,y
            Input:
                X: Test data, format as described in fit()
                y: Test labels, format as described in fit()
        """
        print('Please implement the method evaluate')
        acc = 0
        pre = 0
        rec = 0
        print('Accuracy:', acc)
        print('Precision:', pre)
        print('Recall:', rec)


## Demo, just for inspiration:

In [95]:
from sklearn.feature_extraction.text import CountVectorizer

# toy data
corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',]

# toy labels
y = [0,1,1,0]

# extract features 
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print('Vocabulary: ', vectorizer.get_feature_names())
print('Feature Matrix:', X.toarray())


# instantiate a classifier
nb = MultinomialNaiveBayesClassifier()

# fit the model
nb.fit(X.toarray(),y)

# predict class of a new document
new_text = 'This is some new document. Please classify me.'
X_new = vectorizer.transform([new_text]).toarray()
print('New Feature Matrix:',X_new)
nb.predict(X_new)

# evaluate the model
test_corpus = [
     'This is the first test document.',
     'This is the second test second document.',
     'And the third one.',
     'Is this the first document?',]

X_test = vectorizer.transform(test_corpus).toarray()
print('Feature matrix of test data:',X_test)
y_test = [0,1,1,1]

nb.evaluate(X_test,y_test)

Vocabulary:  ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
Feature Matrix: [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
New Feature Matrix: [[0 1 0 1 0 0 0 0 1]]
[ 0.004   0.0005]
Feature matrix of test data: [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
Please implement the method evaluate
Accuracy: 0
Precision: 0
Recall: 0


In [57]:
X = np.array([[1,0,1],[0,1,1],[1,1,0],[0,0,0],[0,0,1]])
y = np.array(['h','s','h','h','h'])

m,n = X.shape
labels, counts = np.unique(y, return_counts=True)
priors = counts/m
        
# likelihoods
P = np.zeros((len(labels),n))
for k in range(len(labels)):
    P[k] = np.sum(X[np.where(y==labels[k])], axis=0) / counts[k]

In [59]:
for p in P:
    print(p)

[ 0.5   0.25  0.5 ]
[ 0.  1.  1.]


In [63]:
x = np.array([1,0,1])
np.power(P[0],x)


array([ 0.5,  1. ,  0.5])

In [64]:
np.power(1-P[0],1-x)

array([ 1.  ,  0.75,  1.  ])

In [67]:
f=np.power(P,x)

In [71]:
np.prod(f, axis=1)

array([ 0.25,  0.  ])

In [73]:
h = np.prod(np.power(P,x)*np.power(1-P,1-x), axis=1)

In [87]:
h*h

array([ 0.03515625,  0.        ])

In [85]:
from sklearn.feature_extraction.text import CountVectorizer

# toy data
corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',]

# toy labels
y = [0,1,1,0]

# extract features 
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print('Vocabulary: ', vectorizer.get_feature_names())
print('Feature Matrix:', X.toarray())


# instantiate a classifier
nb = MultinomialNaiveBayesClassifier()

# fit the model
nb.fit(X,y)

# predict class of a new document
new_text = 'This is some new document. Please classify me.'
X_new = vectorizer.transform([new_text]).toarray()
print('New Feature Matrix:',X_new)
nb.predict(X_new)

# evaluate the model
test_corpus = [
     'This is the first test document.',
     'This is the second test second document.',
     'And the third one.',
     'Is this the first document?',]

X_test = vectorizer.transform(test_corpus).toarray()
print('Feature matrix of test data:',X_test)
y_test = [0,1,1,1]

nb.evaluate(X_test,y_test)

Vocabulary:  ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
Feature Matrix: [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
This is the constructor
Please implement the method fit
New Feature Matrix: [[0 1 0 1 0 0 0 0 1]]
Please implement the method predict
Feature matrix of test data: [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
Please implement the method evaluate
Accuracy: 0
Precision: 0
Recall: 0


In [119]:
yt=np.array([1,0,1,0,1])
yp=np.array([1,0,1,1,0])

a =yt+(10*yp)
labels, counts = np.unique(a,return_counts=True)

In [115]:
my_dict = {11:'tn', 0:'tp', 1:'fp', 10:'fn'}
np.vectorize(my_dict.get)(a)

array(['tn', 'tp', 'fn', 'fp'], 
      dtype='<U2')

In [120]:
counts

array([1, 1, 1, 2])

In [121]:
labels

array([ 0,  1, 10, 11])