# Naive Bayes

In [211]:
import numpy as np

# Hilfsfunktion
def stats(y,yp):
    temp = y+(10*yp)
    tp,tn,fp,fn =0,0,0,0
    for i in temp:
        if i==0:
            tn+=1
        elif i==1:
            fp+=1
        elif i==10:
            fn+=1
        else:
            tp+=1
        
    acc = (tp +tn)/(tp+tn+fp+fn)
    if tp==0:
        pre,rec = 0,0
    else:
        pre = tp/(tp+fp)
        rec = tp/(tp+fn)
    
    return acc, pre, rec

## 6.1, 6.2, 6.3

In [184]:
class BernoulliNaiveBayesClassifier(object):
    
    def __init__(self):
        self.labels = None
        self.priors = None
        self.P = None
        
        
    def fit(self, X, y, a=0):
        """ Fit a naive Bayes model 
            Input:
                X:  2-dim binary Array of shape (m,n), 
                    where m is the number of documents and n is the size of the vocaulary.
                    Rows correspond to documents and columns correspond to words.
                    X[i,j] = 1  if document i cointains word j
                    X[i,j] = 0  if document i does not cointain word j
                y:  1-dim Array of length m containing the class label of each document in X
                a: LaPlace Smoothing Parameter
        """
        X = np.array(X)
        y = np.array(y)
        m,n = X.shape
        self.labels, counts = np.unique(y, return_counts=True)
        self.priors = counts/m
        # likelihoods
        self.P = np.zeros((len(self.labels),n))
        for k in range(len(self.labels)):
            self.P[k] = (np.sum(X[np.where(y==self.labels[k])], axis=0) + a) / (counts[k] + a*n)
        
        
    def predict(self, X):
        X = np.array(X)
        H = self.priors * np.prod(np.power(self.P,X)*np.power(1-self.P,1-X), axis=1)
        #print(H)
        prediction = self.labels[np.argmax(H)]
        return prediction
        
        
    def evaluate(self, X, y):
        """ Evaluates the fitted Naive Bayes model on test data X,y
            Input:
                X: Test data, format as described in fit()
                y: Test labels, format as described in fit()
        """
        X = np.array(X)
        y = np.array(y)
        yp = []
        for x in X:
            yp.append(self.predict(x))
        yp = np.array(yp)
        acc, pre, rec = stats(y,yp)
        print('Accuracy:', acc)
        print('Precision:', pre)
        print('Recall:', rec)

        
        
class MultinomialNaiveBayesClassifier(object):
    
    def __init__(self):
        self.labels = None
        self.priors = None
        self.P = None
        
        
    def fit(self, X, y, a=0):
        """ Fit a naive Bayes model 
            Input:
                X:  2-dim Array of shape (m,n), 
                    where m is the number of documents and n is the size of the vocaulary.
                    Rows correspond to documents and columns correspond to words.
                    X[i,j] = k  if document i cointains k times the word j
                y:  1-dim Array of length m containing the class label of each document in X
        """
        X = np.array(X)
        y = np.array(y)
        m,n = X.shape
        self.labels, counts = np.unique(y, return_counts=True)
        self.priors = counts/m
        # likelihoods
        self.P = np.zeros((len(self.labels),n))
        for k in range(len(self.labels)):
            temp = X[np.where(y==self.labels[k])]
            self.P[k] = (np.sum(temp, axis=0) + a) / (np.sum(temp) + a*n)
        
        
    def predict(self, X):
        X = np.array(X)
        H = self.priors * np.prod(np.power(self.P,X), axis=1)
        #print(H)
        prediction = self.labels[np.argmax(H)]
        return prediction
        
        
    def evaluate(self, X, y):
        """ Evaluates the fitted Naive Bayes model on test data X,y
            Input:
                X: Test data, format as described in fit()
                y: Test labels, format as described in fit()
        """
        X = np.array(X)
        y = np.array(y)
        yp = []
        for x in X:
            yp.append(self.predict(x))
        yp = np.array(yp)
        acc, pre, rec = stats(y,yp)
        print('Accuracy:', acc)
        print('Precision:', pre)
        print('Recall:', rec)

## 6.4

In [200]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# read data
df = pd.read_csv('SMSSpamCollection.csv',delimiter='\t',header=None)
data = df.values

# prepare data
x,y = data[:,1], data[:,0]
encode = {'ham':0, 'spam':1}
y = np.vectorize(encode.get)(y)

#train-test split
x_train, y_train = x[:4500], y[:4500]
x_test, y_test = x[4500:], y[4500:]


def ex64(classifier='multinomial', ngram=(1,1)):
    global x_train, y_train, x_test, y_test
    
    # vectorize Input
    vectorizer = CountVectorizer(ngram_range=ngram)
    X_train = vectorizer.fit_transform(x_train).toarray()
    X_test = vectorizer.transform(x_test).toarray()
    
    # instantiate a classifier
    if classifier == 'multinomial':
        nb = MultinomialNaiveBayesClassifier()
    elif classifier == 'bernoulli':
        nb = BernoulliNaiveBayesClassifier()
        
    # fit the model
    nb.fit(X_train,y_train)

    # evaluate
    nb.evaluate(X_test,y_test)

## 6.5

In [205]:
ex64('bernoulli',(1,1))

Accuracy: 0.9589552238805971
Precision: 0.7163120567375887
Recall: 0.9619047619047619


In [206]:
ex64('bernoulli',(1,2))

Accuracy: 0.9449626865671642
Precision: 0.6099290780141844
Recall: 0.9555555555555556


In [207]:
ex64('multinomial',(1,1))

Accuracy: 0.9580223880597015
Precision: 0.7304964539007093
Recall: 0.9363636363636364


In [208]:
ex64('multinomial',(1,2))

Accuracy: 0.9430970149253731
Precision: 0.6099290780141844
Recall: 0.9347826086956522


## 6.6

In [190]:
class HamClassifier(object):
    
    def predict(self, X):
        return 0
        
    def evaluate(self, X, y):
        """ Evaluates the fitted Naive Bayes model on test data X,y
            Input:
                X: Test data, format as described in fit()
                y: Test labels, format as described in fit()
        """
        X = np.array(X)
        y = np.array(y)
        yp = []
        for x in X:
            yp.append(self.predict(x))
        yp = np.array(yp)
        acc, pre, rec = stats(y,yp)
        print('Accuracy:', acc)
        print('Precision:', pre)
        print('Recall:', rec)
       
    
class RandomClassifier(object):
    
    def predict(self, X):
        return np.random.randint(2)
        
    def evaluate(self, X, y):
        """ Evaluates the fitted Naive Bayes model on test data X,y
            Input:
                X: Test data, format as described in fit()
                y: Test labels, format as described in fit()
        """
        X = np.array(X)
        y = np.array(y)
        yp = []
        for x in X:
            yp.append(self.predict(x))
        yp = np.array(yp)
        acc, pre, rec = stats(y,yp)
        print('Accuracy:', acc)
        print('Precision:', pre)
        print('Recall:', rec)

In [212]:
X_test = vectorizer.transform(x_test).toarray()

hm = HamClassifier()
hm.evaluate(X_test,y_test)

Accuracy: 0.8684701492537313
Precision: 0
Recall: 0


In [213]:
rd = RandomClassifier()
rd.evaluate(X_test,y_test)

Accuracy: 0.5298507462686567
Precision: 0.5035460992907801
Recall: 0.1405940594059406


## Demo, just for inspiration:

In [182]:
# toy data
corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',]

# toy labels
y = [0,1,1,0]

# extract features 
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print('Vocabulary: ', vectorizer.get_feature_names())
print('Feature Matrix:', X.toarray())


# instantiate a classifier
nb = MultinomialNaiveBayesClassifier()

# fit the model
nb.fit(X.toarray(),y)

# predict class of a new document
new_text = 'This is some new document. Please classify me.'
X_new = vectorizer.transform([new_text]).toarray()
print('New Feature Matrix:',X_new)
nb.predict(X_new)

# evaluate the model
test_corpus = [
     'This is the first test document.',
     'This is the second test second document.',
     'And the third one.',
     'Is this the first document?',]

X_test = vectorizer.transform(test_corpus).toarray()
print('Feature matrix of test data:',X_test)
y_test = [0,1,1,1]

nb.evaluate(X_test,y_test)

Vocabulary:  ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
Feature Matrix: [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
New Feature Matrix: [[0 1 0 1 0 0 0 0 1]]
Feature matrix of test data: [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
Accuracy: 0.75
Precision: 1.0
Recall: 0.5
