Andrew Carr

In [1]:
import numpy as np
import time

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

from sklearn import datasets


## Get data

### build normally distributed data

In [2]:
# parameters
mean, std_dev = 0, 1
n = 1000
n_features = 3
classes = [0, 1]

# Dedicated, seeded generator: Restart & Run All reproduces the same data and
# the same train/test split, without touching NumPy's global random state.
SEED = 0
rng = np.random.RandomState(SEED)

# every feature is drawn independently from N(mean, std_dev); one row per sample
x = rng.normal(loc=mean, scale=std_dev, size=(n, n_features))

# labels are drawn uniformly at random and independently of the features, so
# this data carries no signal: accuracy near 50% is the expected outcome
y = rng.choice(classes, size=n)

# 70% of the samples for training, the remaining 30% for testing
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=rng
)

### load cancer data

In [3]:
# scikit-learn's bundled breast cancer dataset (features in .data, labels in .target)
data = datasets.load_breast_cancer()

# Fixed seed so the 70/30 split, and therefore every accuracy figure computed
# from it below, is identical on each Restart & Run All.
CANCER_SPLIT_SEED = 0
train_x, test_x, train_y, test_y = train_test_split(
    data.data, data.target, train_size=0.7, random_state=CANCER_SPLIT_SEED
)

### personal naive bayes classifier

In [4]:
class NB(object):
    """Gaussian Naive Bayes classifier with a uniform class prior.

    Every (class, feature) pair is modelled by an independent normal
    distribution whose mean and standard deviation are estimated from the
    training samples of that class. A sample is assigned to the class with the
    largest summed per-feature log-likelihood. The class prior p(class) is
    deliberately left out, i.e. all classes are treated as equally likely.

    Parameters
    ----------
    min_std : float, optional
        Lower bound applied to every estimated standard deviation. It only
        matters for features that are constant within a class, where a zero
        scale would otherwise turn the likelihood into NaN.

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray
        Sorted unique class labels seen during training.
    stats : list
        Frozen ``scipy.stats.norm`` distributions in class-major order: the
        distribution of feature ``j`` for ``classes_[k]`` is stored at index
        ``k * num_features + j``.
    num_samples, num_features : int
        Shape of the training matrix.

    Attributes set by ``predict``
    -----------------------------
    probs : list of list of float
        Per-sample log-likelihoods, one entry per class (ordered as
        ``classes_``).
    prediction : list
        Predicted class label for every sample.
    """
    def __init__(self, min_std=1e-9):
        self.min_std = min_std

    def fit(self, x, y):
        """Estimate one normal distribution per (class, feature) pair.

        Parameters
        ----------
        x : array-like, shape (num_samples, num_features)
            Training features.
        y : array-like, shape (num_samples,)
            Class label of every training sample.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``x`` is not two-dimensional or ``x`` and ``y`` disagree on the
            number of samples.
        """
        self.train_x = np.asarray(x)
        self.train_y = np.asarray(y)
        if self.train_x.ndim != 2:
            raise ValueError("x must be 2-dimensional (samples x features)")
        self.num_samples, self.num_features = self.train_x.shape
        if self.train_y.shape[0] != self.num_samples:
            raise ValueError("x and y must contain the same number of samples")

        # np.unique gives a sorted, deterministic class order (a plain set
        # gives no ordering guarantee), so stats/probs columns are well defined
        self.classes_ = np.unique(self.train_y)
        self.stats = []
        for c in self.classes_:
            features = self.train_x[self.train_y == c]

            # gather the statistics of every feature column at once
            means = np.mean(features, axis=0)
            # floor the spread so that a constant feature does not yield scale=0
            std_devs = np.maximum(np.std(features, axis=0), self.min_std)

            # build and save a normal based on the data associated with the class
            for mean, std_dev in zip(means, std_devs):
                self.stats.append(stats.norm(loc=mean, scale=std_dev))
        return self

    def predict(self, x):
        """Predict the class label of every row of ``x``.

        Parameters
        ----------
        x : array-like, shape (num_samples, num_features)
            Samples to classify. An empty input yields an empty prediction.

        Returns
        -------
        list
            Predicted class labels (taken from ``classes_``), one per sample.

        Raises
        ------
        ValueError
            If ``x`` does not have ``num_features`` columns.
        """
        self.test_x = np.asarray(x)
        if self.test_x.size == 0:
            self.probs = []
            self.prediction = []
            return self.prediction
        if self.test_x.ndim != 2 or self.test_x.shape[1] != self.num_features:
            raise ValueError(
                "x must have shape (num_samples, {})".format(self.num_features))

        num_classes = len(self.classes_)
        log_likelihood = np.zeros((self.test_x.shape[0], num_classes))
        for class_index in range(num_classes):
            offset = class_index * self.num_features
            for feature_index in range(self.num_features):
                # logpdf (rather than log(pdf)) stays finite for points far in
                # the tails, where pdf underflows to 0 and every class would
                # tie at -inf
                distribution = self.stats[offset + feature_index]
                log_likelihood[:, class_index] += distribution.logpdf(
                    self.test_x[:, feature_index])

        # determine the class, not worrying about p(class) since we have no
        # prior knowledge; ties go to the first class in classes_
        best_class = np.argmax(log_likelihood, axis=1)
        self.probs = log_likelihood.tolist()
        self.prediction = list(self.classes_[best_class])
        return self.prediction

    def accuracy(self, y_true, y_pred):
        """Return the fraction of predictions in ``y_pred`` that match ``y_true``."""
        return accuracy_score(y_true, y_pred)
        
        

## Classifier on random data

In [5]:
# train the hand-written classifier on the random data set ...
clf = NB()
clf.fit(x_train, y_train)

# ... score the held-out samples and report the share classified correctly
y_pred = clf.predict(x_test)
random_accuracy = clf.accuracy(y_test, y_pred)
print("test accuracy on random data {}".format(random_accuracy))

test accuracy on random data 0.4633333333333333


## Classifier on cancer data

In [6]:
clf = NB()

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() can be coarse and can jump when the system clock is
# adjusted, which matters for a fit that takes only milliseconds
start = time.perf_counter()
clf.fit(train_x, train_y)
print("training time {}".format(time.perf_counter() - start))

pred_y = clf.predict(test_x)

print("test accuracy on cancer data {}".format(clf.accuracy(test_y, pred_y)))

training time 0.058775901794433594
test accuracy on cancer data 0.9532163742690059


## Sklearn Classification on cancer data

In [7]:
clf = GaussianNB()

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() can be coarse and can jump when the system clock is
# adjusted, which matters for a fit that takes only milliseconds
start = time.perf_counter()
clf.fit(train_x, train_y)
print("training time {}".format(time.perf_counter() - start))

pred_y = clf.predict(test_x)

print("test accuracy on cancer data {}".format(accuracy_score(test_y, pred_y)))


training time 0.0026929378509521484
test accuracy on cancer data 0.9473684210526315


# Conclusion

As can be seen above, my naive Bayes classifier is competitive with the industry standard in accuracy; however, it is significantly slower to train. This is not surprising, since my code is obviously inefficient and naive. However, it is very exciting that it is comparable in accuracy and was quite simple to implement.