In [77]:
import numpy as np
seed_data = np.loadtxt("seeds_dataset.txt")
np.random.shuffle(seed_data)

In [86]:
class gaussianBayes():
    def __init__(self, train_data, train_labels):
        train_labels = train_labels.astype(int)
        self.classes = np.unique(train_labels)
        self.nclasses = self.classes.size
        self.nfeatures = train_data.shape[1]
        self.means = np.zeros((self.nclasses, self.nfeatures))
        self.variances = np.zeros((self.nclasses, self.nfeatures))
        self.probs = np.zeros(self.nclasses)
        for i in xrange(self.nclasses):
            mask = train_data[train_labels==self.classes[i]]
            self.means[i,:] = np.mean(mask, axis=0)
            self.variances[i,:] = np.var(mask, axis=0)
            self.probs[i] = mask.shape[0]/float(train_labels.shape[0])

            
    def predict(self, test_data):
        log_probs = np.zeros((test_data.shape[0], self.nclasses))
        log_probs += np.log(self.probs)
        for c in xrange(self.nclasses):
            for f in xrange(self.nfeatures):
                mu = self.means[c, f]
                var = self.variances[c,f]
                x = test_data[:,f]
                #gaussian log density function
                log_probs[:,c] += - (x-mu)**2/(2*var) - .5 * np.log(2*np.pi*var)
        self.log_probs = log_probs
        return self.classes[np.argmax(log_probs, axis=1)]
    
    def validate(self, test_data, test_labels):
        return np.sum(self.predict(test_data)==test_labels)/float(test_labels.size)

def seed_features_and_labels(data):
    return data[:,:-1], data[:,-1]
train_features, train_labels = seed_features_and_labels(seed_data[40:,:])
test_features, test_labels = seed_features_and_labels(seed_data[:40,:])
gb = gaussianBayes(train_features, train_labels)
print "Accuracy on test data: %f " % gb.validate(test_features,test_labels)

Accuracy on test data: 0.875000 


In [89]:
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(train_features, train_labels)
print "Sklearn Accuracy %f " % nb_classifier.score(test_features, test_labels)

Sklearn Accuracy 0.875000 


In [98]:
class multi_bayes():
    def __init__(self):
        self.class_probs = None
        self.nclass = None
        self.feature_probs = None
    
    def fit(self, features, labels):
        self.classes = np.unique(labels)
        self.nclasses = len(self.classes)
        self.class_probs = np.zeros(self.nclasses)
        self.nfeatures = features.shape[1]
        self.feature_probs = np.zeros((self.nclasses, self.nfeatures))
        for i in xrange(self.nclasses):
            mask = labels == self.classes[i]
            total = np.sum(mask)
            self.class_probs[i] = total / float(len(labels))
            self.feature_probs[i,:] = (np.sum(features[mask], axis=0)+1) / float(total+1)
        
    
    def predict(self, features):
        npoints = features.shape[0]
        log_probs = np.zeros((npoints, self.nclasses))
        log_probs += np.log(self.class_probs)
        features = features.astype(int)
        for i in xrange(features.shape[0]):
            for c in xrange(self.nclasses):
                log_probs[i, c] += features[i].dot(np.log(self.feature_probs[c]))
              
        return self.classes[np.argmax(log_probs, axis=1)]

In [117]:
spamArch = np.load("spam.npz")
spamLabels = spamArch['labels']
spamFeatures = spamArch['features']

In [118]:
mb = multi_bayes()
npoints = len(spamLabels)
random_mask = np.arange(npoints)
test_mask = random_mask[:500]
train_mask = random_mask[500:]
strainf, strainl = spamFeatures[train_mask], spamLabels[train_mask]
stestf, stestl = spamFeatures[test_mask], spamLabels[test_mask]

In [119]:
mb.fit(strainf, strainl)

In [121]:
pLabels = mb.predict(stestf)

In [124]:
print "Accuracy on Spam Data %f" % (np.sum(pLabels==stestl)/500.)

Accuracy on Spam Data 0.932000
