In [71]:
import numpy as np
import pandas as pd
import os.path
import math

from sklearn.model_selection import train_test_split
from scipy.stats import norm
from numpy import repeat, dot
from numpy.linalg import inv
from numpy import ndarray

In [78]:
class NaiveBayes:
    def __init__(self, X, y):
        self.X = X+np.random.normal(0,0.001, (X.shape))
        self.y = y
        self.obs, self.n = X.shape
        assert(self.obs==y.shape[0])
        self.classes = np.unique(y)
        self.nclass = len(self.classes)
        
    def _train(self):
        self.prior = [np.sum(self.y == c) / self.y.size for c in self.classes]
        self.aprior = [np.sum(self.y == c) for c in self.classes]
        self._compute_mean_and_sigma()
    
    def _predict_score(self, Xt):
        rows,n_t = Xt.shape
        posterior = np.zeros((rows, self.nclass))
        logprior = [np.log(p) for p in self.prior]
        
        assert(self.n==n_t)
        
        for j in range(rows):
            for i in range(self.nclass):
                c=self.classes[i]
                lpck=logprior[i]
                sdp=1
                standardDeviationOfThisClass=self.std[c]
                for e in range(self.n):
                    sdp *= standardDeviationOfThisClass[e]
                firstTerm = np.log(1/sdp)
                
                meanOfThisClass=self.mu[c]
                anotherTerm=0
                for e in range(self.n):
                    anotherTerm += (Xt[j,e] - meanOfThisClass[e])**2/ 2*(standardDeviationOfThisClass[e]**2)
                posterior[j,i]=firstTerm-anotherTerm+lpck
        return posterior
        
    def _predict_class(self, score):
        pred = np.argmax(score, axis=1)
        return np.array([self.classes[i] for i in pred])
        
    def predict(self, Xt):
        #print("Score to be calculated")
        score = self._predict_score(Xt)
        #print("Score calculated")
        return self._predict_class(score)
    
    def validate(self, Xt, yt):
        Xt += np.random.normal(0,0.001, (Xt.shape))
        mypredictions=self.predict(Xt)
        return self.calc_error(mypredictions, yt)
        
    def calc_error(self, ynew, target):
        correctness = np.array([yn == y for (yn,y) in zip(ynew, target)])
        return 1 - (np.sum(correctness) / target.size)
    
    def build(self):
        self.Xeach=self._split_by_class(self.X, self.y)
        self._train()
        return self
    
    def _split_by_class(self, X, y):
        return {c:X[np.where(y==c)[0],:] for c in self.classes}
    
    def _compute_mean_and_sigma(self):
        self.mu={c:np.mean(self.Xeach[c], axis=0).reshape(self.n,-1) for c in self.classes}
        self.sigma={c:np.var(self.Xeach[c], axis=0).reshape(self.n,-1) for c in self.classes}
        self.std={c:np.std(self.Xeach[c], axis=0, ddof=1).reshape(self.n,-1) for c in self.classes}
        
    

In [87]:
#select % of random sub-indices out of the given dataset
def rsubindices(dSet, percent):
    r,_ = dSet.shape
    if not r:
        return np.ndarray(shape=(0,))  
    indices=np.arange(r)
    return np.random.choice(indices, math.ceil(percent/100*len(indices)), replace=False)

def naiveBayesGaussian(filename, num_splits, train_percent=[10,25,50,75,100]):
    # from sklearn.datasets import load_boston
    # boston = load_boston()
    assert os.path.isfile(filename) and os.access(filename, os.R_OK)
    df=pd.read_csv(filename, sep=',', header = None)
    #print(df.shape)
    #df.head()
    
    data = df.as_matrix()
    X=data[:, :-1]
    y=data[:, -1]
    #assert(all(y==boston.target))
    del df, data
    
    X=X+np.random.normal(0, 0.001, X.shape) #to prevent numerical problem
    
    if len(np.unique(y))>15:
        # if the target values are more than some reasonable no (15), we take that as binary classifier
        b = np.percentile(y, 50)
        f=np.vectorize(lambda x: 0 if x<b else 1)
        y=f(y)
        assert(X.shape[0]==y.shape[0])
        y=np.reshape(y, [X.shape[0], 1])

    errormatrix = np.zeros( (num_splits, len(train_percent)) )
    # perform for num-splits iterations
    for i in range(num_splits):
        print("------------------------------------------------------------")
        print(" splitting the input data-frame into 80-20 train-test data..")
        print("------------------------------------------------------------")
        X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2)
        Y_train = np.reshape(Y_train, [X_train.shape[0], 1])
        Y_test = np.reshape(Y_test, [X_test.shape[0], 1])
        
        # percentage of data to be used for the model
        for j,p in enumerate(train_percent):
            selection = rsubindices(Y_train, p)
            train = X_train[selection, :]
            labels= Y_train[selection]
            model = NaiveBayes(train,labels).build()
            e = model.validate(X_test, Y_test)
            print("Error-rate with train-percent %s :" % p, e)
            errormatrix[i, j] = e
        print()

    errmu = np.mean(errormatrix, axis=0)
    errsigma = np.std(errormatrix, axis=0, ddof=1)
    
    print("Mean test-error"+str(errmu))
    print("Std test-error"+str(errsigma))

In [88]:
naiveBayesGaussian('boston.csv', 10)

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------
Error-rate with train-percent 10 : 0.421568627451
Error-rate with train-percent 25 : 0.294117647059
Error-rate with train-percent 50 : 0.343137254902
Error-rate with train-percent 75 : 0.303921568627
Error-rate with train-percent 100 : 0.303921568627

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------
Error-rate with train-percent 10 : 0.43137254902
Error-rate with train-percent 25 : 0.441176470588
Error-rate with train-percent 50 : 0.470588235294
Error-rate with train-percent 75 : 0.460784313725
Error-rate with train-percent 100 : 0.450980392157

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
---------

In [90]:
naiveBayesGaussian('digits.csv', 10)

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------
Error-rate with train-percent 10 : 0.388888888889
Error-rate with train-percent 25 : 0.405555555556
Error-rate with train-percent 50 : 0.391666666667
Error-rate with train-percent 75 : 0.361111111111
Error-rate with train-percent 100 : 0.366666666667

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------
Error-rate with train-percent 10 : 0.6
Error-rate with train-percent 25 : 0.452777777778
Error-rate with train-percent 50 : 0.341666666667
Error-rate with train-percent 75 : 0.386111111111
Error-rate with train-percent 100 : 0.391666666667

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
-------------------