In [11]:
%run util.ipynb
%matplotlib inline

import numpy as np
import pandas as pd
from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn

In [25]:
# Gaussian Bayes classifier with a general (full covariance) and a naive
# (diagonal covariance) mode.
# Class labels may be arbitrary sortable values (e.g. 3 and 7, not only 0..K-1):
# every label is mapped to its own column of the score matrix instead of being
# used directly as a numpy index.
# Smoothing is added to the variances so a zero-variance feature never produces
# a singular covariance.

class BayesClassifier(object):
    """Gaussian Bayes classifier with one Gaussian fitted per class.

    With ``naive`` truthy each class is modelled with per-feature variances
    (a diagonal covariance); otherwise a full covariance matrix is estimated.

    Attributes set by ``fit``:
        gaussians: dict mapping class label -> {'mean': ..., 'var': ...}.
        priors: dict mapping class label -> fraction of training rows with that label.
        classes_: sorted array of the distinct labels; column k of the score
            matrix built in ``predict`` belongs to ``classes_[k]``.
    """

    def __init__(self, naive=True):
        """Store whether the naive (diagonal covariance) model should be used."""
        self.naive = naive

    def fit(self, X, Y, smoothing=1e-2):
        """Estimate the Gaussian parameters and the prior of every class.

        Args:
            X: 2-D array of shape (N, D), one row per sample.
            Y: 1-D array of N class labels.
            smoothing: value added to every variance (naive mode) or to the
                diagonal of the covariance matrix (general mode).
        """
        _, D = X.shape

        # Dictionaries holding the gaussian parameters and priors, keyed by label
        self.gaussians = {}
        self.priors = {}

        # Sorted distinct labels; this fixes the label -> column mapping used by predict
        self.classes_ = np.unique(Y)

        for c in self.classes_:
            # all rows of X whose label is c
            current_x = X[Y == c]
            mean = current_x.mean(axis=0)  # per-feature mean, length D

            if self.naive:
                # per-feature variance; mvn.logpdf treats a 1-D cov as a diagonal
                var = current_x.var(axis=0) + smoothing
            else:
                if len(current_x) > 1:
                    # np.cov expects variables in rows, hence the transpose
                    cov = np.cov(current_x.T)
                else:
                    # a single sample has no spread; np.cov would return NaN here,
                    # so start from zero and let the smoothing term define the matrix
                    cov = np.zeros((D, D))
                var = cov + np.eye(D) * smoothing

            self.gaussians[c] = {'mean': mean, 'var': var}

            # prior = fraction of the training labels equal to c
            self.priors[c] = float(len(current_x)) / len(Y)

    def score(self, X, Y):
        """Return the fraction of rows of X whose predicted label equals Y."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Args:
            X: 2-D array of shape (N, D).

        Returns:
            Array of N labels taken from the labels seen in ``fit``.
        """
        # unpacking also enforces that X is 2-D
        N, _ = X.shape
        classes = self.classes_

        # P[n, k] = log p(x_n | class k) + log p(class k)
        P = np.zeros((N, len(classes)))
        for k, c in enumerate(classes):
            g = self.gaussians[c]
            # log-likelihood plus log-prior: the log of P(data | class) * P(class)
            P[:, k] = mvn.logpdf(X, mean=g['mean'], cov=g['var']) + np.log(self.priors[c])

        # translate the winning column of every row back into its class label
        return classes[np.argmax(P, axis=1)]

In [26]:
# Testing with the MNIST data set

if __name__ == '__main__':
    # Request 10000 samples from get_data (defined in util.ipynb, not visible here).
    X, Y = get_data(10000)

    # First half of the rows is used for fitting, the remaining rows for evaluation.
    Ntrain = len(Y) // 2
    Xtrain, Xtest = X[:Ntrain], X[Ntrain:]
    Ytrain, Ytest = Y[:Ntrain], Y[Ntrain:]

    # naive=False selects the full-covariance model.
    model = BayesClassifier(naive=False)
    model.fit(Xtrain, Ytrain)

    train_accuracy = model.score(Xtrain, Ytrain)
    print("Train accuracy:", train_accuracy)
    test_accuracy = model.score(Xtest, Ytest)
    print("Test accuracy:", test_accuracy)

Train accuracy: 0.9988
Test accuracy: 0.9392
