In [36]:
import numpy as np
from pprint import pprint as pp

In [37]:
def create_dataset(n_rows=10, n_cols=2, prob=(0.5, 0.5), seed=None):
    '''
    creates a 2D numpy array with 0s and 1s for columns
    
    INPUT:
        n_rows = (int) number of rows in dataset
        n_cols = (int) number of columns starting with target followed by features
        prob = (tuple) probability of success for target, feature 1, feature 2, ..., feature n
    OUTPUT:
        dataset (numpy array)
    '''
    # error handling
    assert type(n_rows) == int, 'n_rows must be an integer'
    assert type(n_cols) == int, 'n_cols must be an integer'
    assert type(prob) == tuple, 'prob must be a tuple of probabilities'
    assert len(prob) == n_cols, 'tuple must contain probabilities for each n_col'
    
    # reproducibility
    if seed:
        np.random.seed(seed)
        
    # create dataset
    for i, prb in enumerate(prob):
        if i < 1:
            dataset = np.random.binomial(n=1, p=prb, size=n_rows)
        else:
            column = np.random.binomial(n=1, p=prb, size=n_rows)
            dataset = np.c_[dataset, column]
    
    return dataset

In [52]:
class MyBayes:
    """Bernoulli Naive Bayes classifier for predicting a binary target from any number of
    binary features.
    
    Use MyBayes.fit to train model
    Use MyBayes.predict to predict target classes based on feature observations"""
    
    #helper function
    def get_likelihoods(self, x, y):
        """Creates matrix of conditional likelihoods for a single feature.
        
        inputs:
        x - array of values representing observations in one feature column
        y - array of values containing the target labels corresponding to the feature column
        
        returns:
        2x2 matrix of conditional probabilities (feature given target) such that:
        row number = feature value
        col number = target value
        
        eg:
        [P(f=0 | t=0), P(f=0 | t=1)]
        [P(f=1 | t=0), P(f=1 | t=1)]"""
            
            
        self.p_f1_given_t0 = x[y==0].mean()
        self.p_f0_given_t0 = 1 - self.p_f1_given_t0
        
        self.p_f1_given_t1 = x[y==1].mean()
        self.p_f0_given_t1 = 1 - self.p_f1_given_t1
        
        self.feature_likelihoods = np.ndarray([2,2])
        
        self.feature_likelihoods[0,0] = self.p_f0_given_t0
        self.feature_likelihoods[0,1] = self.p_f0_given_t1
        self.feature_likelihoods[1,0] = self.p_f1_given_t0
        self.feature_likelihoods[1,1] = self.p_f1_given_t1
            
        return self.feature_likelihoods
    
    #helper function
    def predict_single_obs(self, X):
        """Predicts the target label for a single observation given feature values.
        
        input:
        X - array of feature values for a single observation
        
        returns:
        1 or 0 - the predicted label for this observation, given the feature values
        """
        
        self.target0_likelies = [likely_array[0, X[i]] for i, likely_array in enumerate(self.likelihoods)]
        self.target1_likelies = [likely_array[1, X[i]] for i, likely_array in enumerate(self.likelihoods)]
        
        self.posterior0 = self.prior0 * np.prod(self.target0_likelies)
        self.posterior1 = self.prior1 * np.prod(self.target1_likelies)
        
        if self.posterior0 > self.posterior1:
            return 0
        else:
            return 1
    
    
    
    def fit(self, data):
        """Trains the Naive Bayes model on a matrix of feature/target data.
        
        inputs:
        data - numpy matrix with target values in first column, feature values in subsequent columns
        
        returns:
        self
        """
        
        #splits data into target array and feature matrix
        self.target = data[:,0]
        self.features = data[:,1:]
        
        #predicts prior probability for both 0 and 1 target labels
        self.prior1 = self.target.mean()
        self.prior0 = 1 - self.prior1
        
        self.likelihoods = []
        
        #gets matrix of conditional probabilities for each feature as related to the target
        for feature in self.features.T:
            self.feature_likelihood_matrix = self.get_likelihoods(feature, self.target)
            self.likelihoods.append(self.feature_likelihood_matrix)
        
        return self
    
    
    def predict(self, Xs):
        '''Predicts the target label for one or more observations given feature values.
        
        input:
        Xs - an array (single observation) or matrix (multiple observations) of feature values to base
             prediction(s) on
             
        returns:
        predictions - array of the predicted class label for each observation'''
        
        self.num_features = Xs.shape[0]
        self.predictions = np.zeros(self.num_features, dtype=int)
        
        for index, observation in enumerate(Xs):
            self.single_prediction = self.predict_single_obs(observation)
            self.predictions[index] = self.single_prediction
        
        return self.predictions

In [60]:
train = create_dataset(n_cols=3, prob=(0.5, 0.7, 0.2), seed=420)
test = np.random.randint(0, high=2, size=[5,10])

In [63]:
mb = MyBayes()

In [64]:
mb.fit(train)

<__main__.MyBayes at 0x10b9594a8>

In [65]:
mb.predict(test)

array([0, 1, 0, 0, 0])