# Bernoulli Naive Bayes

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
plt.style.use('seaborn')

In [2]:
import numpy as np
import sklearn
import matplotlib
import pandas as pd
import sys
libraries = (('Matplotlib', matplotlib), ('Numpy', np), ('Pandas', pd))

print("Python Version:", sys.version, '\n')
for lib in libraries:
    print('{0} Version: {1}'.format(lib[0], lib[1].__version__))

Python Version: 3.6.2 |Anaconda, Inc.| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.13.1
Pandas Version: 0.20.3


In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict

class bernoulli_naive_bayes:
    
    def __init__(self, smoothing = 1.):
        self._prob_by_class = defaultdict(float)
        self._cond_probs = defaultdict(lambda: defaultdict(float))
        self._data_cols = None
        self._smoothing = smoothing
    
    def fit(self, X, y):
        X = self.pandas_to_numpy(X)
        y = self.pandas_to_numpy(y)
        if not self._data_cols:
            try: 
                self._data_cols = X.shape[1]
            except IndexError:
                self._data_cols = 1
        X = self.check_feature_shape(X)
                
        self._classes = np.unique(y)
        for cl in self._classes:
            self._prob_by_class[cl] = len(y[y == cl])/len(y)
            denom = len(y[y == cl])
            filt = (y == cl)
            filtered_data = X[filt]
            for col in range(self._data_cols):
                binarized_column = filtered_data.T[col] > 0
                self._cond_probs[cl][col] = (np.sum(binarized_column)+self._smoothing)/(denom+self._smoothing) 
                
    def predict(self, X):
        X = self.pandas_to_numpy(X)
        X = self.check_feature_shape(X)
        X = (X > 0).astype(int) # convert to 1 or 0
        results = []
        for row in X:
            beliefs = []
            for cl in self._classes:
                prob_for_class = self._prob_by_class[cl]
                for col in range(self._data_cols):
                    p = self._cond_probs[cl][col]
                    prob_for_class *= (p*row[col] + (1-p)*(1-row[col]))
                beliefs.append([cl, prob_for_class])
            sort_beliefs = sorted(beliefs, key=lambda x: x[1], reverse=True)
            results.append(sort_beliefs[0][0])
        return results
    
    def score(self, X, y):
        """
        Uses the predict method to measure the accuracy of the model.
        ---
        In: X (list or array), feature matrix; y (list or array) labels
        Out: accuracy (float)
        """
        pred = self.predict(X)
        correct = 0
        for i,j in zip(y,pred):
            if i == j:
                correct+=1
        return float(correct)/float(len(y))
      
    def check_feature_shape(self, X):
        """
        Helper function to make sure any new data conforms to the fit data shape
        ---
        In: numpy array, (unknown shape)
        Out: numpy array, shape: (rows, self.data_cols)"""
        return X.reshape(-1,self._data_cols)
            
    
    def pandas_to_numpy(self, x):
        """
        Checks if the input is a Dataframe or series, converts to numpy matrix for
        calculation purposes.
        ---
        Input: X (array, dataframe, or series)
        
        Output: X (array)
        """
        if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()):
            return np.array(x)
        if type(x) == type(np.array([1,2])):
            return x
        return np.array(x)

In [4]:
from sklearn.datasets import load_iris
def get_data():
    votes = [0,1]
    senators = np.random.choice(votes, replace=True, size=(100,4))
    df = pd.DataFrame(senators, columns=['vote1','vote2','vote3','vote4'])
    
    def calculate_party(row):
        x = row['vote1']
        y = row['vote2']
        z = row['vote3']

        party = 0.7*x + 0.5*y - z + np.random.normal(0,0.3)
        if party > 0.1:
            return 'Dem'
        elif party > 0.01:
            return 'Ind'
        else:
            return 'Rep'
    
    df['party'] = df.apply(calculate_party,axis=1)
    print(df.party.value_counts())
    return df.iloc[:,:-1],df.iloc[:,-1]
    

In [5]:
X, y = get_data()

Dem    52
Rep    44
Ind     4
Name: party, dtype: int64


In [7]:
nb = bernoulli_naive_bayes()
nb.fit(X.iloc[:90],y.iloc[:90])

In [8]:
nb._cond_probs

defaultdict(<function __main__.bernoulli_naive_bayes.__init__.<locals>.<lambda>>,
            {'Dem': defaultdict(float,
                         {0: 0.65957446808510634,
                          1: 0.63829787234042556,
                          2: 0.19148936170212766,
                          3: 0.55319148936170215}),
             'Ind': defaultdict(float,
                         {0: 0.40000000000000002,
                          1: 0.80000000000000004,
                          2: 0.40000000000000002,
                          3: 0.40000000000000002}),
             'Rep': defaultdict(float,
                         {0: 0.41463414634146339,
                          1: 0.31707317073170732,
                          2: 0.85365853658536583,
                          3: 0.53658536585365857})})

In [9]:
nb._prob_by_class

defaultdict(float,
            {'Dem': 0.5111111111111111,
             'Ind': 0.044444444444444446,
             'Rep': 0.4444444444444444})

In [10]:
nb.predict(X.iloc[0:2])

['Rep', 'Rep']

In [11]:
nb.score(X.iloc[90:],y.iloc[90:])

0.8

In [12]:
from sklearn.naive_bayes import BernoulliNB

nb_sk = BernoulliNB()
nb_sk.fit(X.iloc[:90],y.iloc[:90])
nb_sk.score(X.iloc[90:],y.iloc[90:])

0.80000000000000004