### Import Packages

In [233]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

### Create Necessary Functions

In [2]:
def bessel_var(X):
    """Return the unbiased (Bessel-corrected) sample variance of ``X``.

    The sum of squared deviations from the mean is divided by ``n - 1``
    rather than ``n``.

    Parameters
    ----------
    X : array_like
        Sample values; every element is used.

    Returns
    -------
    float
        The sample variance. With fewer than two values there are no
        degrees of freedom left and numpy yields ``nan`` (with a warning).
    """
    # ddof=1 is Bessel's correction. Dividing np.var(X) (which already
    # divides by n) by (n - 1) again would shrink the estimate by ~n.
    return np.var(X, ddof=1)

In [39]:
def get_class_probs(y):
    """Compute the relative frequency of every distinct label in ``y``.

    Parameters
    ----------
    y : np.ndarray
        Class labels, one per sample.

    Returns
    -------
    np.ndarray
        One frequency per distinct label, in the order produced by
        ``np.unique(y)``.
    """
    n_samples = len(y)
    return np.array([np.sum(y == label) / n_samples for label in np.unique(y)])

In [241]:
def get_class_dist(X, y, col_type):
    """Estimate per-class mean and variance of the continuous columns.

    Parameters
    ----------
    X : np.ndarray
        Training features, shape ``(n_samples, n_features)``.
    y : np.ndarray
        Class label of each row of ``X``.
    col_type : sequence of str
        One entry per column of ``X``; only columns marked
        ``'continuous'`` are used. May be a list or an array.

    Returns
    -------
    tuple of np.ndarray
        ``(mu, sigma_sq)``, each of shape ``(n_classes, n_continuous)``.
        Rows follow the order of ``np.unique(y)``; columns follow the order
        of the continuous columns in ``X``. ``sigma_sq`` is computed with
        ``bessel_var``.
    """
    # Build the mask element by element: comparing a plain Python list with
    # a string gives a single False instead of a per-column mask.
    continuous_mask = np.array([kind == 'continuous' for kind in col_type], dtype=bool)
    X = X[:, continuous_mask]
    classes = np.unique(y)
    mu = np.zeros([len(classes), X.shape[1]])
    sigma_sq = np.zeros([len(classes), X.shape[1]])
    for i, label in enumerate(classes):
        in_class = (y == label)
        for j in range(X.shape[1]):
            mu[i, j] = np.mean(X[in_class, j])
            sigma_sq[i, j] = bessel_var(X[in_class, j])
    return (mu, sigma_sq)

In [268]:
def get_category_probs(X, y, new_X, k, row, col):
    """Return the in-class frequency of a new sample's category value.

    Parameters
    ----------
    X : np.ndarray
        Training features.
    y : np.ndarray
        Training labels, one per row of ``X``.
    new_X : np.ndarray
        Samples being scored.
    k : int
        Index of the class within ``np.unique(y)``.
    row, col : int
        Position in ``new_X`` of the value to look up.

    Returns
    -------
    float
        Fraction of class-``k`` training rows whose column ``col`` equals
        ``new_X[row, col]``.
    """
    target_class = np.unique(y)[k]
    in_class = (y == target_class)
    observed_value = new_X[row, col]
    matches = (X[in_class, col] == observed_value)
    return np.mean(matches)

In [14]:
def gaussian_prob(v, mu, sigma_sq):
    """Evaluate the normal density with mean ``mu`` and variance ``sigma_sq`` at ``v``.

    Parameters
    ----------
    v : float or np.ndarray
        Point(s) at which to evaluate the density.
    mu : float or np.ndarray
        Mean of the distribution.
    sigma_sq : float or np.ndarray
        Variance of the distribution.

    Returns
    -------
    float or np.ndarray
        ``exp(-(v - mu)^2 / (2 * sigma_sq)) / sqrt(2 * pi * sigma_sq)``.
    """
    normaliser = 1 / np.sqrt(2 * np.pi * sigma_sq)
    deviation = v - mu
    exponent = -1 * deviation ** 2 / (2 * sigma_sq)
    return normaliser * np.exp(exponent)

In [280]:
def calc_probs(X, y, new_X, col_type):
    """Score every row of ``new_X`` against every class.

    Each score is the class prior multiplied by the product of the
    per-column likelihoods (Gaussian density for ``'continuous'`` columns,
    in-class category frequency for ``'categorical'`` columns). The scores
    are not normalised across classes.

    Parameters
    ----------
    X : np.ndarray
        Training features, shape ``(n_samples, n_features)``.
    y : np.ndarray
        Training labels, one per row of ``X``.
    new_X : np.ndarray
        Samples to score, with the same column layout as ``X``.
    col_type : sequence of str
        ``'continuous'`` or ``'categorical'`` for each column. Columns with
        any other value contribute no factor to the product.

    Returns
    -------
    np.ndarray
        Shape ``(len(new_X), n_classes)``; columns follow ``np.unique(y)``.
    """
    # Work with an array so element-wise comparisons behave the same whether
    # the caller passed a list or an ndarray.
    col_type = np.asarray(col_type)
    is_continuous = np.array([kind == 'continuous' for kind in col_type], dtype=bool)
    # mu / sigma_sq only hold the continuous columns, so translate each
    # feature index into its position among the continuous columns.
    continuous_pos = np.cumsum(is_continuous) - 1

    mu, sigma_sq = get_class_dist(X, y, col_type)
    class_probs = get_class_probs(y)
    n_classes = len(np.unique(y))
    probs = np.zeros([new_X.shape[0], n_classes])
    for row in range(new_X.shape[0]):
        for k in range(n_classes):
            row_probs = []
            for col in range(new_X.shape[1]):
                if col_type[col] == 'continuous':
                    j = continuous_pos[col]
                    row_probs.append(gaussian_prob(new_X[row, col], mu[k, j], sigma_sq[k, j]))
                elif col_type[col] == 'categorical':
                    row_probs.append(get_category_probs(X, y, new_X, k, row, col))
            # np.prod of an empty list is 1.0, leaving just the prior.
            probs[row, k] = class_probs[k] * np.prod(row_probs)
    return probs

In [102]:
def select_class(probs, y):
    """Pick the highest-scoring class label for each row of ``probs``.

    Parameters
    ----------
    probs : np.ndarray
        Scores of shape ``(n_rows, n_classes)``; column ``i`` corresponds to
        ``np.unique(y)[i]``.
    y : np.ndarray
        Labels from which the class ordering is derived.

    Returns
    -------
    np.ndarray
        One label per row of ``probs``. Ties go to the first (lowest-index)
        class, as with ``np.argmax``.
    """
    labels = np.unique(y)
    chosen = []
    for best in np.argmax(probs, axis=1):
        chosen.append(labels[best])
    return np.array(chosen)

In [204]:
def naive_bayes_predict(train_X, train_y, test_X, col_types):
    """Predict a class label for each row of ``test_X``.

    Scores every test row against every class with ``calc_probs`` and
    returns, per row, the label chosen by ``select_class``.

    Parameters
    ----------
    train_X, train_y : np.ndarray
        Training features and their labels.
    test_X : np.ndarray
        Samples to classify.
    col_types : sequence of str
        Type tag for each feature column, forwarded to ``calc_probs``.

    Returns
    -------
    np.ndarray
        Predicted label for each row of ``test_X``.
    """
    return select_class(calc_probs(train_X, train_y, test_X, col_types), train_y)

### Create Downsampling Function

In [298]:
def downsample(X, y, larger_class, smaller_class, random_state=None):
    """Balance two classes by randomly subsampling the larger one.

    Parameters
    ----------
    X : np.ndarray
        Features, shape ``(n_samples, n_features)``.
    y : np.ndarray
        One-dimensional labels, one per row of ``X``.
    larger_class, smaller_class : label
        The class to subsample and the class whose size it is reduced to.
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState`` so the subsample is
        reproducible. ``None`` (default) draws from numpy's global random
        state, as before.

    Returns
    -------
    tuple of np.ndarray
        ``(new_X, new_y)``: the sampled rows of ``larger_class`` followed by
        all rows of ``smaller_class``. Rows of any other class are dropped.

    Raises
    ------
    ValueError
        Raised by numpy when ``larger_class`` has fewer rows than
        ``smaller_class`` (sampling is without replacement).
    """
    larger_idx = np.where(y == larger_class)[0]
    smaller_idx = np.where(y == smaller_class)[0]
    # The np.random module and a RandomState instance both expose `choice`.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    kept_larger_idx = rng.choice(larger_idx, size=len(smaller_idx), replace=False)
    # Select X and y with the same index vector so rows and labels stay aligned.
    keep_idx = np.concatenate([kept_larger_idx, smaller_idx])
    return (X[keep_idx, :], y[keep_idx])

### Create Data

In [306]:
# Small hand-made dataset (three feature columns, binary labels), left
# commented out; re-enable it to sanity-check the functions above by hand.
# train_X = np.array([[10, 13, 0],
#               [12, 15, 0],
#               [11, 17, 1],
#               [12, 15, 0],
#               [20, 16, 1],
#               [19, 17, 1],
#               [21, 13, 0],
#               [21, 15, 1],
#               [20, 14, 1]])

# train_y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1])

# test_X = np.array([[16, 16, 1],
#                    [19, 15, 0]])

# test_y = np.array([0, 1])

# Datasets bundled with scikit-learn; loaded here but not referenced in the
# rest of this cell.
breast_cancer = datasets.load_breast_cancer()
wine = datasets.load_wine()
iris = datasets.load_iris()

# Breast-cancer data read from a local file into a plain ndarray.
# NOTE(review): read_csv treats the first line as a header by default. If this
# .data file has no header row, the first record is silently dropped - confirm,
# and pass header=None if so.
breast_cancer_cat = pd.read_csv('./Data/Naive Bayes/breast-cancer.data').values

# Optional class balancing (disabled): column 0 is the label, the rest are features.
# X, y = downsample(breast_cancer_cat[:, 1:], breast_cancer_cat[:, 0], 'no-recurrence-events', 'recurrence-events')

# 70/30 train/test split with a fixed seed; column 0 is used as the label and
# the remaining columns as features.
train_X, test_X, train_y, test_y = train_test_split(breast_cancer_cat[:, 1:], breast_cancer_cat[:, 0],
                                                    test_size = .3, random_state = 5)


### Predict New Data

In [307]:
# Every feature column is tagged as categorical for this dataset.
preds = naive_bayes_predict(
    train_X, train_y, test_X,
    ['categorical'] * train_X.shape[1],
)
print(preds)
# Fraction of test labels predicted correctly (shown as the cell's output).
np.mean(test_y == preds)

['no-recurrence-events' 'no-recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'no-recurrence-events'
 'recurrence-events' 'no-recurrence-events' 'recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'recurrence-events'
 'recurrence-events' 'no-recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 'no-recurrence-events'
 'no-recurrence-events' 'no-recurrence-events' 

0.7790697674418605