In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import random
from itertools import chain
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import os
from chart_studio.plotly import plot, iplot
from time import time

In [None]:
pip install chart_studio

Collecting chart_studio
  Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[?25l[K     |█████                           | 10 kB 19.4 MB/s eta 0:00:01[K     |██████████▏                     | 20 kB 9.0 MB/s eta 0:00:01[K     |███████████████▎                | 30 kB 7.6 MB/s eta 0:00:01[K     |████████████████████▍           | 40 kB 7.3 MB/s eta 0:00:01[K     |█████████████████████████▍      | 51 kB 5.6 MB/s eta 0:00:01[K     |██████████████████████████████▌ | 61 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████████| 64 kB 1.9 MB/s 
Installing collected packages: chart-studio
Successfully installed chart-studio-1.1.0


In [9]:
# Load the MNIST dataset through Keras and unpack it into train and test pairs.
# These four module-level names are read directly by Utility.__init__, which
# flattens every sample in X_train / X_test to 784 values and keeps
# Y_train / Y_test as the label arrays.
from keras.datasets import mnist
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [10]:
class Utility:
    """Data holder and helper routines for the MNIST Naive Bayes experiments.

    Construction flattens the notebook-level ``X_train`` / ``X_test`` arrays to
    rows of 784 float64 values and stores them together with their labels.
    The methods build confusion-matrix reports, run k-fold cross-validation
    over smoothing values, and draw the diagnostic plots.
    """

    def __init__(self):
        """Read the module-level MNIST arrays and store flattened float64 copies."""
        # One row of 784 values per sample (28 x 28 pixels flattened).
        self.train = np.asarray(X_train).reshape(len(X_train), 784).astype('float64')
        self.train_label = np.asarray(Y_train)
        self.test = np.asarray(X_test).reshape(len(X_test), 784).astype('float64')
        self.test_label = np.asarray(Y_test)
        self.n_class = list(range(10))

    def confusionMatrix(self, actual, predict, print_cfm=True, print_err_digit=True):
        """Build the confusion matrix and the per-class / overall error rates.

        Parameters
        ----------
        actual : sequence of true labels.
        predict : sequence of predicted labels, same length as ``actual``.
        print_cfm : when True, display the confusion matrix.
        print_err_digit : when True, display the per-digit error table and
            print the overall error.

        Returns
        -------
        (cfm, tab_error, err_all) where ``cfm`` is the confusion matrix as a
        DataFrame (rows = actual, columns = predicted, ordered like
        ``self.n_class``), ``tab_error`` is a DataFrame with columns
        ``'digit'`` and ``'error per digit in %'``, and ``err_all`` is the
        overall error in percent rounded to 4 decimals.
        """
        # Passing the label list pins the matrix to one row/column per class in
        # self.n_class, even when a class is missing from this particular split.
        counts = confusion_matrix(actual, predict, labels=self.n_class)
        cfm = pd.DataFrame(counts)
        err_all = round((1 - np.trace(counts) / len(predict)) * 100, 4)

        row_totals = counts.sum(axis=1)
        error_digit = []
        for i in range(len(self.n_class)):
            if row_totals[i] == 0:
                # No actual samples of this class: the error rate is undefined.
                error_digit.append(float('nan'))
            else:
                # Scale to percent before rounding to avoid float artefacts
                # such as 1.2300000000000002.
                error_digit.append(round((1 - counts[i, i] / row_totals[i]) * 100, 2))

        tab_error = pd.DataFrame({'digit': self.n_class,
                                  'error per digit in %': error_digit})

        if print_cfm:
            print('Testing Confusion Matrix: Actual vs. Prediction')
            display(cfm)

        if print_err_digit:
            print('% error per digit')
            display(tab_error)
            print('\nThe overall testing error is {}%'.format(err_all))

        return cfm, tab_error, err_all

    def nFoldCV_NB(self, train, train_label, smoothings, kFolds):
        """Cross-validate the Naive Bayes classifier over several smoothing values.

        The samples are split once into ``kFolds`` folds; the same folds are
        reused for every smoothing value so the results are comparable.

        Parameters
        ----------
        train : array of samples, indexable by a list of row indices.
        train_label : array of labels aligned with ``train``.
        smoothings : iterable of smoothing values to evaluate.
        kFolds : number of folds.

        Returns
        -------
        list with the mean validation error (in percent) for each smoothing
        value, in the order of ``smoothings``.
        """
        folds = self.folds_stratify(nSample=len(train), kFolds=kFolds)

        # The index lists do not depend on the smoothing value, so build them once.
        splits = []
        for k in range(kFolds):
            fit_idx = list(chain.from_iterable(folds[:k] + folds[k + 1:]))
            splits.append((fit_idx, folds[k]))

        ave_test_err = []
        for sVal in smoothings:
            test_err = []
            for fit_idx, held_out in splits:
                te_lb = train_label[held_out]
                nb = NaiveBayes(train=train[fit_idx], train_lb=train_label[fit_idx],
                                test=train[held_out], test_lb=te_lb, smoothing=sVal)
                nb.predict()
                _, _, err = self.confusionMatrix(te_lb, nb.pred, False, False)
                test_err.append(err)

            ave_test_err.append(np.mean(test_err))

        return ave_test_err

    def folds_stratify(self, nSample, kFolds, seed=None):
        """Shuffle ``range(nSample)`` and cut it into ``kFolds`` contiguous folds.

        Despite the name, the split is purely random: labels are not consulted.
        Fold sizes differ by at most one sample.

        Parameters
        ----------
        nSample : total number of samples.
        kFolds : number of folds (positive integer).
        seed : optional seed for a private random generator. When None the
            module-level ``random`` state is used, so ``random.seed`` applies.

        Returns
        -------
        list of ``kFolds`` lists of integer indices that together cover
        ``range(nSample)`` exactly once.

        Raises
        ------
        ValueError if ``kFolds`` is smaller than 1.
        """
        if kFolds < 1:
            raise ValueError('kFolds must be a positive integer, got {}'.format(kFolds))

        indices = list(range(nSample))
        if seed is None:
            random.shuffle(indices)
        else:
            random.Random(seed).shuffle(indices)

        # The first `extra` folds take one additional sample each.
        base, extra = divmod(nSample, kFolds)
        stratify = []
        start = 0
        for k in range(kFolds):
            stop = start + base + (1 if k < extra else 0)
            stratify.append(indices[start:stop])
            start = stop

        return stratify

    @staticmethod
    def CV_plot(k_error, k_list, title):
        """Plot cross-validation error against the tuned parameter and report the best value.

        Parameters
        ----------
        k_error : sequence of errors (in percent), one per candidate value.
        k_list : sequence of candidate parameter values, aligned with ``k_error``.
        title : parameter name used in the axis label, plot title and messages.

        Returns
        -------
        (best_k, err_best_k): the candidate with the smallest error (first one
        on ties) and that error rounded to 4 decimals.
        """
        k_error = np.round(k_error, 4)
        best_k = k_list[np.argmin(k_error)]
        err_best_k = np.min(k_error)

        fig, ax = plt.subplots()
        # The colour is given once, through the keyword, to avoid a conflicting format string.
        ax.plot(k_list, k_error, '-D', color='black')
        # Ticks and labels are set separately so this works regardless of
        # whether set_xticks accepts labels positionally.
        ax.set_xticks(k_list)
        ax.set_xticklabels([str(k) for k in k_list])
        ax.set_ylabel('% Error')
        ax.set_xlabel(title)
        ax.set_title('Plot of ' + title + ' vs. average 5 foldCV error')
        plt.show()
        print('The best value of ' + title + ' is {} with an error of {}%'.format(best_k, err_best_k))
        print('\n' + title + ' = {} will be used to model the entire training set and prediction on testing set:'.format(best_k))
        return best_k, err_best_k

    @staticmethod
    def images_plot(imageData, prob=True):
        """Show ten per-class vectors side by side as 28 x 28 images.

        Parameters
        ----------
        imageData : indexable with ten entries, one array per class.
        prob : when True each entry holds 784 values and is reshaped directly;
            when False each entry holds 785 values and the first one is
            dropped before reshaping.
        """
        classes = ["P(x|c={})".format(c) for c in range(10)]

        # One axes per class, created up front so no axes overlap.
        fig, axes = plt.subplots(1, len(classes), figsize=(15, 2))
        for y, (ax, cls) in enumerate(zip(axes, classes)):
            values = np.asarray(imageData[y])
            if prob:
                image = values.reshape((28, 28))
            else:
                image = values.reshape(785, 1)[1:785].reshape((28, 28))
            ax.imshow(image)
            ax.axis("off")
            ax.set_title(cls)

        fig.tight_layout()
        plt.show()

#End of Class Utility

class NaiveBayes:
    """Gaussian Naive Bayes classifier with additive variance smoothing.

    Each feature is modelled per class by a normal distribution whose mean and
    standard deviation are estimated from the training samples; ``smoothing``
    is added to every per-feature variance. Class priors are estimated but not
    used for prediction (equal priors are assumed).

    Parameters
    ----------
    train, train_lb : training samples (one row per sample) and their labels.
    test, test_lb : samples to classify and their labels; ``test_lb`` is only
        used for its length.
    smoothing : value added to each per-feature variance.
    jupyter : selects ``iplot`` (True) or ``plot`` (False) for the optional chart.
    plot_dis : when True, ``mean_std_prior`` draws the class distribution.
    """

    def __init__(self, train, train_lb, test, test_lb, smoothing, jupyter=True, plot_dis=False):
        self.n_class = np.unique(train_lb)
        self.tr = train
        self.te = test
        self.tr_lb = train_lb
        self.te_lb = test_lb
        self.plot_dis = plot_dis
        self.jupyter = jupyter
        self.smoothing = smoothing
        self.plotly = iplot if jupyter else plot

    def mean_std_prior(self):
        """Estimate per-class count, prior, feature means and feature standard deviations.

        Fills ``self.count``, ``self.priors``, ``self.mean`` and ``self.std``,
        each a list ordered like ``self.n_class``. When ``self.plot_dis`` is
        set, the class counts are also sent to ``self.plotly`` as a bar chart.
        """
        samples = np.asarray(self.tr)
        labels = np.asarray(self.tr_lb)

        self.mean, self.std, self.priors, self.count = [], [], [], []
        for val in self.n_class:
            # Plain boolean mask: wrapping it in a list is rejected by current NumPy.
            sep = labels == val
            n_val = int(np.count_nonzero(sep))
            self.count.append(n_val)
            self.priors.append(n_val / len(labels))
            self.mean.append(np.mean(samples[sep], axis=0))
            self.std.append(np.std(samples[sep], axis=0))

        if self.plot_dis:
            # A plain dict figure keeps this branch independent of graph-object
            # classes that are not imported in this notebook.
            fig = {
                'data': [{'type': 'bar',
                          'x': list(range(len(self.count))),
                          'y': self.count}],
                'layout': {'title': 'Class distribution in Train set',
                           'xaxis': {'title': 'classes', 'dtick': 1},
                           'yaxis': {'title': 'counts'}},
            }
            self.plotly(fig)

    def predict(self):
        """Classify every test sample and record the intermediate quantities.

        Sets:
        - ``self.pred``: predicted class label per test sample.
        - ``self.likelihood``: per test sample, a list with one array per class
          holding the per-feature Gaussian densities.
        - ``self.logsum_chk``: per test sample, the summed log-density per class.
        - ``self.end_time``: elapsed wall-clock seconds, training included.

        A zero variance (constant feature with ``smoothing == 0``) yields
        non-finite values, as no guard is applied.
        """
        str_time = time()
        self.mean_std_prior()
        self.pred = []
        self.likelihood = []
        self.logsum_chk = []

        # Per-class terms that do not depend on the test sample.
        variances = [np.square(std) + self.smoothing for std in self.std]
        log_norms = [-0.5 * np.log(2 * np.pi * var) for var in variances]

        test = np.asarray(self.te)
        for n in range(len(self.te_lb)):
            sample = test[n]
            classifier = []
            likelih = []
            for mean, var, log_norm in zip(self.mean, variances, log_norms):
                # Log-density computed analytically so tiny densities do not
                # underflow to zero before the logarithm is taken.
                log_prob = log_norm - np.square(sample - mean) / (2 * var)
                # The prior is deliberately left out: equal priors are assumed.
                classifier.append(np.sum(log_prob))
                likelih.append(np.exp(log_prob))

            # Map the winning position back to its class label.
            self.pred.append(self.n_class[int(np.argmax(classifier))])
            self.likelihood.append(likelih)
            self.logsum_chk.append(classifier)

        self.end_time = time() - str_time


In [None]:
# Seed the shuffle that builds the folds so the cross-validation errors are
# reproducible after a kernel restart.
random.seed(42)
util = Utility()
# Candidate smoothing values: 500, 600, ..., 2000.
smoothings_nb = list(range(500, 2100, 100))
# Mean 5-fold validation error (in %) per candidate, in the order of smoothings_nb.
kfold_nb = util.nFoldCV_NB(util.train, util.train_label, smoothings_nb, kFolds=5)



In [None]:
# Plot the cross-validated error against the smoothing value.
# sm_plot receives the (best value, its error) pair returned by CV_plot.
sm_plot = Utility.CV_plot(kfold_nb, smoothings_nb, 'smoothing')

In [None]:

# Refit on the complete training split with the smoothing value that gave the
# lowest cross-validation error, then classify the test split.
util = Utility()
best_s = smoothings_nb[np.argmin(kfold_nb)]
nb = NaiveBayes(
    train=util.train,
    train_lb=util.train_label,
    test=util.test,
    test_lb=util.test_label,
    smoothing=best_s,
)
nb.predict()
# confusionMatrix returns (matrix, per-digit error table, overall error in %).
conf_matix = util.confusionMatrix(util.test_label, nb.pred)

In [None]:

# nb.likelihood[2] is the list of per-class likelihood arrays that
# NaiveBayes.predict stored for the test sample at index 2; draw one image per class.
Utility.images_plot(nb.likelihood[2])

In [None]:
# Same per-class likelihood view, this time for the test sample at index 9997.
Utility.images_plot(nb.likelihood[9997])