In [4]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
np.set_printoptions(threshold=np.nan, precision=2)

In [5]:
class Digit(object):
    '''
        one digit object represents one class of tokens;
        digit: 0--9;
        tokens: list of matrix/image;
    '''
    def __init__(self, digit):
        '''
            digit: 0--9
            tokens: all the related images
            matrix_zero: the frequency matrix of 0
            matrix_one: the frequency matrix of 1
        '''
        self.digit = digit
        self.tokens = []
        self.matrix_one = np.zeros((32, 32))
        self.matrix_zero = np.zeros((32, 32))
        self.likelihoods_zero = np.zeros((32, 32))
        self.likelihoods_one = np.zeros((32, 32))

    def add_token(self, image):
        '''
            image is raw data, a list of str-lists;
            ["00000000000000", "0000001000000"]
            Add a image to the list;
        '''
        image = list(map(lambda x: x.rstrip(), image)) # Erase "\n"
        image2str = "".join(image)
        str2list = [int(i) for i in image2str]
        matrix = np.array(str2list).reshape(32, 32)
        self.tokens.append(matrix)
        return matrix

    def print_tokens(self):
        '''
            Print all the tokens in this class.
        '''
        for i in self.tokens:
            print(i)

    def print_token(self, index):
        '''
            Print a specific token in this class.
        '''
        print(self.tokens[index])

    def number_of_tokens(self):
        return len(self.tokens)

    def frequency_matrix(self):
        '''
            Calculate the # of times pixel (i, j) has value 0/1
            return two matrice
        '''
        for token in self.tokens:
            for i in range(32):
                for j in range(32):
                    if token[i][j] == 1:
                        self.matrix_one[i][j] += 1
                    else:
                        self.matrix_zero[i][j] += 1
        # print(self.matrix_one)
        return self.matrix_zero, self.matrix_one

    def likelihoods(self, laplace=0.1):
        '''
            Calculate the matrix of likelihoods for the class
            return two matrice.
        '''
        self.frequency_matrix()
        V = len(self.tokens)
        self.likelihoods_zero = self.matrix_zero
        self.likelihoods_one = self.matrix_one

        self.likelihoods_zero = (self.likelihoods_zero + laplace) / (laplace*V + V)
        self.likelihoods_one  = (self.likelihoods_one + laplace) / (laplace*V + V)

        return self.likelihoods_zero, self.likelihoods_one


In [50]:
class NB(object):
    def __init__(self, training_data, testing_data):
        '''
            training_data is a dictionary of all the classes(0--9) from training_data. Details are denoted in the Digit.py
            testing_data is a dictionary of the classes(0--9) from testing_data. Details are denoted in the Digit.py
                key: digit index
                value: digit object
            priors is a dictionary of priors probability for each digit.
                key: digit index
                value: probability
            posteriori is a dictionary of posteriori probability of each digit.
                key: digit index
                value: probability
        '''
        self.training_data = training_data
        self.testing_data = testing_data
        self.priors = {}
        self.likelihoods_zero = {}
        self.likelihoods_one = {}
        self.posteriori = {}

    def calculate_priors(self):
        '''
            Calculate priors probability for each digit, according to training_data set.
            To make life easier, log them.
        '''
        total_count = 0
        for index, digit_object in self.training_data.items():
            self.priors[index] = digit_object.number_of_tokens()
            total_count += digit_object.number_of_tokens()
        for k, v in self.priors.items():
            self.priors[k] /= total_count
            self.priors[k] = math.log(self.priors[k])
        return self.priors

    def all_likelihoods(self, laplace):
        '''
            Obtain the likehoods from the training data set.
            First loop: obtain the matrix of likelihoods for each digit.
            Second & Third loop: log them.
        '''
        self.calculate_priors()
        for class_idx, class_obj in self.training_data.items():
            self.likelihoods_zero[class_idx], self.likelihoods_one[class_idx] = class_obj.likelihoods(laplace)
            # print(self.likelihoods_zero[class_idx])
        for k, v in self.likelihoods_zero.items():
            self.likelihoods_zero[k] = np.log(v)
        for k, v in self.likelihoods_one.items():
            self.likelihoods_one[k] = np.log(v)

    def calculate_posteriori(self, image, digit):
        '''
            Posteriori:
                P(class|observed) = log(prior) + log(specific likelihood)
            Given digit(prior), calclulate posteriori for this image.
                Observation is this image. To be specific, each pixel.
        '''
        posteriori = self.priors[digit]
        # print(self.priors)
        for (x, y), value in np.ndenumerate(image):
            if value == 0:
                posteriori += self.likelihoods_zero[digit][x][y]
            else:
                posteriori += self.likelihoods_one[digit][x][y]
        # print(posteriori)
        return posteriori

    def MAP_evaluation(self, image):
        '''
            For one image, what is the posteriori of each image? Return the greatest prediction.
        '''
        posteriori_list = []
        # for image in self.testing_data[digit].tokens:
        for digit in range(10):
            posteriori_list.append(self.calculate_posteriori(image, digit))
        return np.argmax(np.array(posteriori_list) )

    def confusion_matrix(self, laplace=0.1):
        self.all_likelihoods(laplace)
        confusion_matrix = np.zeros((10, 10))
        for class_idx, obj_idx in self.testing_data.items():
            prediction_count = np.zeros(10)
            for image in obj_idx.tokens:
                prediction = self.MAP_evaluation(image)
                prediction_count[prediction] += 1
            prediction_percent = prediction_count / prediction_count.sum()
            confusion_matrix[class_idx] = prediction_percent
        return confusion_matrix

    def odds_ratio(self, a, b):
        self.all_likelihoods(0.1)
        fig, ax = plt.subplots(1, 3)
        pcm = ax[0].imshow(self.likelihoods_one[a], cmap="jet", vmin=-3, vmax=3)
        fig.colorbar(pcm, ax=ax[0])
        pcm = ax[1].imshow(self.likelihoods_one[b], cmap="jet")
        fig.colorbar(pcm, ax=ax[1])
        pcm = ax[2].imshow(self.likelihoods_one[a]-self.likelihoods_one[b], cmap="jet", vmin=-3, vmax=3)
        fig.colorbar(pcm, ax=ax[2])
        plt.savefig("foo.png")
        fig.show()
        
    def accuracy(self, laplace):
        matrix = self.confusion_matrix(laplace)
        output = []
        for i in range(10):
            output.append(matrix[i][i])
        return output


In [51]:
training_data = load_training()
testing_data = load_testing()

nb = NB(training_data, testing_data)

In [44]:
matrix = nb.confusion_matrix()

pd.DataFrame(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.972222,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0
1,0.0,0.933333,0.0,0.0,0.0,0.0,0.0,0.022222,0.022222,0.022222
2,0.0,0.0,0.853659,0.0,0.0,0.0,0.0,0.0,0.121951,0.02439
3,0.0,0.0,0.0,0.909091,0.0,0.0,0.0,0.030303,0.0,0.060606
4,0.0,0.0,0.0,0.0,0.881356,0.0,0.0,0.067797,0.050847,0.0
5,0.0,0.0,0.0,0.0,0.0,0.931034,0.0,0.0,0.0,0.068966
6,0.0,0.0,0.0,0.0,0.023256,0.0,0.976744,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,0.02381,0.0,0.0,0.0,0.02381,0.0,0.952381


In [45]:
data = [0.1 + i for i in range(11)]
laplace_test = {}
for i in data:
    laplace_test[i] = nb.accuracy(i)

pd.DataFrame(laplace_test)

Unnamed: 0,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1
0,0.972222,0.972222,0.972222,0.972222,0.972222,0.972222,0.972222,0.972222,0.972222,0.972222,0.972222
1,0.933333,0.933333,0.933333,0.933333,0.933333,0.933333,0.933333,0.933333,0.933333,0.933333,0.933333
2,0.853659,0.853659,0.853659,0.853659,0.853659,0.853659,0.853659,0.853659,0.853659,0.853659,0.853659
3,0.909091,0.909091,0.909091,0.909091,0.909091,0.909091,0.909091,0.909091,0.909091,0.909091,0.909091
4,0.898305,0.881356,0.881356,0.881356,0.881356,0.881356,0.881356,0.881356,0.881356,0.881356,0.881356
5,0.931034,0.931034,0.931034,0.931034,0.931034,0.931034,0.931034,0.931034,0.931034,0.931034,0.931034
6,0.976744,0.976744,0.976744,0.976744,0.976744,0.976744,0.976744,0.976744,0.976744,0.976744,0.976744
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.952381,0.952381,0.952381,0.952381,0.952381,0.952381,0.952381,0.952381,0.952381,0.952381,0.952381


In [54]:
nb.odds_ratio(1, 8)
# This will create local file of likelihoods for 1, 8 and odds_ratio for one over eight.

  "matplotlib is currently using a non-GUI backend, "
