In [1]:
import numpy
import math
def bayesian(inp,label):
    input_size = inp.shape[0]
    no_of_features = inp.shape[1]
    parameters = {}
    parameters['label'] = label
    parameters['means'] = numpy.zeros(no_of_features)
    parameters['covariances'] = numpy.zeros((no_of_features,no_of_features))
    
    mu = numpy.zeros(no_of_features)
    for i in range(input_size):
        mu += inp[i]
    mu = mu/input_size
    for j in range(no_of_features):
        for k in range(no_of_features):
            parameters['covariances'][j][k] = sum((inp[:,j]-mu[j])*(inp[:,k]-mu[k]))
    
    parameters['means'] = mu
    parameters['covariances'] /= input_size
    
    return (parameters)

In [2]:
def separate_data(inp,inp_label,label):
    data = []
    length = inp.shape[0]
    no_of_features = inp.shape[1]
    label_size = label.shape[0]
    for t in range(label_size):
        data.append(numpy.zeros((1,no_of_features)))
        for i in range(length-1,-1,-1):
            if (inp_label[i] == label[t]):
                data[t] = numpy.insert(data[t],[0],inp[i],axis = 0)
        data[t] = numpy.delete(data[t],data[t].shape[0]-1,axis = 0)
    return data

In [3]:
def get_parameter(data,labels):
    parameters = []
    for i in range(len(data)):
        parameters.append(bayesian(data[i],labels[i]))
    return parameters

In [13]:
from mnist import MNIST
import numpy as np
import random
mndata = MNIST('fMNIST')
mndata.gz = True
images, labels = mndata.load_training()

image1, label1 = mndata.load_testing()

print(len(images))

60000


In [21]:
from numpy import array
from numpy import mean
from numpy import cov
import numpy
from numpy.linalg import eig
# define a matrix
 

# calculate the mean of each column
M = mean(np.array(images).T.tolist(), axis=1)
M1 = mean(np.array(image1).T.tolist(), axis=1)
# center columns by subtracting column means

C = images - M
C1 = image1 - M1
# calculate covariance matrix of centered matrix
V = cov(C.T)
V1 = cov(C1.T)
# eigendecomposition of covariance matrix
values, vectors = eig(V)
value1,vector1 = eig(V1)
print(np.shape(vectors))

vectors = vectors[:np.shape(vectors)[0],0:50]
vector1 = vectors[:np.shape(vector1)[0],0:50]
# project data
P = vectors.T.dot(C.T)
P1 = vector1.T.dot(C1.T)

(784, 784)


In [22]:
print (P.T.shape)

(60000, 80)


In [23]:
labels = numpy.array(labels)
label1 = numpy.array(label1)
print (label1.shape)
P = numpy.real(P)
P1 = numpy.real(P1)
print (P1.T)

(10000,)
[[-1491.64251248   647.62336821   271.07341521 ...    -9.63497682
     15.09653488   -19.05139179]
 [ 1868.79227753  1068.99856796  -768.19394734 ...    22.41717051
    193.22694614   -15.74045292]
 [  381.25019335 -1543.9188205    445.63800283 ...   -20.28044108
     18.95873067    16.94919679]
 ...
 [ -515.67146259  -733.02923574   219.27603407 ...  -117.71949521
    -25.84369303   126.19822396]
 [ -226.02698884 -1419.04053866   441.37674826 ...    41.29479864
     32.76558173    18.35770909]
 [-1524.56070569    87.33388266  -263.547119   ...   -36.47556919
    108.62054404    47.29423231]]


In [24]:
possible_labels = numpy.array(range(10))
print (possible_labels)
data = separate_data(P.T,labels,possible_labels)
#get_parameters(data,parameters,possible_labels)

[0 1 2 3 4 5 6 7 8 9]


In [25]:
parameters = get_parameter(data,possible_labels)
print (parameters)

[{'label': 0, 'means': array([ 6.85822396e+02, -6.10317773e+02,  4.80127109e+01,  2.68951098e+02,
       -8.51035578e+01,  6.64172946e+02,  7.83623109e+01, -2.58629364e+01,
       -1.11899914e+01,  2.75101026e+01,  4.78820601e+01, -2.83565420e+01,
        5.94775706e+01, -1.23942654e+01,  4.23953690e+01,  2.51203908e+01,
        4.37049231e+01,  1.47931514e+01, -1.87323667e+01, -3.94916018e+01,
        9.56204497e+00, -4.08935700e+00, -3.18170358e+01, -1.65012985e+01,
        5.73711793e+00, -2.22083456e+00,  1.79074367e+01,  3.92838409e+00,
        1.91086622e+01, -1.34387303e+01, -1.27680849e+01, -6.34689434e+00,
       -4.85813495e+00,  2.66868559e+00, -7.38835781e+00, -7.91669085e+00,
       -1.43272916e+00,  6.08071482e-01, -3.32532184e+00, -2.96370866e+00,
       -4.77773625e+00,  8.41969397e+00, -1.22463369e+01,  2.52049294e-01,
       -1.14037986e+00, -2.27659730e+00,  3.33556992e-01, -8.45923323e+00,
       -2.95742084e+00, -2.58251898e+00,  2.40929478e+00, -1.90719849e+00,
  

In [26]:
from numpy.linalg import inv
from numpy.linalg import det
def predict(x,parameters):
    out = ""
    m = 0
    val = 0
    no_of_features = len(x)
    no_of_class = len(parameters)
    for i in range(no_of_class):
        mu = parameters[i]['means']
        prob = 0
        diff = x-mu
        sigma = parameters[i]['covariances']
        prob += math.exp(-1*numpy.dot(numpy.dot(diff,inv(sigma)),diff.transpose())/2) / (math.sqrt((2*math.pi)**3 * det(sigma)))
        if ( prob > val ):
            val = prob
            m = i
    out = parameters[m]['label']
    return out

In [27]:
#function to calculate accuracy of any input vector
def calc_accuracy(inp,inp_label,parameter):
    accuracy = 0
    for i in range(inp.shape[0]):
        if (inp_label[i] == predict(inp[i].astype(float),parameter)): #  if true label matches predicted label
            accuracy += 1                                              # accuracy increases by 1
    return accuracy/inp.shape[0]                                       # normalize accuracy with total data

In [None]:
accuracy = calc_accuracy(P1.T,label1,parameters)
print (accuracy)

  r = _umath_linalg.det(a, signature=signature)
