In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from scipy.stats import multivariate_normal as mvn
from scipy.stats import multinomial as mlvn
from scipy.stats import bernoulli as brn
%matplotlib inline

In [5]:
train_data = np.load('kmnist-train-imgs.npz')['arr_0']
train_labels = np.load('kmnist-train-labels.npz')['arr_0']
test_data = np.load('kmnist-test-imgs.npz')['arr_0']
test_labels = np.load('kmnist-test-labels.npz')['arr_0']

In [6]:
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
test_labels.shape

(60000, 28, 28)
(60000,)
(10000, 28, 28)


(10000,)

In [7]:
norm_train = train_data / np.linalg.norm(train_data)
norm_train_labels = train_labels / np.linalg.norm(train_labels)
norm_test = test_data / np.linalg.norm(test_data)
norm_test_labels = test_labels / np.linalg.norm(test_labels)


In [8]:
x_train = train_data.reshape(-1, 784)
y_train = train_labels
x_test = test_data.reshape(-1, 784)
y_test = test_labels

In [9]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
y_test.shape

(60000, 784)
(60000,)
(10000, 784)


(10000,)

In [10]:
class GaussNB():
    def fit(self, X, y, epsilon = 1e-3):
        self.likelihoods = dict()
        self.priors = dict()
        
        self.K = set(y.astype(int))
        
        for k in self.K:
            X_k = X[y == k,:]
            self.likelihoods[k] = {"mean":X_k.mean(axis=0), "cov":X_k.var(axis=0) + epsilon}
            self.priors[k] = len(X_k)/len(X)
            
    def predict(self, X):
        N, D = X.shape
        
        P_hat = np.zeros((N,len(self.K)))
        
        for k, l in self.likelihoods.items():
            P_hat[:,k] = mvn.logpdf(X, l["mean"], l["cov"]) + np.log(self.priors[k])
        return P_hat.argmax(axis = 1)

In [11]:
gnb = GaussNB()
gnb.fit(x_train,y_train)
y_hat = gnb.predict(x_test)

In [12]:
def accuracy(y, y_hat):
    return np.mean(y == y_hat)

In [13]:
print(f"Accuracy: {accuracy(y_test, y_hat):0.3f}")

Accuracy: 0.466


In [14]:
class GaussBayes():
    def fit(self, X, y, epsilon = 1e-3):
        self.likelihoods = dict()
        self.priors = dict()
        
        self.K = set(y.astype(int))
        
        for k in self.K:
            X_k = X[y == k,:]
            N_k, D = X_k.shape
            mu_k=X_k.mean(axis=0)
            self.likelihoods[k] = {"mean":X_k.mean(axis=0), "cov":(1/(N_k-1))*np.matmul((X_k-mu_k).T,X_k-mu_k)+ epsilon*np.identity(D)}
            self.priors[k] = len(X_k)/len(X)
            
    def predict(self, X):
        N, D = X.shape
        
        P_hat = np.zeros((N,len(self.K)))
        
        for k, l in self.likelihoods.items():
            P_hat[:,k] = mvn.logpdf(X, l["mean"], l["cov"]) + np.log(self.priors[k])
            
        return P_hat.argmax(axis = 1)

In [15]:
GB = GaussBayes()
GB.fit(x_train,y_train)
y2_hat = GB.predict(x_test)

In [16]:
print(f"Accuracy: {accuracy(y_test, y2_hat):0.3f}")

Accuracy: 0.585


In [17]:
class GenGaussBayes():
    def fit(self, X, y, epsilon = 1e-3):
        self.likelihoods = dict()
        self.priors = dict()
        
        self.K = set(y.astype(int))
        
        for k in self.K:
            X_k = X[y == k,:]
            N_k, D = X_k.shape
            mu_k=X_k.mean(axis=0)
            self.likelihoods[k] = {"mean":X_k.mean(axis=0), "cov":(1/(N_k-1))*np.matmul((X_k-mu_k).T,X_k-mu_k)+ epsilon*np.identity(D)}
            self.priors[k] = len(X_k)/len(X)
            
    def predict(self, X,DistFam):
        N, D = X.shape
        
        P_hat = np.zeros((N,len(self.K)))
        
        for k, l in self.likelihoods.items():
            P_hat[:,k] = DistFam(X, l["mean"], l["cov"]) + np.log(self.priors[k])
            
        return P_hat.argmax(axis = 1)

In [18]:
def DSel(dstring):
    if dstring=="Gauss" or dstring== "Gaussian" or dstring=="gauss" or dstring=="gaussian":return mvn.logpdf
    if dstring=="multi" or dstring== "Multinomial" or dstring=="multinomial" or dstring=="Multi":return mlvn.logpmf   



In [19]:

my_dist=mvn.logpdf
GGB = GenGaussBayes()
GGB.fit(x_train,y_train)
y3_hat = GGB.predict(x_test,DSel("gauss"))

In [20]:
print(f"Accuracy: {accuracy(y_test, y3_hat):0.3f}")

Accuracy: 0.585


In [21]:
class GenMultBayes():
    
    def fit(self, X, y,DisStr, epsilon = 1e-3):
        self.likelihoods = dict()
        self.priors = dict()
        
        self.K = set(y.astype(int))
        if DisStr=="Gauss":
        
            for k in self.K:
                X_k = X[y == k,:]
                N_k, D = X_k.shape
                mu_k=X_k.mean(axis=0)
                self.likelihoods[k] = {"mean":X_k.mean(axis=0), "cov":(1/(N_k-1))*np.matmul((X_k-mu_k).T,X_k-mu_k)+ epsilon*np.identity(D)}
                self.priors[k] = len(X_k)/len(X)
            return
        if DisStr=="Multinomial":
            for k in self.K:
                X_k = X[y == k,:]
                N_k, D = X_k.shape
                mu_k=X_k.mean(axis=0)
                #problems here
                self.likelihoods[k] = {"N":N_k, "P":sum(N_k/len(X))}
                self.priors[k] = len(X_k)/len(X)
            
        if DisStr=="Bernoulli":
            for k in self.K:
                X_k = X[y == k,:]
                N_k, D = X_k.shape
               
                self.likelihoods[k] = {"P":N_k/len(X)}
                self.priors[k] = len(X_k)/len(X)
            
    def predict(self, X,DistStr):
        N, D = X.shape
        
        if DisStr=="Gauss":
            P_hat = np.zeros((N,len(self.K)))

            for k, l in self.likelihoods.items():
                P_hat[:,k] = mvn.logpdf(X, l["mean"], l["cov"]) + np.log(self.priors[k])

            return P_hat.argmax(axis = 1)
        
        if DisStr=="Multinomial":
            P_hat = np.zeros((N,len(self.K)))

            for k, l in self.likelihoods.items():
                P_hat[:,k] = mlvn.logpmf(X, l["N"], l["P"]) + np.log(self.priors[k])

            return P_hat.argmax(axis = 1)

        if DisStr=="Bernoulli":
            P_hat = np.zeros((N,len(self.K)))

            for k, l in self.likelihoods.items():
                P_hat[:,k] = bernoulli.logpmf(X,l["P"]) + np.log(self.priors[k])

            return P_hat.argmax(axis = 1)


In [22]:

GMB = GenMultBayes()
GMB.fit(x_train,y_train, DisStr='Multinomial')
y4_hat = GMB.predict(x_test)

TypeError: 'float' object is not iterable

In [25]:
class BernNB():
  def fit(self, X, y, epsilon = 1e-2):
    N, D = X.shape
    self.likelihoods = {}
    self.priors = {}
    self.K = set(y.astype(int))

    for k in self.K:
      X_k = X[y==k,:]
      p = (sum(X_k)+1) / (len(X_k)+2)
      self.likelihoods[k] = {'mean': p, 'cov': p * (1 - p) + epsilon}
      self.priors[k] = len(X_k)/len(X)

  def predict(self, X):
    N, D = X.shape
    P_hat = np.zeros((N, len(self.K)))

    for k,l in self.likelihoods.items():
      # Using the Bernoulli funtion/formula. Trick is to get the matrices/vectors to go from mxn to a 1x1 number for each k value.
      P_hat[:,k] = np.log(self.priors[k]) + np.matmul(X, np.log(l['mean'])) + np.matmul((1 - X), np.log(abs(1-l['mean'])))

    return P_hat.argmax(axis =1)

In [26]:
bnb = BernNB()
bnb.fit(x_train,y_train) # Use the X and Y Training set here
yb_hat = bnb.predict(x_test) # Use the X Test here
print(accuracy(y_test, yb_hat))

  P_hat[:,k] = np.log(self.priors[k]) + np.matmul(X, np.log(l['mean'])) + np.matmul((1 - X), np.log(abs(1-l['mean'])))
  P_hat[:,k] = np.log(self.priors[k]) + np.matmul(X, np.log(l['mean'])) + np.matmul((1 - X), np.log(abs(1-l['mean'])))


0.0948
