In [1]:
import numpy as np
import pandas as pd
import math
import scipy.linalg as linalg

In [2]:
data_set_labels = np.genfromtxt("hw02_labels.csv")

In [3]:
data_set_images = np.genfromtxt("hw02_images.csv",delimiter=',')

In [4]:
#groups the data and their labels
x_train = data_set_images[:30000,:]
y_train = data_set_labels[:30000]
x_test = data_set_images[30000:,:]
y_test = data_set_labels[30000:]

In [5]:
K = np.max(y_train).astype(int)
N = x_train.shape[0]

In [6]:
#calculates the sample means
sample_means= np.array([])
for i in range(K):
    for j in range(x_train.shape[1]):
        mean=np.mean(x_train[y_train==(i+1)][:,j])
        sample_means=np.append(sample_means,mean)
        
sample_means=sample_means.reshape(5,784)

In [7]:
print(sample_means)

[[254.99866667 254.98416667 254.85616667 ... 254.679      254.87816667
  254.95933333]
 [254.99733333 254.99733333 254.9965     ... 254.96883333 254.99216667
  254.98866667]
 [254.99933333 254.99933333 254.99233333 ... 251.52483333 254.4725
  254.97483333]
 [254.99666667 254.98983333 254.91416667 ... 252.39516667 254.44166667
  254.93666667]
 [254.999      254.98433333 254.93783333 ... 250.673      253.23333333
  254.79083333]]


In [8]:
#calculates the sample deviations
sample_deviations= np.array([])
for i in range(K):
    for j in range(x_train.shape[1]):
        std=np.sqrt(np.mean((x_train[y_train==(i+1)][:,j]-sample_means[i,j])**2))
        sample_deviations=np.append(sample_deviations,std)
        
sample_deviations=sample_deviations.reshape(5,784)

In [9]:
print(sample_deviations)

[[ 0.09127736  0.25609108  1.31090756 ...  5.29826629  3.9117332
   1.93959091]
 [ 0.2065419   0.2065419   0.2163818  ...  1.04076669  0.47057267
   0.70062226]
 [ 0.05163547  0.04081939  0.16002465 ... 18.43665868  6.7881694
   1.1061344 ]
 [ 0.18436076  0.21617116  1.81046936 ... 15.67799977  6.34549162
   1.79971911]
 [ 0.04471018  0.64582342  3.03248555 ... 23.62576428 13.9167006
   4.4727787 ]]


In [10]:
#calculates the class priors
class_priors = [np.mean(y_train==(c+1)) for c in range(K)]

In [11]:
print(class_priors)

[0.2, 0.2, 0.2, 0.2, 0.2]


In [12]:
#calculates the gscore functions for each class
def g_calc(x,mu,std,prior):
    scores = np.array([])
    for i in range(x.shape[0]):
        for c in range(K):
            score = np.array(np.sum(-0.5*np.log(2*math.pi*mu[c]**2)-0.5*(x[i]-mu[c])**2/std[c]**2 + np.log(prior[c])))
            scores = np.append(scores,score)
    return scores.reshape(x.shape[0],K)

In [13]:
y_train_pred = g_calc(x_train,sample_means,sample_deviations,class_priors)

In [14]:
#confusion matrix for the training part
y_train_predicted = np.argmax(y_train_pred, axis = 1) + 1
confusion_matrix = pd.crosstab(y_train_predicted, y_train, rownames = ['y_pred'], colnames = ['y_truth'])
print(confusion_matrix)

y_truth   1.0   2.0   3.0   4.0   5.0
y_pred                               
1        4436   583    16  1103    20
2         224  4035   173    74    96
3         123   775  4704  1867    33
4         971   574   933  2450   102
5         246    33   174   506  5749


In [15]:
y_test_pred = g_calc(x_test,sample_means,sample_deviations,class_priors)

In [16]:
#confusion matrix for the test part
y_test_predicted = np.argmax(y_test_pred, axis = 1) + 1
confusion_matrix1 = pd.crosstab(y_test_predicted, y_test, rownames = ['y_pred'], colnames = ['y_truth'])
print(confusion_matrix1)

y_truth  1.0  2.0  3.0  4.0  5.0
y_pred                          
1        736   91    0  199    3
2         45  711   23   13   18
3         19  112  814  289    4
4        143   79  135  416   20
5         57    7   28   83  955
