In [1]:
import numpy as np

class DATA():
    def __init__(self, matrix, label):
        self.matrix = matrix
        self.label = label    

In [28]:
trn_data = {}
prior = {}

# Training Data
with open("optdigits-orig_train.txt") as f:
    matrix = []
    for line in f:
        if len(line) != 3:
            matrix.append(line.rstrip())
        else:
            class_idx = int(line.rstrip())
            observation = DATA(matrix, line.rstrip())
            if class_idx not in trn_data:
                trn_data[class_idx] = [observation]
            else:
                trn_data[class_idx] = trn_data[class_idx] + [observation]
            matrix = []

# Prior Probability for each digit
total = 0
for i in range(10):
    total += len(trn_data[i])
for i in range(10):
    prior[i] = len(trn_data[i]) / total
prior

# Testing Data
tst_data = {}
with open("optdigits-orig_test.txt") as f:
    matrix = []
    for line in f:
        if len(line) != 3:
            matrix.append(line.rstrip())
        else:
            class_idx = int(line.rstrip())
            observation = DATA(matrix, line.rstrip())
            if class_idx not in trn_data:
                tst_data[class_idx] = [observation]
            else:
                tst_data[class_idx] = trn_data[class_idx] + [observation]
            matrix = []

In [29]:
# Calculating all likelihood probability from trainning data
# P(f_ij = 0 | class)
# P(f_ij = 1 | class)
feature_dict_1 = {}
feature_dict_0 = {}
for i in range(10):
    feature_dict_1[i] = np.zeros((32, 32))
    feature_dict_0[i] = np.zeros((32, 32))

for class_idx in range(10):
    digit_data = trn_data[class_idx]
    feature_prob_1 = feature_dict_1[class_idx] 
    feature_prob_0 = feature_dict_0[class_idx] 
    for i in range(32):
        for j in range(32): 
            f = 0
            num = 0
            for digit in digit_data:
                if digit.matrix[i][j] == "1":
                    f += 1
                num += 1
            # Laplace Smoothing
            feature_prob_1[i][j] = (f+1)/(num+1)
            feature_prob_0[i][j] = (num - f + 1)/(num+1)

In [33]:
def cor_rate(actual, predict):
    sum = 0
    for value in predict:
        if value == actual:
            sum += 1
    return sum / len(predict)

In [31]:
# Get all the predicted label
pred_label = {}
for idx in range(10):
    digits_data = tst_data[idx]
    label = []
    for digit in digits_data:
        prob = np.zeros((10,))
        for class_idx in range(10):
            feature_class_1 = feature_dict_1[class_idx]
            feature_class_0 = feature_dict_0[class_idx]
            log_sum = np.log(prior[class_idx])
            for i in range(32):
                for j in range(32):
                    if digit.matrix[i][j] == "0":
                        log_sum += np.log(feature_class_0[i][j])
                    else:
                        log_sum += np.log(feature_class_1[i][j])

            prob[class_idx] = log_sum
            
        label.append(int(np.argmax(prob)))
        
    pred_label[idx] = label

In [36]:
# Calculating the classification rate
correct_rate = np.zeros((10,))
for i in range(10):
    correct_rate[i] = cor_rate(i, pred_label[i])
correct_rate

array([0.98755187, 0.88047809, 0.92712551, 0.94444444, 0.91322314,
       0.88655462, 0.9916318 , 0.98406375, 0.95258621, 0.88888889])

In [30]:
# testing for a single test data
trial = tst_data[9][1]
for row in trial.matrix:
    print(row)

prob = np.zeros((10,))

for class_idx in range(10):
    feature_class_1 = feature_dict_1[class_idx]
    feature_class_0 = feature_dict_0[class_idx]
    log_sum = np.log(prior[class_idx])
    for i in range(32):
        for j in range(32):
            if trial.matrix[i][j] == "0":
                log_sum += np.log(feature_class_0[i][j])
            else:
                log_sum += np.log(feature_class_1[i][j])
                
    prob[class_idx] = log_sum

print(" ")
print(prob)
print("Predicted label:", np.argmax(prob))

00000000000000000000000100000000
00000000000000000111111111000000
00000000000000001111111111100000
00000000000111111111111111000000
00000000001111111111111111000000
00000000111111111111111111000000
00000001111111100001111110000000
00000001111110000001111100000000
00000011111100000011111100000000
00000001111000001111111000000000
00000011111000111111110000000000
00000011111111111111111000000000
00000011111111111111111000000000
00000000111111111111111100000000
00000000111111100001111100000000
00000000000000000001111110000000
00000000000000000000111110000000
00000000000000000000111110000000
00000000000000000000111110000000
00000000000000000000111110000000
00000000000000000000111110000000
00000000000000000001111110000000
00000000000000000001111110000000
00000000000000000001111100000000
00000000000000000001111000000000
00000000000000000011111000000000
00000000000000000111111000000000
00000000000000001111110000000000
00000000000000001111110000000000
00000000000000001111100000000000
0000000000