In [507]:
import numpy as np

class DATA():
    def __init__(self, matrix, label):
        self.matrix = matrix
        self.label = label   
        self.feature = None

In [508]:
def import_data(source, classified):
    if classified:
        digits_data = {}
    else:
        digits_data = []
        
    with open(source) as f:
        matrix = []
        vector = []
        for line in f:
            if len(line) > 10:
                for element in line.rstrip():  
                    vector.append(int(element))
                matrix.append(line.rstrip())
            else:
                class_idx = int(line.rstrip())
                observation = DATA(matrix, class_idx)
                observation.feature = np.array(vector)
                
                if classified:
                    if class_idx not in digits_data:
                        digits_data[class_idx] = [observation]
                    else:
                        digits_data[class_idx] = digits_data[class_idx] + [observation]
                    matrix = []
                    vector = []    
                else:
                    digits_data.append(observation)
                    matrix = []
                    vector = []
                    
    return digits_data

In [617]:
def eval_prior(trn_data):
    prior = {}
    total = 0
    for i in range(10):
        total += len(trn_data[i])
    for i in range(10):
        prior[i] = len(trn_data[i]) / total
    return prior 

def likelihood(trn_data, target):
    feature_dict = {}
    for i in range(10):
        feature_dict[i] = np.zeros((32, 32))       
    for class_idx in range(10):
        digit_data = trn_data[class_idx]
        feature_prob = feature_dict[class_idx] 
        for i in range(32):
            for j in range(32): 
                f = 0
                num = 0
                for digit in digit_data:
                    if digit.matrix[i][j] == target:
                        f += 1
                    num += 1
                # Laplace Smoothing
                feature_prob[i][j] = (f+1)/(num+1)
                
    return feature_dict

def prediction(tst_data, prior, feature_dict_0, feature_dict_1):
    pred_label = {}
    for idx in range(10):
        digits_data = tst_data[idx]
        label = []
        for digit in digits_data:
            prob = np.zeros((10,))
            for class_idx in range(10):
                feature_class_1 = feature_dict_1[class_idx]
                feature_class_0 = feature_dict_0[class_idx]
                log_sum = np.log(prior[class_idx])
                for i in range(32):
                    for j in range(32):
                        if digit.matrix[i][j] == "0":
                            log_sum += np.log(feature_class_0[i][j])
                        else:
                            log_sum += np.log(feature_class_1[i][j])

                prob[class_idx] = log_sum
                
            label.append(int(np.argmax(prob)))
        pred_label[idx] = label
    return pred_label

def cor_rate(actual, predict):
    sum = 0
    for value in predict:
        if value == actual:
            sum += 1
    return sum / len(predict)

def correct_rate(pred_label):
    rate = np.zeros((10,))
    for i in range(10):
        rate[i] = cor_rate(i, pred_label[i])
    return rate

def correct_rate_overall(pred_label):
    hit = 0
    total = 0
    for i in range(10):
        for label in pred_label[i]:
            if label == i:
                hit += 1
            total += 1
    return hit / total

In [510]:
trn_data = import_data("optdigits-orig_train.txt", True)
tst_data = import_data("optdigits-orig_test.txt", True)
prior = eval_prior(trn_data)

## Part 1.1: _Naive Bayes Classifier:_

In [511]:
feature_dict_0 = likelihood(trn_data, "0")
feature_dict_1 = likelihood(trn_data, "1")
pred_label = prediction(tst_data, prior, feature_dict_0, feature_dict_1)

correct_rate(pred_label)

array([0.97222222, 0.93333333, 0.85365854, 0.90909091, 0.88135593,
       0.93103448, 0.97674419, 0.9787234 , 1.        , 0.92857143])

## Part 2.1: _Perceptron:_

Apply the multi-class (non-differentiable) perceptron learning rule from lecture to the digit classification problem from Part 1.1. As before, the basic feature set consists of a single binary indicator feature for each pixel. Specifically, the feature $F_{i,j}$ indicates the status of the (i,j)-th pixel. Its value is 1 if the pixel contains value 1, and 0 if it is 0. The images are of size 32*32, so there are 1024 features in total. For a multi-class perceptron, you need to learn a weight vector for each digit class. Each component of a weight vector corresponds to the weight of a pixel, which makes it of length either 1024 (without bias) or 1025 (with bias).

To get your results, you should tune the following parameters (it is not necessary to separately report results for multiple settings, only report which options you tried and which one worked the best):

- Learning rate decay function;
- Bias vs. no bias;
- Initialization of weights (zeros vs. random);
- Ordering of training examples (fixed vs. random);
- Number of epochs.

In [619]:
from random import randint
import random

def prediction(tst_data, w):
    pred = {}
    for class_idx in range(10):
        tst_data_i = tst_data[class_idx]
        label = []
        for data in tst_data_i:
            c = np.zeros((10,))
            for i in range(10):
                c[i] = data.feature @ w[i]
            label.append(int(np.argmax(c)))
        pred[class_idx] = label 
        
    return pred

def cor_rate(actual, predict):
    sum = 0
    for value in predict:
        if value == actual:
            sum += 1
    return sum / len(predict)

def correct_rate(pred_label):
    rate = np.zeros((10,))
    for i in range(10):
        rate[i] = cor_rate(i, pred_label[i])
    return rate

def correct_rate_overall(pred_label):
    hit = 0
    total = 0
    for i in range(10):
        for label in pred_label[i]:
            if label == i:
                hit += 1
            total += 1
    return hit / total

In [626]:
# Import overall training data
trn_data_unclassified = import_data("optdigits-orig_train.txt", classified = False)
trn_data = import_data("optdigits-orig_train.txt", True)
tst_data = import_data("optdigits-orig_test.txt", True)

- _**Implementing the Perceptron:**_

In [629]:
results = {}

# Tuning Parameters:
w = np.zeros((10, 1024))
#w = np.random.rand(10, 1024)

epochs = 10
bias = 500
random_ordering = True

# Start:
for n in range(epochs):
    train_data = trn_data_unclassified[:]
    if random_ordering:
        random.shuffle(train_data)
    
    for idx, digit in enumerate(train_data):
        eta = 1 / (0.05 * idx + 1) # Learning rate decay function
        
        for class_idx in range(10):
            if digit.label == class_idx:
                y = 1
            else:
                y = -1
            if (digit.feature @ w[class_idx] + bias) * y <= 0:
                w[class_idx] = w[class_idx] + eta * y * digit.feature
    
    pred = prediction(trn_data, w)
    rate_overall = correct_rate_overall(pred)
    results[n] = rate_overall

    
# Get predicted label for each test data
pred = prediction(tst_data, w)

# Get correction rate
correct_rate_overall(pred)

0.9481981981981982

In [630]:
results

{0: 0.9351395730706076,
 1: 0.9330870279146142,
 2: 0.9421182266009852,
 3: 0.9466338259441708,
 4: 0.9490968801313628,
 5: 0.9573070607553367,
 6: 0.9610016420361248,
 7: 0.9601806239737274,
 8: 0.9642857142857143,
 9: 0.9679802955665024}

- _**Get predicted label and calculate correction rate**_:

In [614]:
w = np.random.rand(10, 1024)

w[1]

array([0.35468167, 0.27812407, 0.67124813, ..., 0.1138031 , 0.06116693,
       0.53499403])

In [230]:
# i = 5
# print("Label:", trn_data[i].label)
# trn_data[i].matrix

In [442]:
# # initialization of weights zero
# w = np.zeros((10,1024))

# w

In [127]:
# # eta = 1/n
# def learning_rate(n):
#     return 1/n

In [128]:
# def perceptron(epochs, w, bias, ordering, trn_data):
#     # For each digits training class
#     for i in range(10):
#         w_i = w[i]
#         trn_data_i = trn_data[i]
        
#         for n in range(epochs):
#             eta = learning_rate(n+1)
#             cur_digit = trn_data_i[n]
#             if cur_digit.feature @ w_i + bias <= 0:
#                 w = w + eta * cur_digit.feature_vector
    
#     epochs = 0
#     eta = learning_rate
    

In [500]:
# np.random.rand(10,)