In [1]:
import numpy as np
from MnistDataloader import MnistDataloader
from oneNNClassifier import oneNNClassifier
from utilities import random_sample
from os.path  import join
import timeit
from sklearn.metrics import accuracy_score
import json


In [2]:
# Directory holding the four MNIST IDX files, and the full path to each file.
input_path = './dataset/'
training_images_filepath, training_labels_filepath = (
    join(input_path, filename)
    for filename in ('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
)
test_images_filepath, test_labels_filepath = (
    join(input_path, filename)
    for filename in ('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
)

In [3]:
# Read the train/test splits from the IDX files configured above.
mnist_dataloader = MnistDataloader(
    training_images_filepath,
    training_labels_filepath,
    test_images_filepath,
    test_labels_filepath,
)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

# Join the pieces of every image end to end with np.hstack and store the
# result as float32; each split stays a plain Python list of arrays.
x_train = [np.hstack(image).astype(np.float32) for image in x_train]
x_test = [np.hstack(image).astype(np.float32) for image in x_test]

# Labels become float32 NumPy arrays as well.
y_train = np.array(y_train, dtype=np.float32)
y_test = np.array(y_test, dtype=np.float32)

In [4]:
# Sanity check: element types of one pixel value and one label.
tuple(type(value) for value in (x_train[0][1], y_train[0]))

(numpy.float32, numpy.float32)

In [5]:
# Number of entries in each of the four splits.
tuple(len(split) for split in (x_train, y_train, x_test, y_test))

(60000, 60000, 10000, 10000)

In [6]:
# Length of the first flattened training image, and its label.
x_train[0].shape[0], y_train[0]

(784, 5.0)

In [7]:
# Empty containers for the baseline run: `execution_data` collects timing and
# accuracy, `storage` is handed to `predict`, and `size` is the number of
# training examples (used as the dictionary key for this run).
execution_data, storage = {}, {}
size = len(x_train)

In [8]:
# Baseline: classifier built from the complete training set.
model = oneNNClassifier(x_train, y_train)

# Time one prediction pass over the test set. The predictions are read back
# from `storage[size]` afterwards, so `predict` is expected to have stored
# them there under the `size` key.
elapsed_time = timeit.Timer(
    lambda: model.predict(x_test, size=size, storage=storage)
).timeit(number=1)
accuracy = accuracy_score(y_test, storage[size])

execution_data[size] = dict(time=elapsed_time, accuracy=accuracy)
print(execution_data)

{60000: {'time': 1287.285123041831, 'accuracy': 0.9691}}


In [9]:
# Persist the baseline measurements. JSON object keys are always strings, so
# the integer `size` key is written out as a string.
with open("execution_data_baseline.json", "w") as file:
    file.write(json.dumps(execution_data, indent=4))

In [10]:
# Uniformly weighted, randomly sampled training data.
#
# For each of 5 trials and each sample size M, build a classifier from the
# subset returned by `random_sample`, time one prediction pass over the test
# set, and record accuracy and wall-clock time.
#
# NOTE(review): no random seed is set in this notebook, so results may differ
# between runs unless `random_sample` seeds itself internally — confirm.

# Smaller sizes kept for quick experiments:
# sample_sizes = [1, 10, 20, 30, 40, 50]
sample_sizes = [100, 500, 1000, 2000, 5000, 10000]
storage = {}
execution_data = {}

for trial in range(5):
    execution_data[trial] = []
    print('Trial', trial, ':')

    for M in sample_sizes:
        x_sample, y_sample = random_sample(M, x_train, y_train)
        model = oneNNClassifier(x_sample, y_sample)

        # `storage[M]` is overwritten on every trial; it is read back right
        # after the timed call, so only the current trial's predictions are
        # scored.
        elapsed_time = timeit.Timer(
            lambda: model.predict(x_test, size=M, storage=storage)
        ).timeit(number=1)
        accuracy = accuracy_score(y_test, storage[M])

        print(f"Sample size: {M}, Accuracy: {accuracy:.2f}, Execution time: {elapsed_time:.4f} seconds")
        execution_data[trial].append(
            dict(sample_size=M, time=elapsed_time, accuracy=accuracy)
        )

Trial 0 :
Sample size: 100, Accuracy: 0.70, Execution time: 1.9583 seconds
Sample size: 500, Accuracy: 0.85, Execution time: 9.6544 seconds
Sample size: 1000, Accuracy: 0.88, Execution time: 19.2811 seconds
Sample size: 2000, Accuracy: 0.91, Execution time: 39.5965 seconds
Sample size: 5000, Accuracy: 0.93, Execution time: 107.5737 seconds
Sample size: 10000, Accuracy: 0.95, Execution time: 210.7949 seconds
Trial 1 :
Sample size: 100, Accuracy: 0.69, Execution time: 1.4940 seconds
Sample size: 500, Accuracy: 0.85, Execution time: 7.2783 seconds
Sample size: 1000, Accuracy: 0.88, Execution time: 14.4138 seconds
Sample size: 2000, Accuracy: 0.91, Execution time: 29.0696 seconds
Sample size: 5000, Accuracy: 0.94, Execution time: 78.5522 seconds
Sample size: 10000, Accuracy: 0.95, Execution time: 176.5432 seconds
Trial 2 :
Sample size: 100, Accuracy: 0.72, Execution time: 1.4962 seconds
Sample size: 500, Accuracy: 0.86, Execution time: 7.2991 seconds
Sample size: 1000, Accuracy: 0.89, Exec

In [11]:
# Persist the per-trial measurements of the random-sampling experiment. JSON
# object keys are always strings, so the integer trial keys are written out
# as strings.
with open("execution_data_random.json", "w") as file:
    file.write(json.dumps(execution_data, indent=4))