In [19]:
import numpy as np
import pandas as pd
from MnistDataloader import MnistDataloader
from oneNNClassifier import oneNNClassifier
from utilities import random_sample
from os.path  import join
import timeit
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import json
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
input_path = './dataset/'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte')

In [3]:
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()
x_train = np.array([np.hstack(x).astype(np.float32) for x in x_train])
x_test = np.array([np.hstack(x).astype(np.float32) for x in x_test])
y_train = np.array(y_train, np.float32)
y_test = np.array(y_test, np.float32)

In [4]:
type(x_train[0][1]), type(y_train[0])

(numpy.float32, numpy.float32)

In [5]:
len(x_train), len(y_train), len(x_test), len(y_test), len(x_train[0]), y_train[0]

(60000, 60000, 10000, 10000, 784, 5.0)

In [6]:
def perform_pca(X, n_components=50):
    pca = PCA(n_components=n_components, random_state=42)
    X_reduced = pca.fit_transform(X)
    return X_reduced, pca

# Selects k prototypes from X_reduced using the farthest-point (k-center) clustering approach. 
# Returns the indices of selected prototypes in the original array.
def farthest_point_clustering(X_reduced, k):
    n_samples = X_reduced.shape[0]
    
    # Pick a random point as the first center
    first_index = np.random.randint(n_samples)
    selected_indices = [first_index]
    
    # Distances[i] will store the distance of point i from the nearest selected center
    distances = np.full(n_samples, np.inf)
    
    # Update initial distances
    distances = np.linalg.norm(X_reduced - X_reduced[first_index], axis=1)

    # Iteratively add new centers
    for _ in range(k - 1):
        # Find the point that is farthest from any selected center
        next_index = np.argmax(distances)
        selected_indices.append(next_index)

        # Update the distances to the newly added center
        new_center = X_reduced[next_index]
        dist_to_new_center = np.linalg.norm(X_reduced - new_center, axis=1)
        
        # For each point, keep the distance to the closest center
        distances = np.minimum(distances, dist_to_new_center)

    return selected_indices

# Performs PCA for dimensionality reduction.
# For each class, runs farthest-point clustering on that class's points to select prototypes.
def select_k_centers_per_class(X, y, k_per_class=10, n_components=50):

    # Reduce dimensionality of the entire dataset
    X_reduced, pca_model = perform_pca(X, n_components=n_components)
    
    unique_classes = np.unique(y)
    selected_indices = []

    for c in unique_classes:
        # Extract the indices for class c
        class_indices = np.where(y == c)[0]
        X_c_reduced = X_reduced[class_indices]

        # Run farthest-point clustering in the reduced space
        k_center_indices = farthest_point_clustering(X_c_reduced, k_per_class)

        # Map back to overall dataset indices
        selected_indices_c = class_indices[k_center_indices]
        selected_indices.extend(selected_indices_c)
    
    return selected_indices

In [None]:
# train set is sampled using M/10 prototypes per class

# sample_sizes = [10, 20, 30, 40, 50]
reduced_dims = [3, 4, 8, 16, 64, 128, 256, 512, 784]
sample_sizes = [100, 500, 1000, 2000, 5000, 10000]
storage = {} 
execution_data = {d:[] for d in reduced_dims}

for dim in reduced_dims:
    for M in sample_sizes:
        # Sample prototypes
        selected_idxs = select_k_centers_per_class(x_train, y_train, k_per_class= int(M/10), n_components=dim)
        x_sample, y_sample = x_train[selected_idxs], y_train[selected_idxs]

        # Model
        model = oneNNClassifier(x_sample, y_sample)
        
        # Timing
        elapsed_time = timeit.timeit(lambda: model.predict(x_test, size=M, storage=storage, weighted=False), 
                                number=1)
        # Accuracy
        accuracy = accuracy_score(y_test, storage[M])

        print(f"Sample size: {M}, Accuracy: {accuracy:.2f}, Execution time: {elapsed_time:.4f} seconds")
        # Store Data
        execution_data[dim].append({"sample_size": M, "time": elapsed_time, "accuracy": accuracy})

Sample size: 100, Accuracy: 0.51, Execution time: 1.5603 seconds
Sample size: 500, Accuracy: 0.78, Execution time: 7.1632 seconds
Sample size: 1000, Accuracy: 0.84, Execution time: 14.2292 seconds
Sample size: 2000, Accuracy: 0.88, Execution time: 28.4232 seconds
Sample size: 5000, Accuracy: 0.92, Execution time: 70.8696 seconds
Sample size: 10000, Accuracy: 0.94, Execution time: 141.1148 seconds
Sample size: 100, Accuracy: 0.50, Execution time: 1.5245 seconds
Sample size: 500, Accuracy: 0.74, Execution time: 7.1687 seconds
Sample size: 1000, Accuracy: 0.81, Execution time: 14.1669 seconds
Sample size: 2000, Accuracy: 0.87, Execution time: 28.1235 seconds
Sample size: 5000, Accuracy: 0.91, Execution time: 70.3498 seconds
Sample size: 10000, Accuracy: 0.94, Execution time: 141.0023 seconds
Sample size: 100, Accuracy: 0.46, Execution time: 1.5222 seconds
Sample size: 500, Accuracy: 0.66, Execution time: 7.1536 seconds
Sample size: 1000, Accuracy: 0.76, Execution time: 14.0480 seconds
Sam

In [13]:
with open("execution_data_farthest_point_clustering.json", "w") as file:
    json.dump(execution_data, file, indent=4)

In [15]:
standard = StandardScaler()
x_train_scaled = standard.fit_transform(x_train)

In [20]:
minmax = MinMaxScaler()
x_train_scaled = minmax.fit_transform(x_train)

In [21]:
x_train_scaled

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [22]:
# train set is sampled using M/10 prototypes per class

# sample_sizes = [10, 20, 30, 40, 50]
reduced_dims = [3, 4, 8, 16, 64, 128, 256, 512, 784]
sample_sizes = [100, 500, 1000, 2000, 5000, 10000]
storage = {} 
execution_data = {d:[] for d in reduced_dims}

for dim in reduced_dims:
    print('Reduction Dimension:', dim)
    for M in sample_sizes:
        # Sample Prototypes
        selected_idxs = select_k_centers_per_class(x_train_scaled, y_train, k_per_class= int(M/10), n_components=dim)
        x_sample, y_sample = x_train_scaled[selected_idxs], y_train[selected_idxs]

        # Model
        model = oneNNClassifier(x_sample, y_sample)

        # Timing
        elapsed_time = timeit.timeit(lambda: model.predict(x_test, size=M, storage=storage, weighted=False), 
                                number=1)
        # Accuracy
        accuracy = accuracy_score(y_test, storage[M])

        print(f"\tSample size: {M}, Accuracy: {accuracy:.2f}, Execution time: {elapsed_time:.4f} seconds")
        # Store Data
        execution_data[dim].append({"sample_size": M, "time": elapsed_time, "accuracy": accuracy})

Reduction Dimension: 3
	Sample size: 100, Accuracy: 0.45, Execution time: 1.7318 seconds
	Sample size: 500, Accuracy: 0.63, Execution time: 7.8626 seconds
	Sample size: 1000, Accuracy: 0.64, Execution time: 16.0565 seconds
	Sample size: 2000, Accuracy: 0.58, Execution time: 32.7266 seconds
	Sample size: 5000, Accuracy: 0.64, Execution time: 88.2672 seconds
	Sample size: 10000, Accuracy: 0.68, Execution time: 173.6080 seconds
Reduction Dimension: 4
	Sample size: 100, Accuracy: 0.54, Execution time: 1.8358 seconds
	Sample size: 500, Accuracy: 0.64, Execution time: 8.3858 seconds
	Sample size: 1000, Accuracy: 0.63, Execution time: 16.7313 seconds
	Sample size: 2000, Accuracy: 0.57, Execution time: 33.4350 seconds
	Sample size: 5000, Accuracy: 0.70, Execution time: 83.6004 seconds
	Sample size: 10000, Accuracy: 0.65, Execution time: 172.7699 seconds
Reduction Dimension: 8
	Sample size: 100, Accuracy: 0.37, Execution time: 1.7950 seconds
	Sample size: 500, Accuracy: 0.44, Execution time: 8.

In [None]:
with open("execution_data_farthest_point_clustering_scaled.json", "w") as file:
    json.dump(execution_data, file, indent=4)