In [1]:
import numpy as np
import pandas as pd
from MnistDataloader import MnistDataloader
from oneNNClassifier import oneNNClassifier
from utilities import random_sample
from os.path  import join
import timeit
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import json

In [2]:
# Directory holding the raw MNIST IDX files, relative to the notebook.
input_path = './dataset/'

# Resolve the image/label file of each split against the dataset directory.
training_images_filepath, training_labels_filepath = (
    join(input_path, idx_name)
    for idx_name in ('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
)
test_images_filepath, test_labels_filepath = (
    join(input_path, idx_name)
    for idx_name in ('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
)

In [3]:
# Read the four IDX files into (images, labels) pairs for the train and test splits.
mnist_dataloader = MnistDataloader(
    training_images_filepath,
    training_labels_filepath,
    test_images_filepath,
    test_labels_filepath,
)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

# np.hstack joins the rows of each image into one flat vector, which is then cast to float32.
x_train = [np.hstack(image).astype(np.float32) for image in x_train]
x_test = [np.hstack(image).astype(np.float32) for image in x_test]

# Labels are converted to float32 arrays as well.
y_train, y_test = (np.array(labels, np.float32) for labels in (y_train, y_test))

In [4]:
# Spot-check the scalar types of one feature value and one label after the float32 casts.
type(x_train[0][1]), type(y_train[0])

(numpy.float32, numpy.float32)

In [5]:
# Split sizes, length of the first flattened training image, and the first training label.
len(x_train), len(y_train), len(x_test), len(y_test), len(x_train[0]), y_train[0]

(60000, 60000, 10000, 10000, 784, 5.0)

In [6]:
# One row per image: the flattened pixel columns followed by a trailing 'label' column.
df_train = pd.DataFrame(x_train).assign(label=y_train)
df_test = pd.DataFrame(x_test).assign(label=y_test)

In [7]:
# df_train, df_test

In [8]:
# Number of training rows per label, counted on the first pixel column.
df_train.groupby('label')[0].count()

label
0.0    5923
1.0    6742
2.0    5958
3.0    6131
4.0    5842
5.0    5421
6.0    5918
7.0    6265
8.0    5851
9.0    5949
Name: 0, dtype: int64

In [9]:
# Per-class mean image: average each of the 784 pixel columns within every label.
# Only the pixel columns are selected, so re-running this cell after a 'similarity'
# column has been added to df_train still yields 784-dimensional class means
# (otherwise the extra column would be averaged too and break the similarity step).
grouped_mean = df_train.groupby('label')[list(df_train.columns[:784])].mean()
grouped_mean

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.765682,0.672785,0.258101,0.105188,0.112211,0.019154,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.162044,0.187763,0.188435,0.083712,0.034291,0.0,0.0,0.0,0.0,0.0


In [10]:
# Cosine similarity between every training image and the mean image of its own class.
# A single pairwise call yields an (n_rows, n_classes) matrix; each row then picks the
# column that belongs to its own label. This replaces a row-wise DataFrame.apply that
# invoked cosine_similarity once per training row.
pairwise_similarity = cosine_similarity(
    df_train.iloc[:, :784].to_numpy(),     # pixel columns of every training row
    grouped_mean.iloc[:, :784].to_numpy(),  # pixel columns of every class mean
)

# Position of each row's label (column 784) inside grouped_mean's index.
own_class_position = grouped_mean.index.get_indexer(df_train.iloc[:, 784])
# get_indexer returns -1 for labels missing from grouped_mean; fail loudly instead of
# silently reading the last column.
assert (own_class_position >= 0).all(), "label without a class mean in grouped_mean"

df_train['similarity'] = pairwise_similarity[np.arange(len(df_train)), own_class_position]

In [11]:
# Retain the top M // n_classes rows per label, ranked by 'similarity', using nlargest
def sample_k_means(M, weighted=True, df=None, n_classes=10):
    """Pick, for every label, the rows with the highest 'similarity' value.

    Despite the name, no k-means clustering is performed here: rows are ranked
    by the precomputed 'similarity' column and the best ones per label are kept.

    Parameters
    ----------
    M : int
        Total requested sample size; each label contributes ``M // n_classes`` rows
        (fewer if the label has fewer rows).
    weighted : bool, default True
        When True the similarity values of the selected rows are returned as
        weights; when False the third return value is None.
    df : pandas.DataFrame, optional
        Frame with feature columns plus 'label' and 'similarity' columns.
        Defaults to the notebook-level ``df_train``.
    n_classes : int, default 10
        Number by which ``M`` is divided to obtain the per-label quota.

    Returns
    -------
    X : numpy.ndarray
        Feature columns (everything except 'label' and 'similarity') of the selected rows.
    y : numpy.ndarray
        Labels of the selected rows.
    wei : numpy.ndarray or None
        Similarity values of the selected rows, or None when ``weighted`` is False.

    Rows are ordered by label group, and by descending similarity within a group.
    """
    if df is None:
        df = df_train  # fall back to the notebook-level training frame

    per_class = int(M // n_classes)
    # Columns are selected explicitly so each group passed to apply still carries 'label'.
    df_top_n = df.groupby('label', group_keys=False)[df.columns].apply(
        lambda group: group.nlargest(per_class, 'similarity')
    )

    # Address columns by name rather than by fixed positions 784/785.
    X = df_top_n.drop(columns=['label', 'similarity']).to_numpy()
    y = df_top_n['label'].to_numpy()
    wei = df_top_n['similarity'].to_numpy() if weighted else None
    return X, y, wei

# Request a sample of size M=10000 together with its similarity weights.
x_sample, y_sample, wei = sample_k_means(M=10_000, weighted=True)

In [12]:
# Inspect the similarity values returned as weights for the selected rows.
wei

array([0.95571005, 0.9555477 , 0.9555182 , ..., 0.81702214, 0.8170178 ,
       0.816975  ], dtype=float32)

In [13]:
# Unweighted sweep: for each size M the training subset holds the M/10 rows per label
# that are most similar to their class mean vector; no weights are handed to the model.

# Alternative sweep kept for reference: [10, 20, 30, 40, 50]
sample_sizes = [100, 500, 1000, 2000, 5000, 10000]
storage = {}
execution_data = []

for M in sample_sizes:
    x_sample, y_sample, wei = sample_k_means(M, weighted=False)
    model = oneNNClassifier(x_sample, y_sample, weights=wei)

    # Time one prediction pass over the test set; the predictions are read back
    # from storage[M] afterwards.
    prediction_timer = timeit.Timer(
        lambda: model.predict(x_test, size=M, storage=storage, weighted=False)
    )
    elapsed_time = prediction_timer.timeit(number=1)
    accuracy = accuracy_score(y_test, storage[M])

    print(f"Sample size: {M}, Accuracy: {accuracy:.2f}, Execution time: {elapsed_time:.4f} seconds")
    execution_data.append(dict(sample_size=M, time=elapsed_time, accuracy=accuracy))

Sample size: 100, Accuracy: 0.69, Execution time: 2.3682 seconds
Sample size: 500, Accuracy: 0.76, Execution time: 11.8547 seconds
Sample size: 1000, Accuracy: 0.78, Execution time: 23.2156 seconds
Sample size: 2000, Accuracy: 0.81, Execution time: 53.0565 seconds
Sample size: 5000, Accuracy: 0.85, Execution time: 138.8276 seconds
Sample size: 10000, Accuracy: 0.88, Execution time: 271.3961 seconds


In [14]:
# Persist the unweighted sweep records (sample size, time, accuracy) as indented JSON.
with open("execution_data_vector_mean.json", "w") as file:
    file.write(json.dumps(execution_data, indent=4))

In [15]:
# Weighted sweep: same M/10-per-label selection as the unweighted run, but the similarity
# values are passed to the model as weights and predict is called with weighted=True
# (per the original note: distance inversely proportional to similarity).

# Alternative sweep kept for reference: [10, 20, 30, 40, 50]
sample_sizes = [100, 500, 1000, 2000, 5000, 10000]
storage = {}
execution_data = []

for M in sample_sizes:
    x_sample, y_sample, wei = sample_k_means(M, weighted=True)
    model = oneNNClassifier(x_sample, y_sample, weights=wei)

    # Time one prediction pass over the test set; the predictions are read back
    # from storage[M] afterwards.
    prediction_timer = timeit.Timer(
        lambda: model.predict(x_test, size=M, storage=storage, weighted=True)
    )
    elapsed_time = prediction_timer.timeit(number=1)
    accuracy = accuracy_score(y_test, storage[M])

    print(f"Sample size: {M}, Accuracy: {accuracy:.2f}, Execution time: {elapsed_time:.4f} seconds")
    execution_data.append(dict(sample_size=M, time=elapsed_time, accuracy=accuracy))

Sample size: 100, Accuracy: 0.69, Execution time: 2.6769 seconds
Sample size: 500, Accuracy: 0.76, Execution time: 15.3395 seconds
Sample size: 1000, Accuracy: 0.78, Execution time: 27.5810 seconds
Sample size: 2000, Accuracy: 0.80, Execution time: 53.2296 seconds
Sample size: 5000, Accuracy: 0.83, Execution time: 141.1817 seconds
Sample size: 10000, Accuracy: 0.86, Execution time: 290.0819 seconds


In [16]:
# Persist the weighted sweep records (sample size, time, accuracy) as indented JSON.
with open("execution_data_vector_mean_weighted.json", "w") as file:
    file.write(json.dumps(execution_data, indent=4))