In [52]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import mode

In [53]:
def load_mnist_data(input_file):
    data, meta = arff.loadarff(input_file)
    df = pd.DataFrame(data)
    df["class"] = df.apply(lambda x: x["class"].decode("utf-8"), axis=1)
    for col in df.columns[:-1]:
        df[col] = df[col].apply(lambda x: x / 255.0)
    return df[df.columns[:-1]].to_numpy(), df["class"].to_numpy()

In [54]:
# import numpy as np

# def pairwise_distance(mat):
#     n = len(mat)
#     dmat = np.zeros((n, n))
#     for i in range(n):
#         for j in range(i, n):
#             if j != i:
#                 dmat[i][j] = np.linalg.norm(mat[i] - mat[j])
#                 dmat[j][i] = dmat[i][j]  # Since the distance matrix is symmetric
#     return dmat


def euclidean(vec1, vec2):
    return np.sqrt(np.dot(vec1, vec1) - 2 * np.dot(vec1, vec2) + np.dot(vec2, vec2))

def pairwise_distance(mat):
    n = len(mat)
    dmat = np.zeros((n, n))
    for i in range(n):
        j = 0
        while j <= i:
            if j != i:
                dmat[i][j] = euclidean(mat[i], mat[j])
            j += 1
    return dmat + dmat.T

In [55]:
X, y = load_mnist_data("./datasets/mnist_784.arff")

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X[:20000], y[:20000], test_size=0.2, stratify=y[:20000], random_state=42)
X_train.shape

(16000, 784)

In [45]:
pairwise_distance(X[:5000])

array([[ 0.        ,  9.36122213, 10.87509456, ..., 10.15739646,
         9.3411419 , 10.54320344],
       [ 9.36122213,  0.        , 11.36836434, ..., 10.64657927,
        10.08647272, 11.48242326],
       [10.87509456, 11.36836434,  0.        , ..., 11.09412458,
         8.82878101, 10.4646212 ],
       ...,
       [10.15739646, 10.64657927, 11.09412458, ...,  0.        ,
        10.25524087,  8.97626586],
       [ 9.3411419 , 10.08647272,  8.82878101, ..., 10.25524087,
         0.        ,  8.78094035],
       [10.54320344, 11.48242326, 10.4646212 , ...,  8.97626586,
         8.78094035,  0.        ]])

In [57]:
# Computing pairwise distance
from sklearn.metrics.pairwise import euclidean_distances
dist_mat = euclidean_distances(X_train)
dist_mat

array([[ 0.        ,  9.9291353 , 11.38291144, ..., 10.01201508,
        10.55961855,  9.92242343],
       [ 9.9291353 ,  0.        ,  9.61035741, ...,  9.20687104,
        11.66160642,  8.49849796],
       [11.38291144,  9.61035741,  0.        , ..., 10.0875186 ,
        12.84913017,  9.2051003 ],
       ...,
       [10.01201508,  9.20687104, 10.0875186 , ...,  0.        ,
        11.16396066,  9.55097697],
       [10.55961855, 11.66160642, 12.84913017, ..., 11.16396066,
         0.        , 11.59539011],
       [ 9.92242343,  8.49849796,  9.2051003 , ...,  9.55097697,
        11.59539011,  0.        ]])

In [51]:
from sklearn.metrics import pairwise_distances
from scipy.stats import mode
import numpy as np

class KNNClassifier:
    def __init__(self, k):
        self.k = k
        self.train_x = None
        self.train_y = None

    def fit(self, X, y):
        self.train_x = X
        self.train_y = y

    def score(self, X, y):
        # Calculate pairwise distances between test and training data
        distances = euclidean_distances(X, self.train_x)
        
        count = 0
        for i in range(len(X)):
            # Sort distances and get indices of the k nearest neighbors
            indices = np.argsort(distances[i])[:self.k]
            
            # Get labels of the k nearest neighbors
            k_labels = self.train_y[indices]
            
            # Ensure k_labels are numeric
            k_labels_numeric = np.asarray(k_labels, dtype=float)

            # Calculate the mode of k_labels
            mode_result = mode(k_labels_numeric)

            # Check if the mode_result is not empty and is an array
            if isinstance(mode_result.mode, np.ndarray) and mode_result.mode.size > 0:
                # Predict the class label based on the mode
                pred = mode_result.mode[0]
            else:
                # Handle cases where mode is not as expected
                pred = -1  # An invalid class label indicating an issue

            # True label for the current test data point
            true_y = y[i]
            
            # Check if the predicted label matches the true label
            if pred == true_y:
                count += 1

        # Calculate accuracy as a percentage
        accuracy = (count / len(X)) * 100
        return accuracy

# Example usage
# knn_clf = KNNClassifier(k=10)
# knn_clf.fit(X_train, y_train)
# accuracy = knn_clf.score(X_test, y_test)
# print("Accuracy:", accuracy)
knn_clf = KNNClassifier(k=10)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)

0.0

In [60]:
class KNNClassifier:
    def __init__(self, k):
        self.k = k
        self.train_x = None
        self.train_y = None

    def fit(self, X, y):
        self.train_x = X
        self.train_y = y
        
    def score(self, X, y):
    # Calculate pairwise distances between test and training data
        distances = pairwise_distances(X, self.train_x)

        count = 0
        for i in range(len(X)):
            # Sort distances and get indices of the k nearest neighbors
            indices = np.argsort(distances[i])[:self.k]

            # Get labels of the k nearest neighbors
            k_labels = self.train_y[indices]

            # Ensure k_labels are numeric
            k_labels_numeric = np.asarray(k_labels, dtype=float)

            # Calculate the mode of k_labels
            mode_result = mode(k_labels_numeric)

            # Extract the most common label
            if isinstance(mode_result.mode, np.ndarray) and mode_result.mode.size > 0:
                pred = mode_result.mode[0]
            else:
                # Handle cases where mode is not as expected
                pred = -1  # An invalid class label or a default value

            # True label for the current test data point
            true_y = y[i]

            # Check if the predicted label matches the true label
            if pred == true_y:
                count += 1

        # Calculate accuracy as a percentage
        accuracy = (count / len(X)) * 100
        return accuracy


In [61]:
knn_clf = KNNClassifier(k=10)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)



IndexError: invalid index to scalar variable.