In [52]:
import numpy as np
import pandas as pd
from scipy.io import arff
from scipy.stats import mode
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [53]:
def load_mnist_data(input_file):
    """Load an ARFF file with a ``class`` column into feature/label arrays.

    Parameters
    ----------
    input_file : str or file-like
        Passed unchanged to ``scipy.io.arff.loadarff``. The file must contain
        a ``class`` attribute whose values load as bytes (ARFF nominal);
        every other attribute is treated as a numeric feature.

    Returns
    -------
    features : numpy.ndarray
        Float array of shape ``(n_samples, n_features)`` holding every
        non-``class`` column divided by 255.0.
    labels : numpy.ndarray
        Array of shape ``(n_samples,)`` holding the ``class`` values decoded
        from UTF-8 bytes to ``str``.
    """
    data, _meta = arff.loadarff(input_file)
    df = pd.DataFrame(data)

    # Decode the whole label column in one vectorised call instead of running
    # a Python lambda over every row of the frame.
    labels = df["class"].str.decode("utf-8").to_numpy()

    # Pick features by name so the result does not depend on "class" being the
    # last column, and scale the full matrix in a single NumPy operation
    # rather than one lambda call per cell.
    features = df.drop(columns="class").to_numpy(dtype=float) / 255.0
    return features, labels

In [54]:
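# NOTE: earlier draft of pairwise_distance (one np.linalg.norm call per pair),
# kept commented out for reference only; it is not executed.
# import numpy as np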

# def pairwise_distance(mat):
#     n = len(mat)
#     dmat = np.zeros((n, n))
#     for i in range(n):
#         for j in range(i, n):
#             if j != i:
#                 dmat[i][j] = np.linalg.norm(mat[i] - mat[j])
#                 dmat[j][i] = dmat[i][j]  # Since the distance matrix is symmetric
#     return dmat


def euclidean(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length (NumPy arrays, lists or tuples of numbers).

    Returns
    -------
    numpy.float64
        ``sqrt(sum((vec1 - vec2) ** 2))``; never NaN for finite inputs.
    """
    # Work on the difference vector instead of the expanded form
    # a.a - 2 a.b + b.b: the expansion cancels catastrophically for nearby
    # vectors (it can round to 0 or even slightly below 0, giving NaN).
    # Casting to float first also stops small integer dtypes such as uint8
    # from wrapping around during the subtraction.
    diff = np.asarray(vec1, dtype=float) - np.asarray(vec2, dtype=float)
    return np.sqrt(np.dot(diff, diff))

def pairwise_distance(mat):
    """Compute the symmetric matrix of pairwise Euclidean distances.

    Parameters
    ----------
    mat : array-like
        Sequence of ``n`` samples; each sample is flattened to one row vector.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float array where entry ``[i, j]`` is the Euclidean
        distance between samples ``i`` and ``j``. The diagonal is zero and an
        empty input yields a ``(0, 0)`` array.
    """
    points = np.asarray(mat, dtype=float)
    n = len(points)
    # One row per sample; the explicit (0, 0) case avoids an ambiguous -1
    # reshape on empty input.
    points = points.reshape(n, -1) if n else points.reshape(0, 0)

    dmat = np.zeros((n, n))
    for i in range(1, n):
        # Distances from sample i to every earlier sample in one vectorised
        # call, instead of one Python-level function call per pair.
        row = np.linalg.norm(points[:i] - points[i], axis=1)
        # Fill both triangles directly; the matrix is symmetric.
        dmat[i, :i] = row
        dmat[:i, i] = row
    return dmat

In [55]:
# Load features (X) and labels (y) from the local ARFF file via load_mnist_data.
X, y = load_mnist_data("./datasets/mnist_784.arff")

In [56]:
# Work on the first N_SAMPLES rows only. Features and labels are sliced once,
# together, so the two can never drift out of sync.
N_SAMPLES = 20000
X_subset, y_subset = X[:N_SAMPLES], y[:N_SAMPLES]

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_subset,
    y_subset,
    test_size=0.2,
    stratify=y_subset,
    random_state=42,
)
X_train.shape

(16000, 784)

In [45]:
# Run the hand-written pairwise_distance on the first 5000 rows of X; the
# returned matrix is shown as the cell output.
pairwise_distance(X[:5000])

array([[ 0.        ,  9.36122213, 10.87509456, ..., 10.15739646,
         9.3411419 , 10.54320344],
       [ 9.36122213,  0.        , 11.36836434, ..., 10.64657927,
        10.08647272, 11.48242326],
       [10.87509456, 11.36836434,  0.        , ..., 11.09412458,
         8.82878101, 10.4646212 ],
       ...,
       [10.15739646, 10.64657927, 11.09412458, ...,  0.        ,
        10.25524087,  8.97626586],
       [ 9.3411419 , 10.08647272,  8.82878101, ..., 10.25524087,
         0.        ,  8.78094035],
       [10.54320344, 11.48242326, 10.4646212 , ...,  8.97626586,
         8.78094035,  0.        ]])

In [57]:
# Pairwise Euclidean distances between all training samples, computed with
# scikit-learn's euclidean_distances (imported in the notebook's import cell).
dist_mat = euclidean_distances(X_train)
dist_mat

array([[ 0.        ,  9.9291353 , 11.38291144, ..., 10.01201508,
        10.55961855,  9.92242343],
       [ 9.9291353 ,  0.        ,  9.61035741, ...,  9.20687104,
        11.66160642,  8.49849796],
       [11.38291144,  9.61035741,  0.        , ..., 10.0875186 ,
        12.84913017,  9.2051003 ],
       ...,
       [10.01201508,  9.20687104, 10.0875186 , ...,  0.        ,
        11.16396066,  9.55097697],
       [10.55961855, 11.66160642, 12.84913017, ..., 11.16396066,
         0.        , 11.59539011],
       [ 9.92242343,  8.49849796,  9.2051003 , ...,  9.55097697,
        11.59539011,  0.        ]])

In [73]:
# k-NN baseline on the MNIST dataset fetched from OpenML.
# NOTE: this cell rebinds X, y, X_train, X_test, y_train and y_test, replacing
# the values earlier cells built from the local ARFF file.
# All names used here are imported in the notebook's import cell.
N_NEIGHBORS = 10

# Load MNIST dataset
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]

# Convert labels to integers
y = y.astype(int)

# Normalize the pixel values
X = X / 255.0

# Split the dataset into a training set and a test set (fixed seed for
# reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier with k = N_NEIGHBORS
knn_clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# Fit the model
knn_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_clf.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9657857142857142
