In [39]:
import scipy as sp 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import tensorflow as tf 

In [15]:
(trn_images, trn_labels), (tst_images, tst_labels) = tf.keras.datasets.mnist.load_data()

In [16]:
def plot_image(image):
    plt.imshow(image, cmap='gray')
    plt.axis('off')
    plt.show()


In [17]:
# Preprocessing

# Reshape the images to 1D
trn_images = trn_images.flatten().reshape(60000, 784)
tst_images = tst_images.flatten().reshape(10000, 784)

# Normalize the images
trn_images = trn_images / 255.0
tst_images = tst_images / 255.0


In [19]:
def pca(X, k):
    X = X - np.mean(X, axis=0)              # Center data by subtracting mean of each feature
    cov = np.cov(X, rowvar=False)           # Compute the covariance matrix
    eigvals, eigvecs = np.linalg.eigh(cov)  # Compute the eigenvectors & eigenvalues of covariance matrix

    # Sort the eigenvectors by decreasing eigenvalues
    idx = np.argsort(eigvals)[::-1]
    eigvecs = eigvecs[:, idx]

    # Select the top k eigenvectors
    eigvecs = eigvecs[:, :k]

    # Project the data onto the eigenvectors
    X_pca = np.dot(X, eigvecs)
    return X_pca

In [None]:
# Different distance metrics
def eucledian_distance(x1, x2):
    return np.linalg.norm(x1 - x2, axis=1)

def manhatten_distance(x1, x2):
    return np.sum(np.abs(x1 - x2), axis=1)

def cosine_similarity(x1, x2):
    return np.dot(x1, x2.T) / (np.linalg.norm(x1) * np.linalg.norm(x2, axis=1))

def hamming_distance(x1, x2):
    return np.sum(x1 != x2, axis=1)

In [None]:
def knn(x, k, trn_images, trn_labels, dist_fn):
    distances = dist_fn(trn_images, x)
    idx = np.argsort(distances)
    k_labels = trn_labels[idx[:k]]
    counts = np.bincount(k_labels)
    return np.argmax(counts)

In [9]:
y_pred = np.zeros(len(tst_images))

for i in range(len(tst_images)):
    y_pred[i] = knn(tst_images[i], trn_images, trn_labels, 3)

In [10]:
accuracy = np.sum(y_pred == tst_labels) / len(tst_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 97.05%


In [41]:
from sklearn.decomposition import PCA

pca = PCA(n_components=.95)
pca.fit(trn_images)

print(f'Total number of components used after PCA : {pca.n_components_}')

train_img = pca.transform(trn_images)
test_img = pca.transform(tst_images)

print(f'train_img shape : {train_img.shape}')
print(f'test_img shape : {test_img.shape}')


Total number of components used after PCA : 154
train_img shape : (60000, 154)
test_img shape : (10000, 154)


In [None]:
y_pred = np.zeros(len(tst_images))

for i in range(len(tst_images)):
    y_pred[i] = knn(tst_images[i], trn_images, trn_labels, 3)

In [None]:
accuracy = np.sum(y_pred_pca == tst_labels) / len(tst_labels)
print(f"PCA k-NN Accuracy: {accuracy * 100:.2f}%")

PCA k-NN Accuracy: 5.05%
