In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the MNIST training CSV into a DataFrame. The path is relative to the
# working directory, so the file is expected at ./data/mnist_train.csv.
mnist_df = pd.read_csv('data/mnist_train.csv')


In [14]:
# Column 0 holds the label; every remaining column is a pixel feature.
y = mnist_df.iloc[:, 0].values
X = mnist_df.iloc[:, 1:].values

# Lay the features out as (n_samples, 784). This raises if the frame does not
# carry exactly 28 * 28 pixel columns per row.
X_reshaped = X.reshape((len(X), 28 * 28))

# Work with just the first 1,000 rows to keep the pure-Python KNN affordable.
X_sample, y_sample = X_reshaped[:1000], y[:1000]

In [15]:
# Hold out the trailing 10% of the sample for validation; the leading 90% is
# the training split. Rows are taken in file order (no shuffling).
split_index = int(0.9 * len(X_sample))
X_train, X_val = X_sample[:split_index], X_sample[split_index:]
y_train, y_val = y_sample[:split_index], y_sample[split_index:]

print(f"Training set size: {X_train.shape}, Validation set size: {X_val.shape}")

Training set size: (900, 784), Validation set size: (100, 784)


In [32]:
from collections import Counter

# Function to calculate Euclidean distance
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x1, x2 : array-like
        Numeric vectors of the same (or broadcast-compatible) shape.

    Returns
    -------
    numpy.float64
        sqrt(sum((x1 - x2) ** 2)).

    Notes
    -----
    Both inputs are converted to float64 before subtracting. Without that,
    unsigned integer inputs (e.g. uint8 pixel data) wrap around on
    subtraction and on squaring, which silently yields wrong distances.
    The conversion also lets plain Python lists be passed in.
    """
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

# KNN classifier
def simple_knn(X_train, y_train, x_test, k=3):
    """Classify one sample by majority vote among its k nearest training points.

    Distances are Euclidean. When several training points are equally far
    away they keep their original order, and when several labels are equally
    frequent among the neighbours the label seen first (i.e. belonging to the
    closest of them) wins.

    Parameters
    ----------
    X_train : array-like, one row per training sample
    y_train : sequence of labels, same length as X_train
    x_test : array-like, a single sample with the same features as a row of X_train
    k : int, number of neighbours that vote (default 3)

    Returns
    -------
    The winning label, taken unchanged from ``y_train``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or
        ``X_train`` and ``y_train`` differ in length.
    """
    if k < 1:
        # A non-positive k would otherwise slice an empty (k == 0) or nearly
        # complete (k < 0) neighbour list.
        raise ValueError(f"k must be at least 1, got {k}")
    n_train = len(X_train)
    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(y_train) != n_train:
        raise ValueError(
            f"X_train has {n_train} samples but y_train has {len(y_train)} labels"
        )

    # float64 keeps unsigned pixel data from wrapping around on subtraction;
    # flattening each sample lets scalar or image-shaped rows work too.
    train = np.asarray(X_train, dtype=np.float64).reshape(n_train, -1)
    query = np.asarray(x_test, dtype=np.float64).reshape(-1)

    # All distances in one vectorised pass instead of a Python-level loop.
    distances = np.sqrt(np.sum((train - query) ** 2, axis=1))

    # A stable sort keeps equally distant points in training order.
    nearest = np.argsort(distances, kind='stable')[:k]

    # Index y_train directly so the returned label keeps its original type.
    labels = [y_train[i] for i in nearest]
    return Counter(labels).most_common(1)[0][0]

# Testing KNN with validation set
def predict_knn(X_train, y_train, X_val, k=3):
    """Predict one label per sample in X_val.

    Each sample is classified independently by ``simple_knn`` against the
    given training data, using ``k`` neighbours. The predictions are returned
    as a list in the same order as the samples of ``X_val``.
    """
    return [simple_knn(X_train, y_train, sample, k=k) for sample in X_val]


In [17]:
# Make predictions on the validation set.
# k is a notebook-level variable and is passed through to predict_knn; other
# cells in this notebook assign k again, so its value depends on run order.
k = 3
y_pred = predict_knn(X_train, y_train, X_val, k=k)

# Calculate accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : sequence of true labels
    y_pred : sequence of predicted labels, same length as ``y_true``

    Returns
    -------
    float
        Number of matching positions divided by ``len(y_true)``.

    Raises
    ------
    ValueError
        If the two sequences differ in length (previously extra predictions
        were silently ignored and missing ones raised IndexError), or if they
        are empty (previously a ZeroDivisionError).
    """
    total = len(y_true)
    if len(y_pred) != total:
        raise ValueError(
            f"y_true has {total} labels but y_pred has {len(y_pred)} predictions"
        )
    if total == 0:
        raise ValueError("cannot compute accuracy for an empty set of labels")

    correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return correct / total

# Score the validation predictions against y_val and print the result as a
# percentage with two decimals.
acc = accuracy(y_val, y_pred)
print(f"Validation Accuracy: {acc * 100:.2f}%")


Validation Accuracy: 88.00%


In [38]:
# Test part
# Load the test dataset (path is relative to the working directory)
test_mnist_df = pd.read_csv('data/mnist_test.csv')

# Column 0 is the label, the remaining columns are pixels. X_test keeps every
# row; only the labels are cut to the first 1,000 here.
X_test = test_mnist_df.iloc[:, 1:].values
y_test = test_mnist_df.iloc[:, 0].values[:1000]

# Reshape the test set (flatten 28x28 images into vectors of size 784) and
# keep the first 1,000 rows so the features line up with y_test.
X_test_reshaped = X_test.reshape(X_test.shape[0], 28 * 28)[:1000]

# NOTE(review): k is 4 here while the validation run above used k = 3 --
# confirm which value is intended for the reported test accuracy.
k = 4
y_test_pred = predict_knn(X_train, y_train, X_test_reshaped, k=k)

test_accuracy = accuracy(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 82.40%


In [26]:
# Fold the validation split back into the training data: rows are appended
# along the sample axis, labels along their single axis.
X_train_final = np.concatenate((X_train, X_val), axis=0)
y_train_final = np.concatenate((y_train, y_val))

print(f"Final training set size: {X_train_final.shape}")

Final training set size: (1000, 784)


In [27]:
def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between two vectors.

    Parameters
    ----------
    x1, x2 : array-like
        Numeric vectors of the same (or broadcast-compatible) shape.

    Returns
    -------
    numpy scalar
        sum(|x1 - x2|). Integer inputs still give an integer result;
        floating-point inputs give a float.

    Notes
    -----
    The inputs are promoted to at least signed 64-bit before subtracting.
    Without that, unsigned integer inputs (e.g. uint8 pixel data) wrap around
    on subtraction and the result is silently wrong. The conversion also lets
    plain Python lists be passed in.
    """
    a = np.asarray(x1)
    b = np.asarray(x2)
    # Common dtype that is no narrower than int64, so differences can go negative.
    wide = np.result_type(a.dtype, b.dtype, np.int64)
    return np.sum(np.abs(a.astype(wide) - b.astype(wide)))

In [34]:
# KNN classifier with an option for distance metric
def knn(X_train, y_train, x_test, k=3, distance_metric='euclidean'):
    """Classify one sample by majority vote among its k nearest training points.

    When several training points are equally far away they keep their
    original order, and when several labels are equally frequent among the
    neighbours the label seen first (i.e. belonging to the closest of them)
    wins.

    Parameters
    ----------
    X_train : array-like, one row per training sample
    y_train : sequence of labels, same length as X_train
    x_test : array-like, a single sample with the same features as a row of X_train
    k : int, number of neighbours that vote (default 3)
    distance_metric : {'euclidean', 'manhattan'}
        Distance used to rank the training points (default 'euclidean').

    Returns
    -------
    The winning label, taken unchanged from ``y_train``.

    Raises
    ------
    ValueError
        If ``distance_metric`` is not one of the supported names (previously
        this surfaced as an UnboundLocalError inside the loop), if ``k`` is
        smaller than 1, if the training set is empty, or if ``X_train`` and
        ``y_train`` differ in length.
    """
    if distance_metric not in ('euclidean', 'manhattan'):
        raise ValueError(
            f"Unsupported distance_metric {distance_metric!r}; "
            "expected 'euclidean' or 'manhattan'"
        )
    if k < 1:
        # A non-positive k would otherwise slice an empty (k == 0) or nearly
        # complete (k < 0) neighbour list.
        raise ValueError(f"k must be at least 1, got {k}")
    n_train = len(X_train)
    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(y_train) != n_train:
        raise ValueError(
            f"X_train has {n_train} samples but y_train has {len(y_train)} labels"
        )

    # float64 keeps unsigned pixel data from wrapping around on subtraction;
    # flattening each sample lets scalar or image-shaped rows work too.
    train = np.asarray(X_train, dtype=np.float64).reshape(n_train, -1)
    query = np.asarray(x_test, dtype=np.float64).reshape(-1)
    diffs = train - query

    # All distances in one vectorised pass instead of a Python-level loop.
    if distance_metric == 'euclidean':
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))
    else:
        distances = np.sum(np.abs(diffs), axis=1)

    # A stable sort keeps equally distant points in training order.
    nearest = np.argsort(distances, kind='stable')[:k]

    # Index y_train directly so the returned label keeps its original type.
    labels = [y_train[i] for i in nearest]
    return Counter(labels).most_common(1)[0][0]

def predict_knn(X_train, y_train, X_val, k=3, distance_metric='euclidean'):
    """Predict one label per sample in X_val.

    Each sample is classified independently by ``knn`` against the given
    training data, using ``k`` neighbours and the requested
    ``distance_metric``. The predictions are returned as a list in the same
    order as the samples of ``X_val``.
    """
    return [
        knn(X_train, y_train, sample, k=k, distance_metric=distance_metric)
        for sample in X_val
    ]

In [35]:
# Classify the test rows once per metric (Euclidean first, then Manhattan),
# always with k = 3 and the recombined training data.
y_test_pred_euclidean, y_test_pred_manhattan = [
    predict_knn(X_train_final, y_train_final, X_test_reshaped, k=3, distance_metric=metric)
    for metric in ('euclidean', 'manhattan')
]

# Score each set of predictions against the same test labels.
accuracy_euclidean, accuracy_manhattan = [
    accuracy(y_test, predictions)
    for predictions in (y_test_pred_euclidean, y_test_pred_manhattan)
]

print(f"Test Accuracy (Euclidean Distance): {accuracy_euclidean * 100:.2f}%")
print(f"Test Accuracy (Manhattan Distance): {accuracy_manhattan * 100:.2f}%")


Test Accuracy (Euclidean Distance): 85.00%
Test Accuracy (Manhattan Distance): 83.33%


In [36]:
# Part (c)

# Experimenting with different values of k (1 through 10), scoring both
# distance metrics on the test subset for each value.
# NOTE(review): k is being compared on the test data here; picking k from
# these numbers makes the reported test accuracy optimistic. Consider doing
# the comparison on the validation split instead.
for k in range(1, 11):
    y_test_pred_euclidean = predict_knn(X_train_final, y_train_final, X_test_reshaped, k=k, distance_metric='euclidean')
    y_test_pred_manhattan = predict_knn(X_train_final, y_train_final, X_test_reshaped, k=k, distance_metric='manhattan')
    
    acc_euclidean = accuracy(y_test, y_test_pred_euclidean)
    acc_manhattan = accuracy(y_test, y_test_pred_manhattan)

    # The loop could stop early once accuracy starts to decrease.
    # NOTE(review): in KNN a larger k gives a smoother, less flexible model, so
    # a drop at large k points to underfitting; small k is the overfit-prone end.
    
    print(f"k={k}: Euclidean Accuracy = {acc_euclidean * 100:.2f}%, Manhattan Accuracy = {acc_manhattan * 100:.2f}%")



k=1: Euclidean Accuracy = 83.67%, Manhattan Accuracy = 82.67%
k=2: Euclidean Accuracy = 83.67%, Manhattan Accuracy = 82.67%
k=3: Euclidean Accuracy = 85.00%, Manhattan Accuracy = 83.33%
k=4: Euclidean Accuracy = 86.00%, Manhattan Accuracy = 83.33%
k=5: Euclidean Accuracy = 85.33%, Manhattan Accuracy = 82.00%
k=6: Euclidean Accuracy = 85.33%, Manhattan Accuracy = 82.00%
k=7: Euclidean Accuracy = 83.33%, Manhattan Accuracy = 82.00%
k=8: Euclidean Accuracy = 84.33%, Manhattan Accuracy = 81.67%
k=9: Euclidean Accuracy = 83.00%, Manhattan Accuracy = 80.00%
k=10: Euclidean Accuracy = 83.67%, Manhattan Accuracy = 80.00%


In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Full train and test arrays: column 0 is the label, the rest are pixels.
# NOTE: this rebinds X_train / y_train (the 900-row split used by the cells
# above) and y_test (previously the first 1,000 labels) to the complete data,
# so re-running earlier cells after this one changes their inputs.
X_train, y_train = mnist_df.iloc[:, 1:].values, mnist_df.iloc[:, 0].values
X_test, y_test = test_mnist_df.iloc[:, 1:].values, test_mnist_df.iloc[:, 0].values

# The feature arrays are already two-dimensional, so no reshape is applied.

# --- Euclidean distance, 3 neighbours ---
# fit() returns the estimator itself, so construction and fitting are chained.
knn_euclidean = KNeighborsClassifier(n_neighbors=3, metric='euclidean').fit(X_train, y_train)
y_test_pred_euclidean = knn_euclidean.predict(X_test)
accuracy_euclidean = accuracy_score(y_test, y_test_pred_euclidean)
print(f"Test Accuracy (Euclidean Distance): {accuracy_euclidean * 100:.2f}%")


# --- Manhattan distance, 3 neighbours ---
knn_manhattan = KNeighborsClassifier(n_neighbors=3, metric='manhattan').fit(X_train, y_train)
y_test_pred_manhattan = knn_manhattan.predict(X_test)
accuracy_manhattan = accuracy_score(y_test, y_test_pred_manhattan)
print(f"Test Accuracy (Manhattan Distance): {accuracy_manhattan * 100:.2f}%")


Test Accuracy (Euclidean Distance): 97.05%
Test Accuracy (Manhattan Distance): 96.33%
