In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import accuracy_score
import time
from joblib import Parallel, delayed

# Fetch the MNIST digits from OpenML (dataset "mnist_784", version 1)
mnist = fetch_openml(name='mnist_784', version=1)

# Pull out features and targets; the targets are cast to int
X = mnist["data"]
y = mnist["target"].astype(int)


In [2]:
# Hold out 1000 samples as the test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, random_state=42
)

# Cast pixels to float64, divide by 255.0, then unwrap the pandas
# containers into plain NumPy arrays. Only y_train is unwrapped here;
# y_test is kept exactly as returned by the split.
X_train = (X_train.astype(np.float64) / 255.0).values
X_test = (X_test.astype(np.float64) / 255.0).values
y_train = y_train.values

# Report the dtypes after conversion
print(f"X_train dtype: {X_train.dtype}")
print(f"X_test dtype: {X_test.dtype}")
print(f"y_train dtype: {y_train.dtype}")

# Sanity check: look for NaN or infinite entries in the feature arrays
print(f"NaN values in X_train: {np.any(np.isnan(X_train))}")
print(f"NaN values in X_test: {np.any(np.isnan(X_test))}")
print(f"Infinite values in X_train: {np.any(np.isinf(X_train))}")
print(f"Infinite values in X_test: {np.any(np.isinf(X_test))}")

# Parallel KNN implementation
def knn_predict_parallel(X_train, y_train, X_test, k=3, n_jobs=-1):
    """Predict a label for every row of ``X_test`` with brute-force k-NN.

    For each test row, the squared Euclidean distance to every training row
    is computed, the ``k`` closest training rows are selected, and the most
    common label among them is returned. If several labels share the top
    count, the label that appears first in nearest-to-farthest order wins
    (``Counter.most_common`` keeps first-encountered order for equal counts).

    Parameters
    ----------
    X_train : array-like, 2-D
        Training features, one sample per row.
    y_train : array-like, 1-D
        Training labels, aligned with the rows of ``X_train``.
    X_test : array-like, 2-D
        Samples to classify, one per row.
    k : int, default 3
        Number of neighbours that vote. Values larger than the number of
        training rows are clamped to that number.
    n_jobs : int, default -1
        Forwarded unchanged to ``joblib.Parallel``.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of ``X_test``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``X_train`` has no rows.
    """
    # Coerce to ndarrays so every lookup below is positional. A pandas
    # Series indexed with an integer array is looked up by *label*, and
    # iterating a DataFrame yields column names rather than rows.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")

    # Slicing past the end used to clamp silently; keep that behaviour explicit.
    n_neighbors = min(k, len(X_train))

    def predict_single(test_point):
        # Squared distances rank neighbours exactly like true Euclidean
        # distances (sqrt is monotonic), so the square root is skipped.
        sq_distances = np.sum((X_train - test_point) ** 2, axis=1)
        # Partial selection is O(n) instead of a full O(n log n) sort ...
        nearest_indices = np.argpartition(sq_distances, n_neighbors - 1)[:n_neighbors]
        # ... then only the selected few are ordered nearest-first so that
        # vote ties still resolve in favour of the closer neighbour.
        nearest_indices = nearest_indices[np.argsort(sq_distances[nearest_indices])]
        nearest_labels = y_train[nearest_indices]
        return Counter(nearest_labels).most_common(1)[0][0]  # majority vote

    predictions = Parallel(n_jobs=n_jobs)(
        delayed(predict_single)(test_point) for test_point in X_test
    )
    return np.array(predictions)

# Time only the prediction step. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, whereas
# time.time() follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
y_pred_parallel = knn_predict_parallel(X_train, y_train, X_test, k=3, n_jobs=-1)
end_time = time.perf_counter()
elapsed_time_parallel = end_time - start_time

# Score outside the timed region so the reported figure covers prediction alone
accuracy_parallel = accuracy_score(y_test, y_pred_parallel)

# Report accuracy (as a percentage) and the elapsed prediction time
print(f"Parallel KNN Accuracy: {accuracy_parallel * 100:.2f}%")
print(f"Time taken for prediction: {elapsed_time_parallel:.4f} seconds")

X_train dtype: float64
X_test dtype: float64
y_train dtype: int32
NaN values in X_train: False
NaN values in X_test: False
Infinite values in X_train: False
Infinite values in X_test: False
Parallel KNN Accuracy: 98.00%
Time taken for prediction: 130.6184 seconds
