First, we download and preprocess the dataset. CIFAR-10 is a well-known image classification dataset that contains 60,000 32x32 color images in 10 classes, with 6,000 images per class (50,000 images for training and 10,000 for testing).

In [1]:
import numpy as np
import keras
from keras.datasets import cifar10
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Download the CIFAR-10 dataset (fetched on first use) and prepare it for k-NN.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Rescale pixel intensities from the 0-255 byte range to floats in [0, 1].
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

# One-hot encode the 10 class labels. The later cells recover the integer
# class ids with np.argmax(..., axis=1) before handing them to scikit-learn.
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

# Flatten every image into a single feature vector (one row per image).
# The row count is taken from the array itself rather than hard-coded, so the
# cell keeps working if the data is subsampled or a different split is loaded.
x_train = x_train.reshape((x_train.shape[0], -1))
x_test = x_test.reshape((x_test.shape[0], -1))


Next, we can perform k-fold cross-validation to estimate the performance of the k-NN models with two different distance metrics: Manhattan (p=1) and Euclidean (p=2).

In [2]:
# Two 5-nearest-neighbour classifiers that differ only in the distance used:
# p=1 selects the Manhattan distance, p=2 the Euclidean distance.
knn_manhattan = KNeighborsClassifier(p=1, n_neighbors=5)
knn_euclidean = KNeighborsClassifier(p=2, n_neighbors=5)

# 5-fold cross-validation on the CIFAR-10 training set. The targets are stored
# one-hot, so they are converted back to integer class ids for scikit-learn.
cv_scores_manhattan = cross_val_score(
    estimator=knn_manhattan,
    X=x_train,
    y=np.argmax(y_train, axis=1),
    cv=5,
)
cv_scores_euclidean = cross_val_score(
    estimator=knn_euclidean,
    X=x_train,
    y=np.argmax(y_train, axis=1),
    cv=5,
)


After cross-validation, we can compute the mean and standard deviation of the cross-validation scores.

In [3]:
# Report the per-fold scores, their mean and their standard deviation,
# first for the Manhattan classifier and then for the Euclidean one.
for metric_name, fold_scores in (
    ('Manhattan', cv_scores_manhattan),
    ('Euclidean', cv_scores_euclidean),
):
    print('Cross-validation scores (%s distance matrix):' % metric_name, fold_scores)
    print('Mean cross-validation score (%s distance matrix):' % metric_name, np.mean(fold_scores))
    print(
        'Standard deviation of the cross-validation scores (%s distance matrix):' % metric_name,
        np.std(fold_scores),
    )


Cross-validation scores (Manhattan distance matrix): [0.3737 0.3612 0.3695 0.3581 0.3575]
Mean cross-validation score (Manhattan distance matrix): 0.364
Standard deviation of the cross-validation scores (Manhattan distance matrix): 0.006469003014375552
Cross-validation scores (Euclidean distance matrix): [0.3326 0.3286 0.337  0.3317 0.3304]
Mean cross-validation score (Euclidean distance matrix): 0.33205999999999997
Standard deviation of the cross-validation scores (Euclidean distance matrix): 0.0028125433329995154


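Finally, we can plot the mean cross-validation score and the test-set accuracy (labelled "cross-accuracy" in the plot) of the k-NN models as a function of the number of neighbors k, for k from 1 to 10. The cell also prints the value of k that gives the best cross-validation score for each distance metric.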

In [None]:
import warnings

import matplotlib.pyplot as plt

# Sweep the number of neighbors k for both distance settings (p=1 Manhattan,
# p=2 Euclidean). For each k this records the mean 5-fold cross-validation
# score on the training set and the accuracy on the held-out test set, then
# plots all four curves and prints the k with the best cross-validation score.

# The targets are stored one-hot; recover the integer class ids once instead
# of recomputing the argmax on every iteration.
train_labels = np.argmax(y_train, axis=1)
test_labels = np.argmax(y_test, axis=1)

# Single source of truth for the k values, shared by the loop and the plots.
k_values = range(1, 11)

cv_scores_manhattan = []
cv_scores_euclidean = []
accuracies_manhattan = []
accuracies_euclidean = []
for k in k_values:
    knn_manhattan = KNeighborsClassifier(n_neighbors=k, p=1)
    knn_euclidean = KNeighborsClassifier(n_neighbors=k, p=2)

    # Cross-validation is best-effort: a failure for one k must not abort the
    # whole sweep. Only ordinary errors are caught (so Ctrl-C still interrupts
    # the run), and each failure is reported instead of being silently hidden.
    try:
        cv_score_manhattan = np.mean(cross_val_score(knn_manhattan, x_train, train_labels, cv=5))
    except Exception as exc:
        warnings.warn(
            f"Cross-validation failed for Manhattan distance with k={k}: {exc!r}; "
            "recording a score of 0"
        )
        cv_score_manhattan = 0
    try:
        cv_score_euclidean = np.mean(cross_val_score(knn_euclidean, x_train, train_labels, cv=5))
    except Exception as exc:
        warnings.warn(
            f"Cross-validation failed for Euclidean distance with k={k}: {exc!r}; "
            "recording a score of 0"
        )
        cv_score_euclidean = 0
    cv_scores_manhattan.append(cv_score_manhattan)
    cv_scores_euclidean.append(cv_score_euclidean)

    # Fit on the full training set and measure accuracy on the test set.
    knn_manhattan.fit(x_train, train_labels)
    knn_euclidean.fit(x_train, train_labels)
    y_pred_manhattan = knn_manhattan.predict(x_test)
    y_pred_euclidean = knn_euclidean.predict(x_test)
    accuracy_manhattan = np.mean(test_labels == y_pred_manhattan)
    accuracy_euclidean = np.mean(test_labels == y_pred_euclidean)
    accuracies_manhattan.append(accuracy_manhattan)
    accuracies_euclidean.append(accuracy_euclidean)

# Map the position of the best mean cross-validation score back to its k
# (equivalent to index + 1, because k_values starts at 1).
best_k_manhattan = k_values[int(np.argmax(cv_scores_manhattan))]
best_k_euclidean = k_values[int(np.argmax(cv_scores_euclidean))]

# NOTE: the "Cross-accuracy" curves are the test-set accuracies computed above.
plt.plot(k_values, cv_scores_manhattan, label='Cross-validation (Manhattan distance matrix)')
plt.plot(k_values, cv_scores_euclidean, label='Cross-validation (Euclidean distance matrix)')
plt.plot(k_values, accuracies_manhattan, label='Cross-accuracy (Manhattan distance matrix)')
plt.plot(k_values, accuracies_euclidean, label='Cross-accuracy (Euclidean distance matrix)')
plt.title('Performance of k-NN on CIFAR-10 dataset')
plt.xlabel('Number of neighbors (k)')
plt.ylabel('Score')
plt.legend()
plt.show()

print(f"Best k for Manhattan distance matrix: {best_k_manhattan}")
print(f"Best k for Euclidean distance matrix: {best_k_euclidean}")



This code prints the values of k that correspond to the best cross-validation scores for the Manhattan and Euclidean distance metrics. These values are obtained by finding the index of the maximum mean cross-validation score in the lists cv_scores_manhattan and cv_scores_euclidean, and adding 1 to obtain the corresponding value of k (list indices start at 0, while k starts at 1).