# Image Classification Using CIFAR-10 Dataset: L1 vs L2 Distance with 5-Fold Cross-Validation


In [None]:
!pip install opencv-python-headless scikit-learn matplotlib

In [None]:

# Mount Google Drive at /content/drive so the dataset folder can be read from it.
# Requires the google.colab package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


In [None]:

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier


In [None]:

# Side length (in pixels) of the square image each file is resized to before flattening.
IMG_SIZE = 32
# Root folder expected to contain one sub-folder per class (see load_images).
DATASET_PATH = '/content/drive/MyDrive/cifar10_animals'  # Change if needed

def load_images(dataset_path, class_names=('cat', 'dog', 'panda'), max_per_class=1000):
    """Load images from per-class folders as flattened grayscale vectors.

    Each class is read from ``dataset_path/<class name>``. Directory entries are
    visited in sorted filename order so that the ``max_per_class`` subset is the
    same on every run; ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        dataset_path: Root directory containing one sub-folder per class.
        class_names: Sub-folder names to load. A class's position in this
            sequence is used as its integer label.
        max_per_class: Maximum number of directory entries considered per class
            (``None`` means no limit). Entries that OpenCV cannot decode are
            skipped, so fewer samples than this may be returned.

    Returns:
        A tuple ``(data, labels, class_names)``: ``data`` is an array holding one
        flattened ``IMG_SIZE * IMG_SIZE`` grayscale image per loaded file,
        ``labels`` the matching integer class indices, and ``class_names`` the
        class names as a list. Both arrays are empty if nothing was loaded.

    Raises:
        FileNotFoundError: If a class folder does not exist (raised by
            ``os.listdir``).
    """
    class_names = list(class_names)
    data, labels = [], []

    for label, category in enumerate(class_names):
        folder = os.path.join(dataset_path, category)
        # Sort before slicing so the selected subset is deterministic.
        for file_name in sorted(os.listdir(folder))[:max_per_class]:
            img = cv2.imread(os.path.join(folder, file_name))
            if img is None:
                # cv2.imread returns None for unreadable or non-image entries.
                continue
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(gray, (IMG_SIZE, IMG_SIZE))
            data.append(resized.flatten())
            labels.append(label)

    return np.array(data), np.array(labels), class_names

# Load the dataset once: X holds one flattened image per row, y the integer labels.
X, y, class_names = load_images(DATASET_PATH)
# Fail early with a clear message rather than an obscure indexing error in later cells.
if len(X) == 0:
    raise RuntimeError(f'No readable images found under {DATASET_PATH!r}')
print(f'Dataset shape: {X.shape}, Labels: {y.shape}')


In [None]:

def show_samples(X, y, n_samples=6):
    """Display the first ``n_samples`` images with their class names as titles.

    Args:
        X: Array of flattened grayscale images; each row is reshaped to
            ``IMG_SIZE x IMG_SIZE`` for display.
        y: Integer labels aligned with ``X``, used to index ``class_names``.
        n_samples: Maximum number of images to show (default 6). Fewer are
            shown when ``X`` has fewer rows; nothing is drawn when there are none.
    """
    count = min(n_samples, len(X))
    if count <= 0:
        # Nothing to draw; avoid creating an empty figure.
        return

    n_cols = 3
    n_rows = -(-count // n_cols)  # ceiling division; 6 samples -> a 2x3 grid

    # Two inches of height per row keeps the default 6-sample figure at (10, 4).
    plt.figure(figsize=(10, 2 * n_rows))
    for i in range(count):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(X[i].reshape(IMG_SIZE, IMG_SIZE), cmap='gray')
        plt.title(class_names[y[i]])
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# Preview the first few loaded images as a visual check of loading and labelling.
show_samples(X, y)


In [None]:

def evaluate_knn(X, y, distance_metric, k_values=range(1, 11), cv=5):
    """Cross-validate a k-NN classifier for several values of k.

    Args:
        X: Feature matrix, one sample per row.
        y: Class labels aligned with ``X``.
        distance_metric: Metric name passed to ``KNeighborsClassifier``
            (e.g. ``'manhattan'`` for L1 or ``'euclidean'`` for L2).
        k_values: Iterable of neighbour counts to evaluate (default 1..10).
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (default 5).

    Returns:
        A tuple ``(k_vals, avg_scores)``: the evaluated k values as a list and,
        in the same order, the mean of the cross-validation scores for each k.
    """
    k_vals = list(k_values)
    avg_scores = [
        np.mean(
            cross_val_score(
                KNeighborsClassifier(n_neighbors=k, metric=distance_metric),
                X, y, cv=cv,
            )
        )
        for k in k_vals
    ]
    return k_vals, avg_scores

# Run the same k sweep with both metrics. The list of k values does not depend on
# the metric, so the copy returned by the second call is discarded.
k_vals, acc_l1 = evaluate_knn(X, y, 'manhattan')
_, acc_l2 = evaluate_knn(X, y, 'euclidean')


In [None]:

# Compare mean cross-validated accuracy of the two distance metrics across k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_vals, acc_l1, marker='o', label='Manhattan (L1)')
ax.plot(k_vals, acc_l2, marker='s', label='Euclidean (L2)')
ax.set_xlabel('K Value')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs K (5-Fold Cross-Validation)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:

# Qualitative check: predict a handful of held-out images with a 3-NN (L2) model.
# The displayed images are excluded from the training set; otherwise each query
# would find itself among its nearest neighbours (distance 0), which biases the
# prediction toward the true label and makes the demo look better than it is.
N_DEMO = 5
rng = np.random.default_rng(42)  # fixed seed so the same images are shown on every run
demo_idx = rng.choice(len(X), size=min(N_DEMO, len(X)), replace=False)
train_mask = np.ones(len(X), dtype=bool)
train_mask[demo_idx] = False

knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn.fit(X[train_mask], y[train_mask])
test_images = X[demo_idx]
true_labels = y[demo_idx]
predicted = knn.predict(test_images)

# One figure per image, titled with the true and predicted class names.
for i in range(len(test_images)):
    plt.imshow(test_images[i].reshape(IMG_SIZE, IMG_SIZE), cmap='gray')
    plt.title(f"True: {class_names[true_labels[i]]} | Pred: {class_names[predicted[i]]}")
    plt.axis('off')
    plt.show()



## Discussion

The Euclidean (L2) distance generally gave better average accuracy than Manhattan (L1), especially for K values between 3 and 5. This suggests L2 may be better at capturing subtle image differences.

The model’s performance may improve with:
- Feature scaling
- Dimensionality reduction (e.g., PCA)
- Using CNNs for feature extraction

K values that are too high can reduce sensitivity to local patterns.
