In [1]:
#method 1
import os
import cv2
import numpy as np
from scipy.spatial.distance import cdist

# path
pie_path = r'D:\University\NUS\EE5907\PIE'
self_path = r'D:\University\NUS\EE5907\self'

data = []
labels = []

# load 25 subjects and self
count = 0
selected_subjects = set()
images_per_subject = 20

while count < 500:
    subject_folder = np.random.choice(os.listdir(pie_path))
    if subject_folder not in selected_subjects:
        selected_subjects.add(subject_folder)
        subject_path = os.path.join(pie_path, subject_folder)
        if os.path.isdir(subject_path):
            image_files = os.listdir(subject_path)
            image_files = np.random.choice(image_files, min(images_per_subject, len(image_files)), replace=False)
            for image_file in image_files:
                if count >= 500:
                    break
                image_path = os.path.join(subject_path, image_file)
                img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
                img_flat = img.flatten()
                data.append(img_flat)
                labels.append(subject_folder)
                count += 1

for image_file in os.listdir(self_path):
    image_path = os.path.join(self_path, image_file)
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img_flat = img.flatten()
    data.append(img_flat)
    labels.append("self")

data = np.array(data)
labels = np.array(labels)

data_standardized = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
cov_matrix = np.cov(data_standardized.T)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

eigenvalues = eigenvalues.real
eigenvectors = eigenvectors.real

# 
top_eigenvectors_40 = eigenvectors[:, :40]
top_eigenvectors_80 = eigenvectors[:, :80]
top_eigenvectors_200 = eigenvectors[:, :200]

data_pca_40 = np.dot(data_standardized, top_eigenvectors_40)
data_pca_80 = np.dot(data_standardized, top_eigenvectors_80)
data_pca_200 = np.dot(data_standardized, top_eigenvectors_200)

# divide train and test set
train_ratio = 0.7
train_size = int(train_ratio * data.shape[0])
train_data, test_data = data_standardized[:train_size, :], data_standardized[train_size:, :]
train_labels, test_labels = labels[:train_size], labels[train_size:]


label_mapping = {label: i for i, label in enumerate(np.unique(train_labels))}
inverse_label_mapping = {i: label for label, i in label_mapping.items()}

# KNN
def knn_predict(train_data, train_labels, test_data):
    distances = cdist(test_data, train_data, 'euclidean')
    nearest_neighbors = np.argsort(distances, axis=1)[:, :3]
    
    train_labels_int = np.array([label_mapping[label] for label in train_labels])
    predicted_labels = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=train_labels_int[nearest_neighbors])
    
    predicted_labels = np.array([inverse_label_mapping[i] for i in predicted_labels])
    
    return predicted_labels

# test
for data_pca, dim in zip([data_pca_40, data_pca_80, data_pca_200], [40, 80, 200]):
    print(f"\nTesting with {dim}-dimensional:")
    
    train_predicted_labels = knn_predict(data_pca[:train_size, :], train_labels, data_pca[:train_size, :])
    train_accuracy = np.mean(train_predicted_labels == train_labels)
    test_predicted_labels = knn_predict(data_pca[:train_size, :], train_labels, data_pca[train_size:, :])
    test_accuracy = np.mean(test_predicted_labels == test_labels)
    
    print(f"Train Accuracy = {train_accuracy:.2%}, Test Accuracy = {test_accuracy:.2%}")



Testing with 40-dimensional:
Train Accuracy = 71.71%, Test Accuracy = 0.65%

Testing with 80-dimensional:
Train Accuracy = 73.39%, Test Accuracy = 0.65%

Testing with 200-dimensional:
Train Accuracy = 74.23%, Test Accuracy = 0.65%


In [4]:
# method 2
import os
import cv2
import numpy as np
from scipy.spatial.distance import cdist

pie_path = r'D:\University\NUS\EE5907\PIE'
self_path = r'D:\University\NUS\EE5907\self'

data = []
labels = []

count = 0
selected_subjects = set()
images_per_subject = 20

while count < 500:
    subject_folder = np.random.choice(os.listdir(pie_path))
    if subject_folder not in selected_subjects:
        selected_subjects.add(subject_folder)
        subject_path = os.path.join(pie_path, subject_folder)
        if os.path.isdir(subject_path):
            image_files = os.listdir(subject_path)
            image_files = np.random.choice(image_files, images_per_subject, replace=False)
            for i, image_file in enumerate(image_files):
                image_path = os.path.join(subject_path, image_file)
                img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
                img_flat = img.flatten()
                data.append(img_flat)
                labels.append(subject_folder)
                count += 1
                if i >= 13:
                    break

# load self
self_images = os.listdir(self_path)
self_images = np.random.choice(self_images, 10, replace=False)
for i, image_file in enumerate(self_images):
    image_path = os.path.join(self_path, image_file)
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img_flat = img.flatten()
    data.append(img_flat)
    labels.append("self")
    if i >= 6:
        break

data = np.array(data)
labels = np.array(labels)

data_standardized = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
cov_matrix = np.cov(data_standardized.T)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

eigenvalues = eigenvalues.real
eigenvectors = eigenvectors.real

top_eigenvectors_40 = eigenvectors[:, :40]
top_eigenvectors_80 = eigenvectors[:, :80]
top_eigenvectors_200 = eigenvectors[:, :200]

data_pca_40 = np.dot(data_standardized, top_eigenvectors_40)
data_pca_80 = np.dot(data_standardized, top_eigenvectors_80)
data_pca_200 = np.dot(data_standardized, top_eigenvectors_200)

# label
unique_labels = np.unique(labels)
label_mapping = {label: i for i, label in enumerate(unique_labels)}
inverse_label_mapping = {i: label for label, i in label_mapping.items()}

# devide test set
train_data_indices = np.array([], dtype=int)
test_data_indices = np.array([], dtype=int)

for subject_folder in selected_subjects:
    subject_indices = np.where(labels == subject_folder)[0]
    train_data_indices = np.concatenate((train_data_indices, subject_indices[:14]))
    test_data_indices = np.concatenate((test_data_indices, subject_indices[14:]))

train_data_indices = np.concatenate((train_data_indices, np.where(labels == "self")[0][:7]))
test_data_indices = np.concatenate((test_data_indices, np.where(labels == "self")[0][7:]))

# knn
def knn_predict(train_data, train_labels, test_data):
    distances = cdist(test_data, train_data, 'euclidean')
    nearest_neighbors = np.argsort(distances, axis=1)[:, :3]
    
    train_labels_int = np.array([label_mapping[label] for label in train_labels])
    
    # labei
    predicted_labels = np.empty((test_data.shape[0],), dtype=train_labels_int.dtype)
    
    for i, neighbors in enumerate(nearest_neighbors):
        if len(neighbors) > 0:
            predicted_labels[i] = np.argmax(np.bincount(train_labels_int[neighbors]))
        else:
            predicted_labels[i] = -1 
    
    predicted_labels = np.array([inverse_label_mapping[i] for i in predicted_labels])
    
    return predicted_labels

# divide set
train_data = data_standardized[train_data_indices, :]
test_data = data_standardized[test_data_indices, :]
train_labels = labels[train_data_indices]
test_labels = labels[test_data_indices]

# test
for data_pca, dim in zip([data_pca_40, data_pca_80, data_pca_200], [40, 80, 200]):
    print(f"\nTesting with {dim}-dimensional PCA data:")
    
    train_predicted_labels = knn_predict(data_pca[train_data_indices, :], train_labels, data_pca[train_data_indices, :])
    train_accuracy = np.mean(train_predicted_labels == train_labels)
    
    test_predicted_labels = knn_predict(data_pca[train_data_indices, :], train_labels, data_pca[test_data_indices, :])
    test_accuracy = np.mean(test_predicted_labels == test_labels)
    
    print(f"Train Accuracy = {train_accuracy:.4f}, Test Accuracy = {test_accuracy:.4f}")



Testing with 40-dimensional PCA data:
Train Accuracy = 0.6164, Test Accuracy = 0.0000

Testing with 80-dimensional PCA data:
Train Accuracy = 0.6282, Test Accuracy = 0.0000

Testing with 200-dimensional PCA data:
Train Accuracy = 0.6419, Test Accuracy = 0.0000


  test_accuracy = np.mean(test_predicted_labels == test_labels)
