In [13]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import fetch_olivetti_faces
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import cv2
import random
from skimage.color import rgb2gray
import pickle
import os
dirname = os.path.abspath('')
datasets_path = os.path.join(dirname, 'Datasets')
olivetti_path = os.path.join(datasets_path, 'Olivetti')
parameters_path = os.path.join(dirname, 'PCA Parameters\Olivetti')

In [14]:
# ---------------------------- LOAD OLIVIETTA DATASET ---------------------------- #

lfw_people = fetch_olivetti_faces(data_home=olivetti_path, shuffle=True, random_state=47)

In [15]:
images = lfw_people.images
print('Images shape:',images.shape)

total_images, height, width = images.shape
print(f'Each image has size: {height} x {width}')

print(70* '-')

n_features = height*width
print(f'N^2 = n_features = h x w = {n_features}')

print(70* '-')

y = lfw_people.target
print('y has shape:', y.shape)

num_people = np.max(y) + 1
print('Number of people =', num_people)

Images shape: (400, 64, 64)
Each image has size: 64 x 64
----------------------------------------------------------------------
N^2 = n_features = h x w = 4096
----------------------------------------------------------------------
y has shape: (400,)
Number of people = 40


## Train Test Split

In [16]:
#----------------------- CREATE PEOPLE DICTIONARY -----------------------#

# Key: person ID
# Value: List of all person images indices
person_image_dict = dict()

for image_index in range(total_images):
    if (y[image_index] not in person_image_dict.keys()):
        person_image_dict[y[image_index]] = [image_index]
    else:
        person_image_dict[y[image_index]].append(image_index)

In [17]:
train_images = []
test_images = []
y_train = []
y_test = []

# guarantee that each person has at least 1 image in test set
for image_index_lst in person_image_dict.values():
    total_size = len(image_index_lst)
    size_train = int(0.8 * total_size)
    for index in range(len(image_index_lst)):
        image = images[image_index_lst[index]]
        label = y[image_index_lst[index]]
        if (index < size_train):
            train_images.append(image)
            y_train.append(label)
        else:
            test_images.append(image)
            y_test.append(label)
    

train_images = np.array(train_images)
test_images = np.array(test_images)
y_train = np.array(y_train)
y_test = np.array(y_test)

print('Train images shape:', train_images.shape)
print('Test images shape:', test_images.shape)

m = train_images.shape[0]
m_test = test_images.shape[0]
print(70* '-')
print('Number of train images:', m)
print('Number of test images:', m_test)


Train images shape: (320, 64, 64)
Test images shape: (80, 64, 64)
----------------------------------------------------------------------
Number of train images: 320
Number of test images: 80


In [18]:
images = lfw_people.images
print('Images shape:',images.shape)

total_images, height, width = images.shape
print(f'Each image has size: {height} x {width}')

print(70* '-')

m = int(total_images * 0.8)
print('M = Number of examples = ', m)

m_test = total_images - m
print('M_test = Number of test examples = ', m_test)

train_images = images[:int(m)]
print('Train images shape:', train_images.shape)

test_images = images[int(m):]
print('Test images shape:', test_images.shape)

n_features = height*width
print(f'N^2 = n_features = h x w = {n_features}')

print(70* '-')

y = lfw_people.target
print('y has shape:', y.shape)

y_train = y[:int(m)]
print('y_train has shape:', y_train.shape)

y_test = y[int(m):]
print('y_test has shape:', y_test.shape)

# target_names = lfw_people.target_names
num_people = np.max(y) + 1
print('Number of people =', num_people)

Images shape: (400, 64, 64)
Each image has size: 64 x 64
----------------------------------------------------------------------
M = Number of examples =  320
M_test = Number of test examples =  80
Train images shape: (320, 64, 64)
Test images shape: (80, 64, 64)
N^2 = n_features = h x w = 4096
----------------------------------------------------------------------
y has shape: (400,)
y_train has shape: (320,)
y_test has shape: (80,)
Number of people = 40


## PCA

In [19]:
def extract_pca_features(images, load=False, num_pca_components=0.95):
        image_vectors = []
        for image in images:
            image_vectors.append(image.flatten())
        image_vectors = np.array(image_vectors)
        
        if load:
            pca = pickle.load(open(parameters_path + "\olivetti_pca.pkl", "rb"))
            pca_features = pca.transform(image_vectors)
            return pca_features
        else:
            print("Creating new PCA model...")
            pca = PCA(n_components = num_pca_components, svd_solver = 'full')
            pca.fit(image_vectors)

            pca_features = pca.transform(image_vectors)

            pca_features = np.array(pca_features)

            pickle.dump(pca, open(parameters_path + "\olivetti_pca.pkl", "wb"))
            
            return pca_features

In [20]:
pca_features_train = extract_pca_features(train_images,load=False, num_pca_components=0.95)
pca_features_test = extract_pca_features(test_images,load=True, num_pca_components=0.95)
print(f'PCA Features Train: {pca_features_train.shape}')
print(f'PCA Features Test: {pca_features_test.shape}')

Creating new PCA model...
PCA Features Train: (320, 112)
PCA Features Test: (80, 112)


In [21]:
def convert_image(image, width, height):
    # if image is RGB, convert to grayscale
    if len(image.shape) > 2:
        image = image[:,:,:3]
        image = rgb2gray(image)
    # resize image to width*height
    image = cv2.resize(image, (width, height))
    if (np.max(image) > 1):
        image = image / 255.0
    image = image.reshape((1, width*height))

    return image

## Testing

### KNN

In [22]:
def calculateDistance(x1, x2):
    distance = np.linalg.norm(x1-x2)
    return distance

def KNN(test_point, training_features, labels, k):
    distances = [calculateDistance(test_point, p) for p in training_features]
    
    #k_nearest: numpy array of size k holding indices of k nearest images
    k_nearest = np.argpartition(distances, k)[:k]
    # print(k_nearest)
    
    votes = np.zeros(num_people)

    for i in k_nearest:
        # print(i)
        # print(labels[i])
        # print('---')
        votes[labels[i]] += 1

    # print('---------------\n',votes)
    classification = np.argmax(votes)

    return classification

In [23]:
index = 0
test_point = pca_features_test[index]

classification = KNN(test_point, pca_features_train, y_train, 5)

print(f'Predicted: {classification}, Actual: {y_test[index]}')

Predicted: 11, Actual: 11


In [24]:
count = 0
for index in range(m_test):
    # Get image
    test_point = pca_features_test[index]

    classification = KNN(test_point, pca_features_train, y_train, 5)
    
    if(classification == y_test[index]):
        count += 1
    
print(f'Accuracy: {count/m_test*100}%')

Accuracy: 75.0%
