<a href="https://colab.research.google.com/github/abdelmotlb/Machine-Learning-Algorithms/blob/main/Dimensionality-Reduction-with-application.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <font color='orange' size='7px'> ***Global***</font>

## *Libraries*

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive
import math
from PIL import Image


## *Variables*

In [None]:
numberOfImages = 400
ImageWidth = 112
ImageHeight = 92
maxbrightness = 255
numberOfFeatures = ImageWidth * ImageHeight
defaultPath = "/content"
facesPath = "/content/g"

# <font color='orange' size='7px'> ***Kaggle Data Configuration***</font>

## Hold content folder by kaggle


In [None]:
os.environ['KAGGLE_CONFIG_DIR'] = '/content/'

##  Faces Landing


In [None]:
!kaggle datasets download -d kasikrit/att-database-of-faces

Downloading att-database-of-faces.zip to /content
  0% 0.00/3.61M [00:00<?, ?B/s]
100% 3.61M/3.61M [00:00<00:00, 44.0MB/s]


In [None]:
import zipfile

# create faces directory
os.mkdir(facesPath)
with zipfile.ZipFile("att-database-of-faces.zip", 'r') as zip_ref:
  os.chdir(facesPath)
  zip_ref.extractall()

# reset default unzip directory
os.chdir(defaultPath)

## Import Non-Faces images from Kaggle


In [None]:
!kaggle datasets download -d theblackmamba31/landscape-image-colorization

Downloading landscape-image-colorization.zip to /content
100% 192M/192M [00:02<00:00, 112MB/s] 
100% 192M/192M [00:02<00:00, 82.9MB/s]


In [None]:
import zipfile

with zipfile.ZipFile("landscape-image-colorization.zip", 'r') as zip_ref:
  zip_ref.extractall()
# create a folder to hold all non-face images in

parent_dir = "/content/"
s0_dir = os.path.join(parent_dir, "s0")
s0s0_dir = os.path.join(s0_dir, "s0")
# Create the parent directory if it doesn't exist
if not os.path.exists(s0_dir):
  os.mkdir(s0_dir)
# Create the target directory
if not os.path.exists(s0s0_dir):
  os.mkdir(s0s0_dir)

# <font color='orange' size='7px'> ***Dataset Loader***</font>

## *Image reader*

In [None]:
# read specific image.
def readImage(filename):
    with open(filename, 'rb') as f:
        # Read magic number and skip it
        f.readline()
        # Read w, h
        width, height = map(int, f.readline().split())
        maxval = int(f.readline().strip())

        # Read pixel data
        pixels = [list(f.read(width)) for _ in range(height)]

    return width, height, maxval, pixels

## *Helper Functions*

In [None]:
import matplotlib.pyplot as plt

def getSubject(dirname):
    folder_name = os.path.basename(dirname)
    ImageSubject = int(folder_name[1:])
    return ImageSubject

def getCurrentPathOfFile(dirname, filename):
    return os.path.join(dirname, filename)

def generateColumnsNames():
    return ['p' + str(i) for i in range(40)]

def formulatedImage(pixels):
    return np.array(pixels).reshape(-1)

def plot_image(image_array, resized):
  '''
  parameter: 1d-array, bool value
  '''
  if resized:
    plt.imshow(image_array.reshape(112 // 2, 92 // 2), cmap='gray')  # Use 'cmap=None' if the image is colored
  else:
    plt.imshow(image_array.reshape(112, 92), cmap='gray')
  plt.axis('off')  # Turn off axis
  plt.show()

## *Data Loader*

In [None]:
def resize(dataMatrix):
  new_data_matrix = np.zeros([numberOfImages, int(numberOfFeatures / 4)])
  for image in range(numberOfImages):
    count = 0
    for row in range(0,ImageHeight,2):
      for col in range(0,ImageWidth,2):
        new_data_matrix[image][count] = (dataMatrix[image][row*ImageWidth+col] + dataMatrix[image][(row+1)*ImageWidth+col] + dataMatrix[image][row*ImageWidth+col+1] + dataMatrix[image][(row+1)*ImageWidth+col+1])/4
        count = count + 1
  return new_data_matrix

In [None]:
import matplotlib.pyplot as plt

def plot_image(image_array):
    plt.imshow(image_array.reshape(112, 92), cmap='gray')  # Use 'cmap=None' if the image is colored
    plt.axis('off')  # Turn off axis
    plt.show()

In [None]:
import imageio as img

def loadDataset(absoluteDirectoryPath):

    dataMatrix = np.empty((numberOfImages, numberOfFeatures))
    labels = np.empty(numberOfImages)

    instanceCounter = 0
    for dirname, _, filenames in os.walk(absoluteDirectoryPath):
        for filename in sorted(filenames):
          file_name, file_extension = os.path.splitext(filename)
          if file_extension != ".pgm":
            continue

          width, height, maxval, pixels = readImage(getCurrentPathOfFile(dirname, filename))
          dataMatrix[instanceCounter] = formulatedImage(pixels)
          labels[instanceCounter] = getSubject(dirname)
          instanceCounter = instanceCounter + 1

    dataMatrix = resize(dataMatrix)
    return dataMatrix, labels

## Dataset splitter

In [None]:
def split_dataset(matrix, labels, train_ratio):
  n = math.ceil(1.0/(1-train_ratio))

  combined_data = list(zip(matrix, labels))
  sorted_data = sorted(combined_data, key=lambda x: x[1])

  # Unpack the sorted data into separate matrices
  sorted_matrix_2d = np.array([row for row, label in sorted_data])
  sorted_labels = np.array([label for row, label in sorted_data])
  sorted_matrix_3d = [[] for _ in range(41)]
  X_test = []
  X_labels = []
  y_test = []

  for i in range(int(len(sorted_matrix_2d)/10)):
    for j in range(10):
      if(j % n):
        sorted_matrix_3d[int(sorted_labels[i*10+j])].append(sorted_matrix_2d[i*10+j])
        y_test.append(int(sorted_labels[i*10+j]))
      else:
        X_test.append(sorted_matrix_2d[i*10+j])
        X_labels.append(int(sorted_labels[i*10+j]))

  return sorted_matrix_3d, y_test, X_test, X_labels

# <font color='orange' size='7px'> ***PCA***</font>

## *Basic*

### PCA sub functions

In [None]:
def center_data(data_matrix):
  data_matrix_mean = np.mean(data_matrix, axis=0)
  return data_matrix - data_matrix_mean

def sort_eigen_values_and_vectors(eigen_values, eigen_vectors):
  sorted_indices = np.argsort(eigen_values)[::-1]
  sorted_eigen_values = eigen_values[sorted_indices]
  sorted_eigen_vectors = eigen_vectors[:, sorted_indices]
  return sorted_eigen_values, sorted_eigen_vectors

def choose_r(sorted_eigen_values, alpha):
  sum_all_eigen_values = np.sum(sorted_eigen_values)
  sum_till_r = 0.0
  r = sorted_eigen_values.size
  for i in range(sorted_eigen_values.size) :
    sum_till_r = sum_till_r + sorted_eigen_values[i]
    if sum_till_r / sum_all_eigen_values >= alpha:
      r = i
      break;
  return r + 1


### PCA main function

In [None]:
def calc_eig_vector(data_matrix):
  centered_data_matrix = center_data(data_matrix)

  cov_matrix = np.cov(centered_data_matrix.T, bias=True)

  eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)

  return sort_eigen_values_and_vectors(eigen_values, eigen_vectors)

def reduce_dimentions(sorted_eigen_values, sorted_eigen_vectors, alpha):
  r = choose_r(sorted_eigen_values, alpha)
  return sorted_eigen_vectors[:, :r]


## After dimentions reduction

In [None]:
def project(basis, data):
  ans = np.dot(basis.T, data.T).T
  return ans

def project_all_data(new_basis, training_data, testinig_data):
  new_training_data = project(new_basis, training_data)
  new_testinig_data = project(new_basis, testinig_data)
  return new_training_data.real, new_testinig_data.real


In [None]:
def cluster(new_training_data, training_labels, new_testinig_data, testing_labels, num_neigbours=1):
  knn_classifier = KNeighborsClassifier(n_neighbors=num_neigbours)
  knn_classifier.fit(new_training_data, training_labels)
  predictions = knn_classifier.predict(new_testinig_data)
  accuracy = accuracy_score(testing_labels, predictions)
  report = classification_report(testing_labels, predictions)

  print(predictions.shape, testing_labels.shape)

  incorrect_indices = [i for i, (true_label, predicted_label) in enumerate(zip(testing_labels, predictions)) if true_label != predicted_label]

  print("Indices of incorrect predictions:")
  print(incorrect_indices)

  return accuracy, report, incorrect_indices

# <font color='orange' size='7px'> ***LDA***</font>

## *Basic*

In [None]:
import scipy.linalg as la

class LDA:

  def __init__(self, num_features, num_dominant_vectors):
    self.num_features = num_features
    self.num_dominate_vectors = num_dominant_vectors
    self.num_classes = None
    self.projection_matrix = None

  def calc_means(self, matrix):
    means = np.zeros((self.num_classes, self.num_features))
    num_samples = np.zeros(self.num_classes)

    for i in range(1, self.num_classes):
      means[i] = np.mean(np.array(matrix[i]), axis = 0)
      num_samples[i] = len(matrix[i])

    overall_mean = np.mean(means[1:], axis = 0)
    return means, overall_mean, num_samples

  def calc_between_class_matrix(self, num_samples, classes_means, overall_mean):
    between_class_matrix = np.zeros((self.num_features,self.num_features))

    for i in range(1, self.num_classes):
      x = np.array(classes_means[i])
      diff_matrix = x - overall_mean
      c = num_samples[i] * np.outer(diff_matrix, diff_matrix)
      between_class_matrix += c

    return between_class_matrix

  def calc_within_class_matrix(self, matrix, classes_means):
    within_class_matrix = np.zeros((self.num_features, self.num_features))

    for i in range(1, self.num_classes):
      class_data = np.array(np.array(matrix[i]) - np.array(classes_means[i]))
      scatter_matrix = np.dot((class_data).T, class_data)
      within_class_matrix += scatter_matrix

    return within_class_matrix

  def fit(self, matrix):

    # set number of classes of the dataset
    self.num_classes = len(matrix)

    # calculate the means vector for each class
    classes_means, overall_mean, num_samples = self.calc_means(matrix)

    # calculate the between-class scatter matrix
    between_class_matrix = self.calc_between_class_matrix(num_samples, classes_means, overall_mean)

    # calculate the within-class scatter matrix
    within_class_matrix = self.calc_within_class_matrix(matrix, classes_means)

    # calculate eigenvalues and eigenvectors
    eigenvalues, eigenvectors = la.eig(la.pinv(within_class_matrix).dot(between_class_matrix))

    # eigenvectors without complex part
    eigenvectors = np.real(eigenvectors)

    # sort eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]

    # set the projection matrix
    self.projection_matrix = eigenvectors[:, sorted_indices[:self.num_dominate_vectors]]

  def tranform(self, matrix):
    self.new_matrix = []
    for i in range(len(matrix)):
      if(matrix[i] is None):
        continue
      class_data = np.array(matrix[i])
      for j in range(len(class_data)):
        x = np.dot(class_data[j], self.projection_matrix)
        self.new_matrix.append(x)


  def predict(self, new_matrix_labels, X_test, X_labels, num_neigbours):
    # Create a KNN classifier (you can adjust the 'n_neighbors' parameter)
    knn_classifier = KNeighborsClassifier(n_neighbors=num_neigbours)

    # Train the classifier on the training data
    knn_classifier.fit(self.new_matrix, new_matrix_labels)

    # project test data
    test = []
    for i in range(len(X_test)):
      class_data = np.array(X_test[i])
      c = np.dot(class_data, self.projection_matrix)
      test.append(c)

    # Make predictions on the test data
    predictions = knn_classifier.predict(test)

    # Evaluate the performance of the classifier
    accuracy = accuracy_score(X_labels, predictions)

    return predictions, accuracy



# <font color='orange' size='7px'> ***Basic algorithms driver Code***</font>

## *PCA Basic version Runner*

In [None]:
def convert_lsit_to_nparray(train_ratio, path="/content/g"):
  dataMatrix, labels = loadDataset(path)
  training_data, trainig_labels, testing_data, testing_labels = split_dataset(dataMatrix, labels, train_ratio)

  x = []
  for image in training_data:
    for row in image:
      x.append(row)

  training_data = np.array(x)

  y = []
  for image in testing_data:
    for row in image:
      y.append(row)
  testing_data = np.array(y)
  testing_data = testing_data.reshape(int((1-train_ratio) * numberOfImages), int(numberOfFeatures / 4))

  trainig_labels = np.array(trainig_labels)
  testing_labels = np.array(testing_labels)

  return training_data, trainig_labels, testing_data, testing_labels


In [None]:
import time

def pca(train_ratio, path="/content/g", k=1):
  training_data, trainig_labels, testing_data, testing_labels = convert_lsit_to_nparray(train_ratio, path)
  prev_time = time.time()  # Record the starting time
  sorted_eigen_values, sorted_eigen_vectors = calc_eig_vector(training_data)
  current_time = time.time()  # Record the current time
  pca_time = current_time - prev_time  # Calculate execution time
  alphas = [0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]
  for alpha in alphas:
    print(f"PCA with alpha = {alpha}")
    prev_time = time.time()  # Record the starting time
    new_basis = reduce_dimentions(sorted_eigen_values, sorted_eigen_vectors, alpha)
    new_training_data, new_testinig_data = project_all_data(new_basis, training_data, testing_data)
    accuracy, report, incorrect_indices = cluster(new_training_data, trainig_labels, new_testinig_data, testing_labels, k)
    current_time = time.time()  # Record the current time
    execution_time = current_time - prev_time  # Calculate execution time
    print(f"Execution time for alpha {alpha}: {execution_time + pca_time} seconds")
    print(f"Accuracy = {accuracy}")
    # print(f"Report is")
    # print(report)

In [None]:
pca(0.5)

PCA with alpha = 0.5
(200,) (200,)
Indices of incorrect predictions:
[0, 15, 18, 31, 34, 35, 36, 37, 45, 58, 75, 78, 81, 90, 91, 97, 98, 99, 100, 111, 114, 127, 130, 131, 133, 139, 142, 144, 146, 147, 149, 151, 153, 154, 155, 157, 170, 171, 172, 173, 174, 178, 182, 185, 188, 189]
Execution time for alpha 0.5: 6.122479438781738 seconds
Accuracy = 0.77
PCA with alpha = 0.6
(200,) (200,)
Indices of incorrect predictions:
[0, 18, 31, 45, 75, 78, 87, 90, 91, 100, 111, 114, 130, 133, 134, 139, 144, 151, 153, 154, 164, 170, 171, 172, 173, 174, 176, 178, 182, 185, 187, 188, 189, 194, 195, 199]
Execution time for alpha 0.6: 6.117707967758179 seconds
Accuracy = 0.82
PCA with alpha = 0.7
(200,) (200,)
Indices of incorrect predictions:
[0, 45, 75, 78, 97, 98, 111, 114, 139, 170, 171, 172, 174, 195]
Execution time for alpha 0.7: 6.115548133850098 seconds
Accuracy = 0.93
PCA with alpha = 0.8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(200,) (200,)
Indices of incorrect predictions:
[0, 75, 78, 97, 98, 139, 170, 171, 172, 174, 199]
Execution time for alpha 0.8: 6.232375860214233 seconds
Accuracy = 0.945
PCA with alpha = 0.85
(200,) (200,)
Indices of incorrect predictions:
[0, 75, 78, 98, 139, 170, 171, 172, 174, 199]
Execution time for alpha 0.85: 6.210130453109741 seconds
Accuracy = 0.95
PCA with alpha = 0.9
(200,) (200,)
Indices of incorrect predictions:
[0, 75, 78, 98, 139, 170, 171, 172, 174, 199]
Execution time for alpha 0.9: 6.11403751373291 seconds
Accuracy = 0.95
PCA with alpha = 0.95
(200,) (200,)
Indices of incorrect predictions:
[0, 75, 78, 139, 170, 171, 172, 174, 198, 199]
Execution time for alpha 0.95: 6.1181700229644775 seconds
Accuracy = 0.95


In [None]:
pca(train_ratio=0.5,k=7)

PCA with alpha = 0.5
(200,) (200,)
Indices of incorrect predictions:
[0, 1, 2, 3, 10, 12, 13, 15, 18, 19, 20, 21, 22, 24, 31, 35, 36, 37, 42, 45, 46, 52, 55, 58, 59, 70, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 87, 93, 95, 97, 98, 99, 110, 111, 114, 123, 124, 127, 129, 130, 131, 132, 133, 134, 135, 138, 139, 143, 146, 147, 149, 150, 151, 153, 154, 155, 156, 157, 158, 160, 161, 162, 163, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 184, 185, 187, 188, 189, 197]
Execution time for alpha 0.5: 7.121819496154785 seconds
Accuracy = 0.525
PCA with alpha = 0.6
(200,) (200,)
Indices of incorrect predictions:
[0, 15, 16, 19, 21, 31, 35, 36, 37, 40, 45, 46, 52, 55, 56, 57, 58, 59, 70, 72, 73, 74, 75, 77, 78, 80, 81, 82, 87, 95, 96, 97, 99, 101, 110, 111, 112, 113, 114, 120, 122, 129, 131, 132, 133, 134, 135, 138, 139, 143, 144, 150, 151, 153, 154, 155, 156, 157, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 184, 187, 188, 189, 197]
Executi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(200,) (200,)
Indices of incorrect predictions:
[0, 1, 59, 70, 72, 73, 77, 80, 81, 82, 96, 98, 110, 131, 135, 136, 137, 138, 139, 143, 150, 153, 154, 156, 157, 162, 163, 169, 170, 171, 172, 173, 174, 175, 177, 179, 181, 182, 183, 184, 197, 198, 199]
Execution time for alpha 0.9: 7.136718273162842 seconds
Accuracy = 0.785
PCA with alpha = 0.95
(200,) (200,)
Indices of incorrect predictions:
[0, 1, 59, 70, 72, 73, 77, 80, 81, 82, 98, 110, 111, 131, 135, 136, 137, 138, 139, 143, 150, 153, 154, 156, 157, 162, 163, 169, 170, 171, 172, 173, 174, 175, 176, 177, 179, 181, 182, 183, 184, 197, 198]
Execution time for alpha 0.95: 7.211335897445679 seconds
Accuracy = 0.785


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## *LDA Basic version Runner*

In [None]:
# prompt: test loaded data
import time

def Lda(train_ratio, num_neighbors):
  # Load the dataset
  dataMatrix, labels = loadDataset("/content/g")
  new_mat, new_labels, test_images, test_labels = split_dataset(dataMatrix, labels, train_ratio)
  prev_time = time.time()  # Record the starting time
  test_LDA = LDA(int(numberOfFeatures/4), 39)
  test_LDA.fit(new_mat)
  test_LDA.tranform(new_mat)
  predictions, accuracy = test_LDA.predict(new_labels, test_images, test_labels, num_neighbors)
  current_time = time.time()  # Record the current time
  execution_time = current_time - prev_time  # Calculate execution time
  print(f"Accuracy: {accuracy:.2f}")
  print(f"Execution time: {execution_time} seconds")
  # You can also print a classification report for more detailed metrics
  # print("Classification Report:\n")



In [None]:
Lda(0.5, 1)
Lda(0.5, 3)
Lda(0.5, 5)
Lda(0.5, 7)
Lda(0.5, 11)

Accuracy: 0.97
Execution time: 43.94591021537781 seconds
Accuracy: 0.96
Execution time: 43.52052640914917 seconds
Accuracy: 0.96
Execution time: 44.77238917350769 seconds
Accuracy: 0.96
Execution time: 43.85331916809082 seconds
Accuracy: 0.93
Execution time: 44.999974489212036 seconds


# <font color='orange' size='7px'> ***Face vs non-face***</font>

## Convert png images to pgm and read them

In [None]:
def jpg_to_pgm(jpg_path, pgm_path):
  """
  Converts a JPG image to a PGM image using NumPy and Pillow.

  Args:
      jpg_path (str): Path to the JPG image.
      pgm_path (str): Path to save the PGM image.
  """
  # Open the JPG image with Pillow and convert to grayscale
  img = Image.open(jpg_path).convert('L')

  resized_img = img.resize((ImageWidth, ImageHeight), resample=Image.ANTIALIAS)

  # Get image data as a NumPy array
  resized_data = np.array(img)

  # Prepare PGM header (assuming maximum grayscale value is 255)
  header = f"P5\n{ImageWidth} {ImageHeight}\n255\n"

  # Save the PGM image
  with open(pgm_path, 'wb') as f:
    f.write(header.encode("utf-8"))
    f.write(resized_data.flatten())


In [None]:
def convert_folder_jpg_to_pgm():
  abs_path = "/content/landscape Images/gray/"
  count = 0
  for dirname, _, filenames in os.walk(abs_path):
      for filename in sorted(filenames):

        new_file_name, file_extension = os.path.splitext(filename)
        if file_extension != ".jpg":
          continue
        count = count + 1
        if count == 401:
          return
        jpg_to_pgm(abs_path + filename, "/content/s0/s0/"  + new_file_name+".pgm")


In [None]:
convert_folder_jpg_to_pgm()

  resized_img = img.resize((ImageWidth, ImageHeight), resample=Image.ANTIALIAS)


## PCA for Non-face vs Face

In [None]:
def pca_face_vs_non_face(training_data, trainig_labels, testing_data, testing_labels):
  sorted_eigen_values, sorted_eigen_vectors = calc_eig_vector(training_data)
  new_basis = reduce_dimentions(sorted_eigen_values, sorted_eigen_vectors, 0.8)
  new_training_data, new_testinig_data = project_all_data(new_basis, training_data, testing_data)
  accuracy, report, incorrect_indices = cluster(new_training_data, trainig_labels, new_testinig_data, testing_labels)
  print(f"Accuracy = {accuracy}")
  # for incorrect_index in incorrect_indices:
  #   plot_image(testing_data[incorrect_index], resized=True)
  # print(f"Report is")
  # print(report)


In [None]:
# read faces dataset
face_training, face_training_labels, face_testing, face_testing_labels = convert_lsit_to_nparray(0.7)
face_training_labels = np.ones(face_training_labels.shape[0])
face_testing_labels = np.ones(face_testing_labels.shape[0])

# read non-face dataset
non_face_training, non_face_training_labels, non_face_testing, non_face_testing_labels = convert_lsit_to_nparray(0.7, "/content/s0/")
print(len(non_face_training))
non_face_training_labels = np.zeros(non_face_training_labels.shape[0])
non_face_testing_labels = np.zeros(non_face_testing_labels.shape[0])

# form a testing dataset from both calsses !!!!!!! will not change
testing_data = np.concatenate((face_testing, non_face_testing), axis=0)
testing_labels = np.concatenate((face_testing_labels, non_face_testing_labels), axis=0)

non_face_pool_size =  non_face_training.shape[0]

non_face_ratios = np.array([0.3,0.5, 0.7, 1])
for non_face_ratio in non_face_ratios:
  print(f"Number of non-face images = {int(non_face_ratio * non_face_pool_size)} and number of faces = {face_training.shape[0]}")
  training_data = np.concatenate((face_training, non_face_training[:int(non_face_ratio * non_face_pool_size)]), axis=0)
  training_labels = np.concatenate((face_training_labels, non_face_training_labels[:int(non_face_ratio * non_face_pool_size)]), axis=0)
  pca_face_vs_non_face(training_data, training_labels, testing_data, testing_labels)

280
Number of non-face images = 84 and number of faces = 280
(240,) (240,)
Indices of incorrect predictions:
[179, 223]
Accuracy = 0.9916666666666667
Number of non-face images = 140 and number of faces = 280
(240,) (240,)
Indices of incorrect predictions:
[]
Accuracy = 1.0
Number of non-face images = 196 and number of faces = 280
(240,) (240,)
Indices of incorrect predictions:
[]
Accuracy = 1.0
Number of non-face images = 280 and number of faces = 280
(240,) (240,)
Indices of incorrect predictions:
[]
Accuracy = 1.0


## LDA for Non-face vs Face

In [None]:
def lda_data_config(face_training):
  images = []
  labels = []
  for i in range(len(face_training)):
    mat = face_training[i]
    for image in mat:
      images.append(image)
      labels.append(1)
  return images, labels

In [None]:
def divide_non_face(matrix, ratio):
  size = len(matrix)
  non_face_training = matrix[:int(size * ratio),:]
  non_face_testing = matrix[int(size * ratio):,:]
  non_face_training_labels = [2] * len(non_face_training)
  non_face_testing_labels = [2] * len(non_face_testing)
  return non_face_training, non_face_training_labels, non_face_testing, non_face_testing_labels


In [None]:
# load face dataset
data_matrix, labels = loadDataset("/content/g")
face_training, face_training_labels, face_testing, face_testing_labels = split_dataset(data_matrix, labels, 0.7)
face_training, face_training_labels = lda_data_config(face_training)
face_testing_labels = [1 for _ in face_testing_labels]

# load non face dataset
non_face, non_face_labels = loadDataset("/content/s0/")
print(len(non_face))
non_face_training, non_face_training_labels, non_face_testing, non_face_testing_labels = divide_non_face(non_face, 0.7)

# form a testing dataset from both calsses will not change
testing_data = np.concatenate((face_testing, non_face_testing), axis=0)
testing_labels = np.concatenate((face_testing_labels, non_face_testing_labels), axis=0)

non_face_pool_size =  non_face_training.shape[0]

non_face_ratios = np.array([0.3, 0.5, 0.7, 1])
for non_face_ratio in non_face_ratios:
  print(f"Number of non-face images = {int(non_face_ratio * non_face_pool_size)} and number of faces = {len(face_training)}")
  training_data = [None, face_training, non_face_training[:int(non_face_ratio * non_face_pool_size)]]
  training_labels = np.concatenate((face_training_labels, non_face_training_labels[:int(non_face_ratio * non_face_pool_size)]), axis=0)
  test_LDA = LDA(int(numberOfFeatures / 4), 1)
  test_LDA.fit(training_data)
  test_LDA.tranform(training_data)
  predictions, accuracy = test_LDA.predict(training_labels, testing_data, testing_labels, 1)
  print(f"Accuracy: {accuracy:.2f}")

  # You can also print a classification report for more detailed metrics
  # print("Classification Report:\n", report)

400
Number of non-face images = 84 and number of faces = 280
Accuracy: 0.87
Number of non-face images = 140 and number of faces = 280
Accuracy: 0.82
Number of non-face images = 196 and number of faces = 280
Accuracy: 0.84
Number of non-face images = 280 and number of faces = 280
Accuracy: 0.83


# <font color='orange' size='7px'> ***Bonus***</font>

## Updated Training Ratio PCA


In [None]:
pca(0.7)

PCA with alpha = 0.5
(120,) (120,)
Indices of incorrect predictions:
[0, 9, 20, 21, 22, 48, 52, 54, 57, 59, 70, 78, 85, 86, 92, 102, 103, 104, 113]
Execution time for alpha 0.5: 6.006243944168091 seconds
Accuracy = 0.8416666666666667
PCA with alpha = 0.6
(120,) (120,)
Indices of incorrect predictions:
[0, 78, 83, 102, 104, 112, 113]
Execution time for alpha 0.6: 5.989202499389648 seconds
Accuracy = 0.9416666666666667
PCA with alpha = 0.7
(120,) (120,)
Indices of incorrect predictions:
[0, 83, 102, 104]
Execution time for alpha 0.7: 5.989206552505493 seconds
Accuracy = 0.9666666666666667
PCA with alpha = 0.8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(120,) (120,)
Indices of incorrect predictions:
[0, 83, 102]
Execution time for alpha 0.8: 6.089878797531128 seconds
Accuracy = 0.975
PCA with alpha = 0.85
(120,) (120,)
Indices of incorrect predictions:
[0, 83, 102]
Execution time for alpha 0.85: 5.991519212722778 seconds
Accuracy = 0.975
PCA with alpha = 0.9
(120,) (120,)
Indices of incorrect predictions:
[0, 83, 102]
Execution time for alpha 0.9: 5.991412878036499 seconds
Accuracy = 0.975
PCA with alpha = 0.95
(120,) (120,)
Indices of incorrect predictions:
[0, 83, 102]
Execution time for alpha 0.95: 5.9933998584747314 seconds
Accuracy = 0.975


## Updated Training Ratio LDA


In [None]:
Lda(0.7, 1)

Accuracy: 0.99


## PCA Variation

In [None]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.decomposition import KernelPCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Splitting data into training and testing sets
X_train, y_train, X_test, y_test = convert_lsit_to_nparray(0.7, "/content/g")
components = [18, 28, 46, 84]

for component in components:
  # Defining a kernel PCA pipeline with a K-Nearest Neighbors classifier
  kpca_knn_pipeline = Pipeline([
      ("scaler", StandardScaler()),
      ("kpca", KernelPCA(n_components=component, kernel='rbf')),  # Using RBF kernel
      ("knn", KNeighborsClassifier(n_neighbors=3))  # KNN classifier
  ])

  # Training the pipeline
  kpca_knn_pipeline.fit(X_train, y_train)

  # Evaluating the pipeline
  train_accuracy = kpca_knn_pipeline.score(X_train, y_train)
  test_accuracy = kpca_knn_pipeline.score(X_test, y_test)

  print(f"Train Accuracy: {train_accuracy:.2f}")
  print(f"Test Accuracy: {test_accuracy:.2f}")


# reference.1 https://www.baeldung.com/cs/intuition-behind-kernels-in-machine-learning#the-mathematics-of-kernels

# reference.2 https://www.baeldung.com/cs/kernel-principal-component-analysis

Train Accuracy: 0.96
Test Accuracy: 0.89
Train Accuracy: 0.97
Test Accuracy: 0.93
Train Accuracy: 0.97
Test Accuracy: 0.93
Train Accuracy: 0.97
Test Accuracy: 0.93


## LDA Variation

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

ratio = [0.5,0.7]
for r in ratio:
  training_data, trainig_labels, testing_data, testing_labels = convert_lsit_to_nparray(r)

  fld = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.1)
  fld.fit(training_data, trainig_labels)
  y_pred = fld.predict(testing_data)
  acc = accuracy_score(testing_labels, y_pred)

  print(f"With split ratio = {r} Accuracy: {acc}")

With split ratio = 0.5 Accuracy: 0.96
With split ratio = 0.7 Accuracy: 0.9916666666666667
