In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings

# Fix the state of NumPy's global random generator so runs are repeatable.
np.random.seed(42)

# Keep the rendered notebook free of warning chatter.
# NOTE(review): this hides every warning category, deprecation notices included.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.datasets import fetch_olivetti_faces

# Fetch the Olivetti faces; shuffling with a fixed random_state keeps the
# image order the same from one run to the next.
faces_data = fetch_olivetti_faces(shuffle=True, random_state=42)

# Pull out the two arrays used by the rest of the notebook:
# the pixel data and the per-image person labels.
faces, targets = faces_data.data, faces_data.target

When the dataset is arranged as a table, it is structured as follows:

- Rows = Individual face images (400 total)
- Columns = Pixel positions (4,096) plus person identifier
- Cells = Grayscale intensity values for each pixel

In [None]:
# Data definition: basic dimensions of the loaded arrays
n_samples, n_features = faces.shape
n_faces = np.unique(targets).size  # number of distinct person labels

# One print call, one line per fact
print(
    f"Dataset: {n_samples} images",
    f"Image size: 64x64 pixels = {n_features} dimensions",
    f"Number of different people: {n_faces}",
    sep="\n",
)

In [None]:
# Tabular view of the faces: one row per image, one column per pixel,
# followed by a final column holding the person identifier.

# Column labels pixel_0, pixel_1, ... for every pixel position
pixel_columns = [f'pixel_{col}' for col in range(faces.shape[1])]
faces_df = pd.DataFrame(faces, columns=pixel_columns)

# Append the target (person identifier) after the last pixel column
faces_df.insert(loc=faces_df.shape[1], column='person_id', value=targets)

print("First 5 rows of the DataFrame:")
faces_df.head()

The cell below displays actual photos of real people from the dataset: specifically, the first 12 images of the Olivetti faces dataset, in the shuffled order in which it was loaded.

In [None]:
# Show the first n_row * n_col original faces on a grid of subplots
n_row, n_col = 3, 4
fig, axes = plt.subplots(n_row, n_col, figsize=(2. * n_col, 2.26 * n_row))
fig.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)

for i, ax in enumerate(axes.flat):
    # Each row of `faces` is a flat pixel vector; reshape it into a 64x64 image
    ax.imshow(faces[i].reshape((64, 64)), cmap=plt.cm.gray)
    ax.set_title(f"Person #{targets[i]}", size=12)
    # Empty tick tuples remove the axis ticks from the image panels
    ax.set_xticks(())
    ax.set_yticks(())

fig.suptitle("Original Face Images (64x64 pixels = 4,096 dimensions each)",
             fontsize=16)
plt.show()

The original images above are represented in **4,096 dimensions (64 × 64 pixels)**.

The images below have been reconstructed using only **66 principal components**. Despite this significant reduction in dimensionality, the reconstructed images retain most of the important visual information, demonstrating how PCA effectively captures the underlying structure of the data.

In [None]:
from sklearn.decomposition import PCA

# Compress every face to 66 principal components, then map the compressed
# vectors back into pixel space so they can be viewed as images.
n_components = 66
pca = PCA(n_components=n_components, whiten=True, random_state=42)
faces_pca = pca.fit_transform(faces)
faces_reconstructed = pca.inverse_transform(faces_pca)

# Draw the reconstructions on the same grid layout as the original faces
# (n_row and n_col are defined in the earlier plotting cell).
fig, axes = plt.subplots(n_row, n_col, figsize=(2. * n_col, 2.26 * n_row))
fig.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)

for i, ax in enumerate(axes.flat):
    ax.imshow(faces_reconstructed[i].reshape((64, 64)), cmap=plt.cm.gray)
    ax.set_title(f"Person #{targets[i]}", size=12)
    # Empty tick tuples remove the axis ticks from the image panels
    ax.set_xticks(())
    ax.set_yticks(())

fig.suptitle(f"Reconstructed Faces Using {n_components} PCA Components", fontsize=16)
plt.show()

> Applications: Face recognition (as demonstrated above), as well as:

  - Netflix Recommendation System (latent space): compressing a massive user-item matrix into a lower-dimensional latent space.
  - Intel Sensor Compression
  - Gene Expression Analysis (NIH)

<img src="https://hellopm.co/wp-content/uploads/2024/07/hipertextual-si-te-vas-netflix-no-olvides-descargar-mi-actividad-mi-lista-2019814675.webp" width=500>