In [None]:
import os
from matplotlib import pyplot as plt
from PIL import Image

import pandas as pd
import numpy as np

from sklearn.datasets import fetch_olivetti_faces
from sklearn.decomposition import PCA


import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
# Fetch the Olivetti faces dataset and keep its flattened pixel rows in a DataFrame.
df_faces = pd.DataFrame(fetch_olivetti_faces().data)

In [None]:
# This dataset has 400 images, each 64x64 = 4096 pixels, stored one image per row.
# Note that the earlier datasets had more data points than features.
# Here the situation is reversed: far more features (4096) than data points (400).

df_faces

In [None]:
# Show the face stored in row 0 of df_faces.
# A row is a flat vector of pixels, so fold it back into a 64x64 grid before plotting.

image_original = df_faces.iloc[0].to_numpy().reshape(64, 64)
px.imshow(image_original)

In [None]:
# PCA can keep at most min(#datapoints, #features) components. This dataset has
# only 400 data points, so asking for 4096 components is expected to fail.
# The failure is the point of this cell, so catch it and show the message
# instead of letting the exception abort a "Restart & Run All".

pca = PCA(4096)
try:
    faces_proj = pca.fit_transform(df_faces)
except ValueError as err:
    print(f"PCA with 4096 components failed as expected: {err}")

In [None]:
# The largest usable number of components is min(#datapoints, #features).
# Derive it from the data instead of hard-coding it; for this dataset it is 400.

max_components = min(df_faces.shape)
pca = PCA(n_components=max_components)
faces_proj = pca.fit_transform(df_faces)

In [None]:
# Plot how much of the total variance is accounted for as principal components are added.
# np.cumsum gives the cumulative sum of the per-component explained-variance percentages.
# Reading off the curve, roughly 150 components are needed to account for around 95% of the variance.

cumulative_variance = np.cumsum(pca.explained_variance_ratio_ * 100)

# Entry i of the cumulative sum covers the first i + 1 components, so number the
# x axis from 1; without an explicit x the plot would start counting at 0.
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig = px.line(x=component_counts, y=cumulative_variance)

fig.update_layout(
    xaxis_title="Number of components",
    yaxis_title="Explained variance")

fig.show()

In [None]:
# Map the projected faces back to the original pixel space.
# NOTE: this uses `pca` and `faces_proj` from the earlier cells (the PCA(400) fit),
# not a 10-component model, so it presumably reproduces the input images almost exactly.

faces_recovered = pca.inverse_transform(faces_proj)

In [None]:
# Check the shape of the reconstruction; it is expected to match df_faces
# (one row per image, one column per pixel).
faces_recovered.shape

In [None]:
# Reconstruct the first face from only the first 10 principal components.
# Fit a 10-component PCA, project the faces onto it, map the projection back to
# the original pixel space, and reshape the first row into a 64x64 image.

# For data of this shape and so few components, PCA's default svd_solver='auto'
# may pick the randomized SVD; fix random_state so reruns give the same image.
pca_10 = PCA(n_components=10, random_state=0)
pca_10_reduced = pca_10.fit_transform(df_faces)
pca_10_recovered = pca_10.inverse_transform(pca_10_reduced)
image_pca_10 = pca_10_recovered[0].reshape([64, 64])

In [None]:
# Plot the original image next to its reconstruction from the first 10 PCA components.

# Create a figure with 1 row and 2 columns of subplots
fig, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original image
ax1.imshow(image_original)
ax1.set_title("Original")

# Right panel: the reconstruction from 10 components
ax2.imshow(image_pca_10)
ax2.set_title("10 components")

# Render the figure
plt.show()

In [None]:
# Reconstruct the first face from the first 150 principal components.
# Fit a 150-component PCA, project the faces onto it, map the projection back to
# the original pixel space, and reshape the first row into a 64x64 image.

# For data of this shape and this component count, PCA's default svd_solver='auto'
# may pick the randomized SVD; fix random_state so reruns give the same image.
pca_150 = PCA(n_components=150, random_state=0)
pca_150_reduced = pca_150.fit_transform(df_faces)
pca_150_recovered = pca_150.inverse_transform(pca_150_reduced)
image_pca_150 = pca_150_recovered[0, :].reshape([64, 64])

In [None]:
# Plot the original image and its reconstructions from the first 10 and 150 PCA components.
# As the number of PCA components increases, the image clarity also increases.
# However, even with 150 components, the reconstructed image quality is not very good.

# Create a figure with 1 row and 3 columns of subplots
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)

# Pair each subplot with the image it shows and a title saying which one it is
panels = [
    (ax1, image_original, "Original"),
    (ax2, image_pca_10, "10 components"),
    (ax3, image_pca_150, "150 components"),
]
for ax, image, title in panels:
    ax.imshow(image)
    ax.set_title(title)

# Render the figure
plt.show()

In [None]:
# Reconstruct the first face from the first 300 principal components.
# Fit a 300-component PCA, project the faces onto it, map the projection back to
# the original pixel space, and reshape the first row into a 64x64 image.

# For data of this shape and this component count, PCA's default svd_solver='auto'
# may pick the randomized SVD; fix random_state so reruns give the same image.
pca_300 = PCA(n_components=300, random_state=0)
pca_300_reduced = pca_300.fit_transform(df_faces)
pca_300_recovered = pca_300.inverse_transform(pca_300_reduced)
image_pca_300 = pca_300_recovered[0, :].reshape([64, 64])

In [None]:
# Compare the original face with its reconstructions from 10, 150 and 300 PCA components.
# With 300 components we get a reasonably good reconstructed image.

# The figure size is passed to plt.subplots so it applies to this figure;
# a separate plt.figure(...) call would open a new, empty figure instead.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(10, 3))

# Pair each subplot with the image it shows and a title saying which one it is
panels = [
    (ax1, image_original, "Original"),
    (ax2, image_pca_10, "10 components"),
    (ax3, image_pca_150, "150 components"),
    (ax4, image_pca_300, "300 components"),
]
for ax, image, title in panels:
    ax.axis('off')
    ax.imshow(image)
    ax.set_title(title)

plt.show()