In [49]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [59]:
# Code references: https://www.kaggle.com/code/yairhadad1/detect-outliers-with-pca
# https://www.geeksforgeeks.org/reduce-data-dimentionality-using-pca-python/
# https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html

# Define datasets for two scenarios for comparison
X_wo_outliers = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
X_outliers = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [12, 16]])
# Scale the datasets
scaler_wo_outliers = StandardScaler()
scaler_outliers = StandardScaler()
X_wo_outliers_scaled = scaler_wo_outliers.fit_transform(X_wo_outliers)
X_outliers_scaled = scaler_outliers.fit_transform(X_outliers)

# Use PCA to fit the scaled datasets and compute explained variance ratio
pca_wo_outliers = PCA()
pca_outliers = PCA()
pca_wo_outliers.fit(X_wo_outliers_scaled)
pca_outliers.fit(X_outliers_scaled)

# Explained Variance Ratio
explained_variance_ratio_wo_outliers = pca_wo_outliers.explained_variance_ratio_
explained_variance_ratio_outliers = pca_outliers.explained_variance_ratio_

# Print Explained Variance Ratios
print("Explained Variance Ratio Without Outliers:\n", explained_variance_ratio_wo_outliers)
print("Explained Variance Ratio With Outliers:\n", explained_variance_ratio_outliers)

# Compute Reconstruction Error
# Transform data to lower dimensions and back to original space
X_wo_outliers_reconstructed = pca_wo_outliers.inverse_transform(pca_wo_outliers.transform(X_wo_outliers_scaled))
X_outliers_reconstructed = pca_outliers.inverse_transform(pca_outliers.transform(X_outliers_scaled))

# Compute Mean Squared Error between original and reconstructed data
reconstruction_error_wo_outliers = mean_squared_error(X_wo_outliers_scaled, X_wo_outliers_reconstructed)
reconstruction_error_outliers = mean_squared_error(X_outliers_scaled, X_outliers_reconstructed)

# Print Reconstruction Errors
print("Reconstruction Error Without Outliers:", reconstruction_error_wo_outliers)
print("Reconstruction Error With Outliers:", reconstruction_error_outliers)


Explained Variance Ratio Without Outliers:
 [1.00000000e+00 6.23711839e-34]
Explained Variance Ratio With Outliers:
 [0.99714028 0.00285972]
Reconstruction Error Without Outliers: 2.465190328815662e-32
Reconstruction Error With Outliers: 9.334741699201884e-32
