# Principal Component Analysis

In [17]:
import torch
import torchvision
import torchvision.transforms as transforms
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Explained Variance and Lossy Reconstruction

#### The Dataset

In [None]:
def _flatten_image(image_tensor):
    """Return the image tensor reshaped into a single 1-D vector."""
    return image_tensor.view(-1)


# Preprocessing applied to every image: convert to a tensor, then flatten it.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(_flatten_image),
])

# Both splits share the same storage location and preprocessing; only the
# `train` flag differs.
_mnist_options = dict(root='./data', download=True, transform=transform)
train_dataset = torchvision.datasets.MNIST(train=True, **_mnist_options)
test_dataset = torchvision.datasets.MNIST(train=False, **_mnist_options)

#### Sampling 1000 images (100 per digit class) from the training set

In [None]:
num_samples_per_class = 100

# Use a dedicated, seeded generator so that the same images are drawn on every
# "Restart & Run All"; the plots and written observations below depend on this
# particular sample.
sampling_seed = 42
sampling_rng = np.random.default_rng(sampling_seed)

targets = train_dataset.targets.numpy()

# Stratified draw: pick `num_samples_per_class` distinct indices for each label.
selected_indices = []
for digit in np.unique(targets):
    digit_indices = np.where(targets == digit)[0]
    sampled_indices = sampling_rng.choice(digit_indices, num_samples_per_class, replace=False)
    selected_indices.extend(sampled_indices)

# Mix the classes so the sample is not ordered by label.
sampling_rng.shuffle(selected_indices)

# Materialise the (image, label) pairs for the chosen indices.
sampled_data = [train_dataset[i] for i in selected_indices]

# Sanity check: every class should appear exactly `num_samples_per_class` times.
sampled_counts = Counter(label for _, label in sampled_data)
print("Sampled images per class:", sampled_counts)

#### PCA from Scratch

In [10]:
import numpy as np

class PrincipalComponentAnalysis:
    """PCA computed from the eigendecomposition of the sample covariance matrix.

    Parameters
    ----------
    num_components : int
        Number of leading principal components to keep.
    n_components : int, optional
        Keyword alias for ``num_components``. Other cells of this notebook
        construct the class with this spelling, so both are accepted.

    Attributes
    ----------
    mean_vector : ndarray or None
        Per-feature mean of the data passed to ``fit`` (``None`` before fitting).
    projection_matrix : ndarray or None
        Matrix whose columns are the kept eigenvectors of the covariance
        matrix, ordered by decreasing eigenvalue.
    eigen_vals : ndarray or None
        All eigenvalues of the covariance matrix in decreasing order (not
        truncated to ``num_components``).
    mean_, components_, n_components
        Read-only aliases: ``mean_`` is ``mean_vector``, ``components_`` is
        ``projection_matrix.T`` (one component per row), ``n_components`` is
        ``num_components``.

    Raises
    ------
    TypeError
        If neither ``num_components`` nor ``n_components`` is given.
    ValueError
        If both are given with different values.
    """

    def __init__(self, num_components=None, n_components=None):
        if num_components is None:
            num_components = n_components
        elif n_components is not None and n_components != num_components:
            raise ValueError(
                "num_components and n_components were both given with different values"
            )
        if num_components is None:
            raise TypeError("num_components (or its alias n_components) is required")

        self.num_components = num_components
        self.mean_vector = None
        self.projection_matrix = None
        self.eigen_vals = None

    @property
    def n_components(self):
        """Alias for ``num_components``."""
        return self.num_components

    @property
    def mean_(self):
        """Alias for ``mean_vector``."""
        return self.mean_vector

    @property
    def components_(self):
        """Kept principal axes, one per row (``projection_matrix`` transposed)."""
        if self.projection_matrix is None:
            return None
        return self.projection_matrix.T

    def _check_is_fitted(self):
        """Raise a clear error when the model is used before ``fit``."""
        if self.mean_vector is None or self.projection_matrix is None:
            raise RuntimeError(
                "This PrincipalComponentAnalysis instance is not fitted yet; call fit() first."
            )

    def fit(self, data):
        """Estimate the mean and the principal axes from ``data``.

        ``data`` is treated as one observation per row (``rowvar=False``).
        Returns ``self`` so calls can be chained.
        """
        self.mean_vector = np.mean(data, axis=0)
        centered_data = data - self.mean_vector
        covariance_matrix = np.cov(centered_data, rowvar=False)
        # eigh is used because the covariance matrix is symmetric; its
        # eigenvalues are re-sorted here into decreasing order.
        eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix)
        sorted_indices = np.argsort(eigen_vals)[::-1]
        self.eigen_vals = eigen_vals[sorted_indices]
        # Keep only the eigenvectors belonging to the largest eigenvalues.
        self.projection_matrix = eigen_vecs[:, sorted_indices][:, :self.num_components]
        return self

    def transform(self, data):
        """Project ``data`` onto the kept principal components."""
        self._check_is_fitted()
        centered_data = data - self.mean_vector
        return np.dot(centered_data, self.projection_matrix)

    def inverse_transform(self, projected):
        """Map projected data back to the original feature space.

        The result is the reconstruction from the kept components only, so it
        equals the original data only when no component was discarded.
        """
        self._check_is_fitted()
        return np.dot(projected, self.projection_matrix.T) + self.mean_vector

    def fit_transform(self, data):
        """Fit on ``data`` and return its projection."""
        self.fit(data)
        return self.transform(data)

# Gather the sampled image tensors into one array (one row per image).
X = np.array([pair[0].numpy() for pair in sampled_data])
print("Original dimension:", X.shape)

# Target dimensionalities to compare, and the projections obtained for each.
dimension_options = [500, 300, 150, 30]
transformed_data = {}

# Fit an independent PCA per target dimensionality and record its projection.
for dimension in dimension_options:
    pca_instance = PrincipalComponentAnalysis(num_components=dimension)
    projected = pca_instance.fit_transform(X)
    transformed_data[dimension] = projected
    print(f"Projection to {dimension} dimensions shape:", projected.shape)

#### Plot of explained variance vs. the number of principal components.

And

#### Visualize the samples using the first 2 PCs (using a scatter plot) and write your observations.

In [None]:
# --- Cumulative explained variance ---------------------------------------
# The eigenvalues of the covariance matrix are the variances along each
# principal axis; normalising them gives the fraction of variance explained.
X_centered = X - np.mean(X, axis=0)
cov_matrix = np.cov(X_centered, rowvar=False)
# Only the eigenvalues are needed here. A named throwaway is used instead of
# `_` so IPython's "last output" variable is not overwritten.
eigenvalues, _eigenvectors = np.linalg.eigh(cov_matrix)
eigenvalues = np.sort(eigenvalues)[::-1]

explained_variance_ratio = eigenvalues / eigenvalues.sum()
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

plt.figure(figsize=(8, 5))
plt.plot(np.arange(1, len(eigenvalues) + 1), cumulative_explained_variance, marker='o')
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance Ratio")
plt.title("Explained Variance vs. Number of Principal Components")
plt.grid(True)
plt.show()

# --- Scatter plot on the first two principal components ------------------
# The constructor argument is `num_components`; the previous `n_components=`
# spelling raised a TypeError against the class as defined in this notebook.
pca_2 = PrincipalComponentAnalysis(num_components=2)
X_pca2 = pca_2.fit_transform(X)

# Labels in the same order as the rows of X, used to colour the points.
labels = np.array([label for (_image, label) in sampled_data])

plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca2[:, 0], X_pca2[:, 1], c=labels, cmap='tab10', alpha=0.7)
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.title("Samples Visualized on First 2 Principal Components")
plt.colorbar(scatter, ticks=range(10), label="Digit Label")
plt.show()

#### Observations:
1. The cumulative explained variance graph shows a steep increase in variance explained by the first few components, indicating that most of the variance in the data is captured in a lower-dimensional subspace.
2. The scatter plot using the first 2 PCs reveals clustering according to digit labels, although some overlap exists, suggesting that while the 2D projection provides insight into separability, additional components may be needed for clearer distinctions.

#### Now, select any 5 images from these samples. Plot them before dimensionality reduction, and after projecting them back to the original space (do this for every type of final dimensions value). Write your observations.

In [None]:
sample_ids = [0, 1, 2, 3, 4]
dims_list = [500, 300, 150, 30]

# X is the matrix of sampled MNIST images built in the PCA cell above. It must
# not be replaced here: a previous `X = np.random.rand(100, 784)` line made
# this cell plot and reconstruct random noise instead of the digit images.

# One row per selected image; column 0 is the original, followed by one column
# per reconstruction dimensionality.
fig, axes = plt.subplots(nrows=len(sample_ids), ncols=len(dims_list) + 1, figsize=(10, 10))

for i, idx in enumerate(sample_ids):
    orig_img = X[idx].reshape(28, 28)
    axes[i, 0].imshow(orig_img, cmap='gray')
    axes[i, 0].set_title("Original")
    axes[i, 0].axis("off")

for j, d in enumerate(dims_list):
    pca_model = PrincipalComponentAnalysis(num_components=d)
    X_proj = pca_model.fit_transform(X)
    # Project back to pixel space: multiply by the transposed projection
    # matrix and add back the mean that was removed before projecting.
    X_reconstructed = np.dot(X_proj, pca_model.projection_matrix.T) + pca_model.mean_vector

    for i, idx in enumerate(sample_ids):
        rec_img = X_reconstructed[idx].reshape(28, 28)
        axes[i, j + 1].imshow(rec_img, cmap='gray')
        axes[i, j + 1].set_title(f"d={d}")
        axes[i, j + 1].axis("off")

plt.tight_layout()
plt.show()

#### Observations:
- When using a large number of components (e.g., 500), the reconstructed images closely resemble the originals, retaining fine details.
- As the component count decreases (e.g., 300, 150, and eventually 30), noticeable detail loss occurs.
- At very low dimensions (e.g., 30), the reconstructed images become blurry and lack contrast, suggesting that significant variance—and therefore important image details—is not retained.

### Classification Performance with vs without dimensionality reduction

In [None]:
# Draw the training subset with a dedicated, seeded generator so the reported
# metrics are reproducible across "Restart & Run All" (the MLP itself is
# already seeded through random_state).
subset_rng = np.random.default_rng(42)
train_indices_subset = subset_rng.choice(len(train_dataset), size=40000, replace=False)
train_samples = [train_dataset[i] for i in train_indices_subset]
test_samples = [test_dataset[i] for i in range(len(test_dataset))]

# Split the (image, label) pairs into feature matrices and label vectors.
X_train = np.stack([sample[0].numpy() for sample in train_samples])
y_train = np.array([sample[1] for sample in train_samples])
X_test = np.stack([sample[0].numpy() for sample in test_samples])
y_test = np.array([sample[1] for sample in test_samples])

# Baseline: MLP trained directly on the full-dimensional pixel vectors.
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=20, random_state=42)
mlp.fit(X_train, y_train)

y_pred = mlp.predict(X_test)

# Precision and recall are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

#### Now, perform dimensionality reduction on the train and test sets. Train a new MLP model for classification with the new train set and report the above metrics for the new test set.

In [None]:
pca_d = 300
# The constructor argument is `num_components`; the previous `n_components=`
# spelling raised a TypeError against the class as defined in this notebook.
pca_model = PrincipalComponentAnalysis(num_components=pca_d)

# Fit the PCA on the training set only, then apply the same projection to the
# test set so no test information is used when estimating the components.
X_train_reduced = pca_model.fit_transform(X_train)
X_test_reduced = pca_model.transform(X_test)

# Same architecture and seed as the baseline so only the input features differ.
mlp_reduced = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=20, random_state=42)
mlp_reduced.fit(X_train_reduced, y_train)

y_pred_reduced = mlp_reduced.predict(X_test_reduced)
accuracy = accuracy_score(y_test, y_pred_reduced)
precision = precision_score(y_test, y_pred_reduced, average="macro")
recall = recall_score(y_test, y_pred_reduced, average="macro")

print("Original Dimension:", X_train.shape[1])
print("Reduced Dimension:", pca_d)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

#### Perform PCA for the above by taking dimensions = 500, 300, 150 and 30 (each separately). Write your observations from the performance in each case.

In [None]:
dims = [500, 300, 150, 30]

# Repeat the PCA + MLP experiment for each target dimensionality.
for d in dims:
    # The constructor argument is `num_components`; the previous
    # `n_components=` spelling raised a TypeError against the class as defined
    # in this notebook.
    pca_model = PrincipalComponentAnalysis(num_components=d)
    # Components are estimated from the training set only and reused for test.
    X_train_reduced = pca_model.fit_transform(X_train)
    X_test_reduced = pca_model.transform(X_test)

    # Same architecture and seed every time so only the input dimension varies.
    mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=20, random_state=42)
    mlp_model.fit(X_train_reduced, y_train)

    y_pred_reduced = mlp_model.predict(X_test_reduced)
    acc = accuracy_score(y_test, y_pred_reduced)
    prec = precision_score(y_test, y_pred_reduced, average="macro")
    rec = recall_score(y_test, y_pred_reduced, average="macro")

    print(f"PCA Dimension: {d}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print("-" * 40)


#### Observations:
1. At lower dimensions, all three metrics (accuracy, precision, and recall) are higher than at higher dimensions.
2. A possible reason is that, at higher dimensions, the MLP also learns weights for the components associated with the less significant eigenvalues, which lowers accuracy on the higher-dimensional data. In simpler words, the unnecessary eigenvectors also influence the weights of the model.

### Report

#### We know from the Previous Plots:
- The cumulative explained variance plot showed that the first few principal components capture most of the total variance.
- A scatter plot of MNIST samples on the first two principal components revealed clustering by digit labels, demonstrating PCA’s ability to uncover structure despite some overlap.
- Comparing original and reconstructed images with different numbers of principal components illustrated the trade-off between dimensionality reduction and reconstruction quality—too few components (e.g., 30) led to noticeable information loss.

#### Benefits of PCA in Dimensionality Reduction:
- Eliminates redundant and noisy features.
- Improves storage efficiency, computation speed, and sometimes model generalization by retaining only the most significant variations.
- Helps prevent overfitting, especially when the number of features exceeds the number of samples.

#### Limitations of PCA:
- Struggles with highly nonlinear data structures (e.g., data lying on a non-linear manifold).
- May discard important low-variance features that carry meaningful signals for classification.

#### Assumptions and Potential Problems:
- PCA assumes that the directions of maximum variance contain the most useful information, which isn’t always true.
- In classification tasks, if the key distinguishing features have low variance while irrelevant factors (e.g., lighting changes in images) contribute high variance, PCA may prioritize the wrong features, leading to suboptimal representation.