# MNIST PCA Analysis

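This notebook performs Principal Component Analysis (PCA) on a CSV file found in the current directory. The CSV is expected to hold one flattened image per row (e.g. 784 pixel columns for MNIST), optionally with a label column, which is dropped before PCA. It creates: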

- A bar plot of variance explained by each principal component.
- A combined bar + cumulative variance plot.
- A grid of scatter plots visualizing relationships between pairs of principal components (PC1 vs PC2, PC3 vs PC4, ...).

The notebook automatically picks the first `.csv` file (in alphabetical order) in the current directory. To target a specific file instead, edit the `csv_file` variable in the first code cell.

In [None]:
# Imports and CSV detection
import os, glob
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Collect every CSV in the working directory; sorting makes the choice deterministic
csv_files = sorted(glob.glob("*.csv"))
if not csv_files:
    raise FileNotFoundError("No CSV files found in current directory. Please place your MNIST CSV file here.")

# Default to the alphabetically first match (edit `csv_file` to target another file)
csv_file = csv_files[0]
print(f"Using CSV file: {csv_file}")

# Note: this reads the ENTIRE file into memory, then reports its size and previews it
df = pd.read_csv(csv_file)
print('DataFrame shape:', df.shape)
df.head()

In [None]:
# Prepare data: assume each row is an image flattened to 784 columns (or more generally numeric pixels)
# Only numeric columns are used as PCA features; non-numeric columns are ignored.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Numeric columns whose name (case-insensitive) looks like a label are excluded from the
# features, since this PCA is unsupervised. str() guards against non-string column names.
possible_label_cols = [c for c in numeric_cols if str(c).lower() in ('label', 'labels', 'target', 'y')]
if possible_label_cols:
    print('Dropping possible label columns:', possible_label_cols)
    numeric_cols = [c for c in numeric_cols if c not in possible_label_cols]

# Keep the first detected label column (or None) so later cells can color points by it.
y = df[possible_label_cols[0]].values if possible_label_cols else None

X = df[numeric_cols].values.astype(float)
print('Feature matrix shape:', X.shape)

# Fail early with a clear message instead of a cryptic error from X.max() or pca.fit().
if X.size == 0:
    raise ValueError(f"No numeric feature data found in '{csv_file}' (feature matrix shape: {X.shape}).")
if np.isnan(X).any():
    raise ValueError(
        f"Feature matrix contains {int(np.isnan(X).sum())} missing value(s); "
        "PCA cannot handle NaN. Drop or impute them before continuing."
    )

# Normalize pixel values if range suggests image pixels (0-255).
# This is a heuristic: any maximum above 1.0 triggers a division by 255.
if X.max() > 1.0:
    X = X / 255.0
    print('Normalized X by dividing by 255.')

# If flattened but user wants image-form, they can reshape later:
# images = X.reshape(-1, 28, 28)  # if appropriate


In [None]:
# Perform PCA
# PCA can extract at most min(n_samples, n_features) components, so the request of up to
# 100 PCs is capped by BOTH dimensions. Capping by the feature count alone makes PCA raise
# on CSVs that have fewer than 100 rows.
n_components = min(100, X.shape[0], X.shape[1])
pca = PCA(n_components=n_components, svd_solver='randomized', random_state=42)  # fixed seed for reproducibility
pca.fit(X)

# Fraction of the total variance captured by each component, in component order.
explained_variance = pca.explained_variance_ratio_
print('Explained variance shape:', explained_variance.shape)

# Bar plot of explained variance per PC (components are numbered from 1 on the x-axis)
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(range(1, len(explained_variance) + 1), explained_variance)
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('Explained Variance Ratio by Principal Component')
fig.tight_layout()
plt.show()

In [None]:
# Bar plot + Cumulative variance curve
# cum_variance[i] is the share of total variance explained by the first i+1 components.
cum_variance = np.cumsum(explained_variance)
component_ids = range(1, len(explained_variance) + 1)  # 1-based component numbers for the x-axis

fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.bar(component_ids, explained_variance, label='Explained Variance')
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio')
ax1.set_title('Explained Variance and Cumulative Explained Variance')
ax1.set_ylim(0, explained_variance.max() * 1.2)  # leave 20% headroom above the tallest bar

# Second y-axis sharing the same x-axis, so the cumulative curve gets its own 0-1 scale.
ax2 = ax1.twinx()
ax2.plot(component_ids, cum_variance, marker='o', linestyle='-', label='Cumulative Variance')
ax2.set_ylabel('Cumulative Explained Variance Ratio')
ax2.set_ylim(0, 1.05)

# Legends: merge the handles of both axes into a single legend box
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

plt.tight_layout()
plt.show()

# Print number of components needed for common thresholds.
# Only a limited number of components was computed, so a threshold may be out of reach;
# in that case searchsorted would return len(cum_variance) and the "+ 1" would report a
# component count that does not exist. Report such thresholds explicitly instead.
total_explained = cum_variance[-1]
for thr in [0.5, 0.75, 0.90, 0.95, 0.99]:
    if total_explained < thr:
        print(
            f"{thr:.0%} variance is not reached with the {len(cum_variance)} computed components "
            f"(they explain {total_explained:.1%})"
        )
        continue
    # First index whose cumulative variance is >= thr, converted to a 1-based count.
    k = int(np.searchsorted(cum_variance, thr)) + 1
    print(f"Number of components to explain {thr:.0%} variance: {k}")

In [None]:
# Scatter plots of PC pairs (PC1 vs PC2, PC3 vs PC4, ...)
# Project data to the first few PCs for visualization (up to the first 8).
# The cap comes from the fitted model, so the pair indices below can never exceed the
# number of projected columns.
num_plot_pcs = min(8, pca.n_components_)
X_pca = pca.transform(X)[:, :num_plot_pcs]

# Consecutive, non-overlapping pairs (0, 1), (2, 3), ...; an unpaired trailing PC is skipped.
pairs = [(i, i + 1) for i in range(0, num_plot_pcs - 1, 2)]
n_pairs = len(pairs)

if n_pairs == 0:
    # plt.subplots() rejects a zero-row grid, so skip plotting when no pair can be formed.
    print('Need at least 2 principal components to draw pair plots; found', num_plot_pcs)
else:
    # Determine grid size: two plots per row, rounding the row count up.
    cols = 2
    rows = (n_pairs + cols - 1) // cols
    # squeeze=False always returns a 2D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(rows, cols, figsize=(6*cols, 4*rows), squeeze=False)
    axes = axes.ravel()

    for ax, (i, j) in zip(axes, pairs):
        ax.scatter(X_pca[:, i], X_pca[:, j], s=8, alpha=0.6)
        ax.set_xlabel(f'PC{i+1}')
        ax.set_ylabel(f'PC{j+1}')
        ax.set_title(f'PC{i+1} vs PC{j+1}')

    # Turn off any unused subplots (happens when the number of pairs is odd)
    for unused_ax in axes[n_pairs:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# If you want to color points by a label (if available), provide label array 'y' and use ax.scatter(..., c=y)


## Next steps & Tips

- If you have labels for a subset of the dataset, color the scatter plots by label to see which clusters correspond to which digits.
- Consider using **t-SNE** or **UMAP** for improved 2D visualization of complex manifolds.
- For better clustering, train an **autoencoder** and cluster the latent space.
- To save the PCA model and reuse it: use `joblib.dump(pca, 'pca_model.joblib')`.

Possible extensions: modify this notebook to target a specific filename (instead of picking the first CSV), or add t-SNE/UMAP and clustering cells.
