# PCA on MNIST (CSV from Kaggle)
This notebook applies PCA to the MNIST handwritten-digit data in `mnist_train.csv`, where each 28×28 image is stored as a 784-dimensional pixel vector. It standardizes the pixels, reduces them to 2 dimensions, reports the percentage of variance explained by the two principal components, and visualizes the projection with points colored by class label.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Read the MNIST training CSV. The file is looked up relative to the working
# directory; substitute a full path here if it lives somewhere else.
df = pd.read_csv('mnist_train.csv')

# Layout of the table: column 0 holds the label, all remaining columns hold
# the pixel values.
pixel_columns = df.iloc[:, 1:]
label_column = df.iloc[:, 0]
X = pixel_columns.values
y = label_column.values

# Quick sanity check of what was loaded.
print('Data shape:', X.shape)
print('Labels shape:', y.shape)
print('Unique labels:', np.unique(y))

In [None]:
# Standardize every pixel column before PCA. The fitted scaler is kept in
# `scaler` so the same per-column statistics can be reused later.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print('Scaled data shape:', X_scaled.shape)

In [None]:
# Fit a two-component PCA on the standardized pixels and project the data
# onto those components in a single call.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# explained_variance_ratio_ holds fractions; scale them to percentages.
explained_variance = 100 * pca.explained_variance_ratio_

# Report the share of each component and their combined share.
for summary_line in (
    f'Explained variance by PC1: {explained_variance[0]:.2f}%',
    f'Explained variance by PC2: {explained_variance[1]:.2f}%',
    f'Total variance (PC1 + PC2): {explained_variance.sum():.2f}%',
):
    print(summary_line)

In [None]:
# Plot only a random subset of the projected points to keep plotting fast.
# When the data has no more than `sample_size` rows, every row is plotted.
sample_size = 5000
n_points = X_pca.shape[0]
if n_points > sample_size:
    # Fixed seed so the same rows are selected on every run.
    rng = np.random.RandomState(42)
    idx = rng.choice(n_points, sample_size, replace=False)
else:
    idx = np.arange(n_points)

# Select the subset once, outside the loop: indexing with an integer array
# builds a new array each time, so repeating it per digit is wasted work.
X_plot = X_pca[idx]
y_plot = y[idx]

fig, ax = plt.subplots(figsize=(10, 8))

# Iterate over the labels of the full dataset (not just the subset) so that
# every class gets a legend entry even if none of its rows were sampled.
for digit in np.unique(y):
    sel = y_plot == digit
    ax.scatter(X_plot[sel, 0], X_plot[sel, 1], s=10, alpha=0.6, label=str(digit))

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title(f'PCA Projection of MNIST (PC1: {explained_variance[0]:.2f}%, PC2: {explained_variance[1]:.2f}%)')
ax.legend(title='Digit', markerscale=2)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()