## 1. Import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.datasets import mnist

## 2. Load MNIST data from Keras

In [None]:
# Load the MNIST training and test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten every image into a single row of pixel values, then stack the two
# splits row-wise so the following steps work on one combined dataset.
train_rows = x_train.reshape(x_train.shape[0], -1)
test_rows = x_test.reshape(x_test.shape[0], -1)
X = np.concatenate((train_rows, test_rows), axis=0)
y = np.concatenate((y_train, y_test))

print("Data shape:", X.shape)   # Expect (70000, 784)
print("Labels shape:", y.shape)

## 3. Standardize the data

In [None]:
# Learn the per-feature scaling statistics from the combined dataset, then
# apply the fitted scaler to that same data. The scaler object is kept so the
# identical transformation can be reused afterwards.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## 4. Apply PCA

In [None]:
# Reduce the standardized data to its first two principal components.
# random_state is fixed so repeated runs give the same projection.
pca = PCA(
    n_components=2,
    random_state=42,
)
X_pca = pca.fit_transform(X_scaled)

# Convert each component's variance ratio into a percentage for reporting.
explained_variance = 100 * pca.explained_variance_ratio_
print(f"Explained variance by PC1: {explained_variance[0]:.2f}%")
print(f"Explained variance by PC2: {explained_variance[1]:.2f}%")
print(f"Total variance (PC1 + PC2): {explained_variance.sum():.2f}%")

## 5. Scatter plot with colored labels

In [None]:
# Plot a reproducible random subset of the projected points, one colour per
# digit class.
sample_size = 10000
rng = np.random.RandomState(42)

# Clamp the request to the number of available rows: choice(...,
# replace=False) raises ValueError when asked for more samples than the
# population contains.
n_points = min(sample_size, X_pca.shape[0])
idx = rng.choice(X_pca.shape[0], n_points, replace=False)

# Select the sampled coordinates and labels once. Indexing with an integer
# array produces a copy, so doing this inside the loop would repeat the same
# copy for every digit.
sampled_points = X_pca[idx]
sampled_labels = y[idx]

plt.figure(figsize=(10, 8))
for digit in range(10):
    # Boolean mask over the sampled rows that belong to the current digit.
    sel = sampled_labels == digit
    plt.scatter(
        sampled_points[sel, 0],
        sampled_points[sel, 1],
        s=10,
        alpha=0.6,
        label=str(digit),
    )

plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title(f"PCA Projection of MNIST (Variance: {explained_variance[0]:.2f}%, {explained_variance[1]:.2f}%)")
plt.legend(title="Digit", markerscale=2)
plt.grid(alpha=0.3)
plt.show()