# MNIST Dimensionality Reduction

In [None]:
!pip install -r requirements.txt

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

## Load MNIST Dataset

In [None]:
# Fetch MNIST ("mnist_784", version 1) from OpenML. as_frame=True is requested
# explicitly because later cells rely on pandas objects (X.columns, y.head(),
# X.to_numpy()).
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
# len() of the returned Bunch counts its fields, not the number of samples.
len(mnist)

In [None]:
# Feature table and digit labels; the raw targets are cast to integers.
X = mnist.data
y = mnist.target.astype(int)

# Shuffle features and labels together with a fixed seed for repeatability.
X, y = shuffle(X, y, random_state=42)

In [None]:
# Sorted unique label values; reused later as the confusion-matrix class names.
class_labels = np.unique(y)
class_labels

In [None]:
# Show the column labels of the feature table.
X.columns

In [None]:
# Preview the first few target labels.
y.head()

## Visualise MNIST

In [None]:
def plot_mnist_samples(X, y, num_samples=10, 
                       cmap='Greys', figsize=(10,3)):
    """Show the first ``num_samples`` rows of ``X`` as 28x28 images in one row.

    Parameters
    ----------
    X : array-like of shape (n_samples, 784)
        Flattened images. Anything ``np.asarray`` accepts is allowed, so a
        pandas DataFrame can be passed directly.
    y : array-like
        Labels belonging to ``X``. Accepted for call-site symmetry; it is not
        used when drawing.
    num_samples : int, default=10
        Number of images requested. Capped at the number of rows in ``X``.
    cmap : str, default='Greys'
        Colormap forwarded to ``imshow``.
    figsize : tuple, default=(10, 3)
        Figure size forwarded to ``plt.subplots``.

    Raises
    ------
    ValueError
        If there is nothing to draw (``num_samples < 1`` or ``X`` is empty).
    """
    images = np.asarray(X)
    count = min(num_samples, len(images))
    if count < 1:
        raise ValueError(
            "num_samples must be at least 1 and X must contain at least one row"
        )

    # squeeze=False always returns a 2-D array of axes, so a single requested
    # sample is handled the same way as several.
    fig, axes = plt.subplots(1, count, figsize=figsize, squeeze=False)
    # zip stops after `count` axes, i.e. only the first `count` rows are drawn.
    for ax, image in zip(axes[0], images):
        ax.imshow(image.reshape(28, 28), cmap=cmap)
        ax.axis('off')
    plt.show()

# Pass the features as a NumPy array so each row can be reshaped to 28x28.
plot_mnist_samples(X.to_numpy(), y)

## Helper function for 2D visualisation

In [None]:
def plot_2D(X_proj, y, 
            title, 
            marker_size=5, 
            col_pal='tab10',
            alpha=0.7, 
            marker_scale=2, **kwargs):
    """Scatter-plot a 2-D projection, colouring every point by its label.

    Parameters
    ----------
    X_proj : 2-D array
        Projected data; column 0 is drawn on the x-axis, column 1 on the y-axis.
    y : array-like
        Labels used as the ``hue`` of the scatter plot.
    title : str
        Figure title.
    marker_size : number, default=5
        Passed to seaborn as ``s``.
    col_pal : str, default='tab10'
        Passed to seaborn as ``palette``.
    alpha : float, default=0.7
        Marker transparency.
    marker_scale : number, default=2
        ``markerscale`` used for the legend.
    **kwargs
        Extra options forwarded to ``sns.scatterplot``. The keys ``s``,
        ``alpha``, ``hue`` and ``palette`` are ignored because they are
        controlled by the named parameters above.
    """
    # Drop options that are set explicitly below; forwarding them as well would
    # raise a "multiple values for keyword argument" error.
    reserved = ('s', 'alpha', 'hue', 'palette')
    extra_options = {
        name: value for name, value in kwargs.items() if name not in reserved
    }

    plt.figure(figsize=(10, 6))
    x_coords = X_proj[:, 0]
    y_coords = X_proj[:, 1]
    sns.scatterplot(
        x=x_coords,
        y=y_coords,
        hue=y,
        palette=col_pal,
        s=marker_size,
        alpha=alpha,
        **extra_options,
    )

    plt.title(title)
    plt.legend(loc='best', markerscale=marker_scale)
    plt.show()


## Split data

In [None]:
from sklearn.model_selection import train_test_split
# test_size=0.8 holds out 80% of the samples and keeps only 20% for training.
# NOTE(review): confirm this ratio is intentional (e.g. to keep the t-SNE, UMAP
# and SVM fits fast) rather than a typo for 0.2.
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=0.8, 
                                                    random_state=42)

## Scale

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaling statistics on the training split only, then reuse them for
# the test split so the test data does not influence preprocessing.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## PCA for visualisation in 2D

In [None]:
# Project the scaled training data onto its first two principal components.
# random_state is fixed, as in the t-SNE and UMAP cells, so the projection is
# repeatable when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_train_scaled)
plot_2D(X_pca, 
        y_train, 
        title="PCA on MNIST")

## T-SNE Visualisation in 2D

In [None]:
# 2-D t-SNE embedding of the scaled training split, seeded for repeatability.
tsne = TSNE(n_components=2, 
            perplexity=25, 
            learning_rate=200, 
            random_state=42)
X_tsne = tsne.fit_transform(X_train_scaled)  
plot_2D(X_tsne, y_train, "t-SNE on MNIST")

## UMAP for visualisation in 2D

In [None]:
# NOTE(review): this rebinds the name `umap` (imported as the top-level package
# in the imports cell) to the `umap.umap_` submodule — confirm that is intended.
import umap.umap_ as umap
# 2-D UMAP embedding of the scaled training split, seeded for repeatability.
umap_reducer = umap.UMAP(n_components=2, 
                         random_state=42)
X_umap = umap_reducer.fit_transform(X_train_scaled)
plot_2D(X_umap, y_train, "UMAP on MNIST")

## Choose the number of PCA components by explained variance

In [None]:
def get_optimal_pca(X, variance_threshold=0.95, 
                    figsize=(10, 5), 
                    plot=True,
                    x_lab='No. of PCA Components',
                    y_lab='Cumulative Explained Variance',
                    plot_title='Optimal Explained Variance',
                    optimal_thresh_line_color='grey',
                    variance_line_color='black',
                    linestyle='--',
                    **pca_args):
    """Find the fewest PCA components whose cumulative variance meets a threshold.

    A PCA keeping every available component is fitted on ``X``; the cumulative
    explained-variance ratio is then scanned for the first component count that
    reaches ``variance_threshold``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to decompose.
    variance_threshold : float, default=0.95
        Target cumulative explained-variance ratio.
    figsize : tuple, default=(10, 5)
        Size of the diagnostic figure.
    plot : bool, default=True
        Whether to draw the cumulative-variance curve.
    x_lab, y_lab, plot_title : str
        Axis labels and title of the figure.
    optimal_thresh_line_color : str, default='grey'
        Colour of the vertical line marking the selected component count.
    variance_line_color : str, default='black'
        Colour of the horizontal line marking ``variance_threshold``.
    linestyle : str, default='--'
        Line style of the horizontal threshold line only; the curve and the
        vertical line are always dashed.
    **pca_args
        Extra keyword arguments forwarded to ``PCA``.

    Returns
    -------
    tuple
        ``(optimal_components, explained_variance_ratio, pca_transformed)``:
        the selected component count (int), the *cumulative* explained-variance
        ratio per component, and ``X`` transformed by the full PCA.
    """
    # n_components=None keeps every available component. Unlike passing
    # X.shape[1], it also works when there are fewer samples than features.
    pca = PCA(n_components=None, **pca_args)
    pca_transformed = pca.fit_transform(X)
    explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

    reached = explained_variance_ratio >= variance_threshold
    if reached.any():
        # argmax returns the index of the first True entry
        optimal_components = int(np.argmax(reached)) + 1
    else:
        # Threshold never reached (e.g. floating-point round-off just below
        # 1.0): keep every component rather than argmax's misleading index 0.
        optimal_components = len(explained_variance_ratio)
    
    # Plot results
    if plot:
        plt.figure(figsize=figsize)
        plt.plot(range(1, len(explained_variance_ratio) + 1), 
                 explained_variance_ratio, 
                 marker='o', 
                 linestyle='--', 
                 color='b')
        plt.axhline(y=variance_threshold, 
                    color=variance_line_color, 
                    linestyle=linestyle, 
                    label=f"{variance_threshold * 100}% Variance")
        plt.axvline(x=optimal_components, 
                    color=optimal_thresh_line_color, 
                    linestyle='--', 
                    label=f"Optimal Components: {optimal_components}")
        plt.xlabel(x_lab)
        plt.ylabel(y_lab)
        plt.title(plot_title)
        plt.legend()
        plt.grid()
        plt.show()
    
    return (optimal_components, explained_variance_ratio, 
            pca_transformed)



In [None]:
# Component count needed to explain 90% of the variance in the scaled training
# data; the transformed data returned as the third item is not kept.
opt_comp, explained_variance_ratio, _ = get_optimal_pca(
    X_train_scaled,
    variance_threshold=0.9,
)

## Fit PCA using optimal components

In [None]:
# Refit PCA with the selected number of components on the training split and
# apply the same projection to the test split. random_state is fixed so the
# features fed to the SVM are repeatable if a randomized SVD solver is used.
pca_optim = PCA(n_components=opt_comp, random_state=42)
X_train_pca = pca_optim.fit_transform(X_train_scaled)
X_test_pca = pca_optim.transform(X_test_scaled)

## Train a Support Vector Machine with a Radial Basis Function (RBF) kernel

In [None]:
from sklearn.svm import SVC

## Train model on PCA reduced data

In [None]:
# RBF-kernel SVM with fixed hyperparameters (C=10, gamma=0.01), trained on the
# PCA-reduced training features.
svm_model = SVC(kernel="rbf", C=10, gamma=0.01)
svm_model.fit(X_train_pca, y_train)

## Predict on test set

In [None]:
# Predict digit labels for the PCA-reduced test features.
y_pred = svm_model.predict(X_test_pca)

## Visualize results

In [None]:
from modelviz.confusion_matrix import plot_confusion_matrix
# Pass the label order explicitly so the matrix rows/columns always line up with
# the `classes` handed to the plot, even if a digit were missing from both the
# test labels and the predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
plot_confusion_matrix(cm=cm, classes=class_labels,
                      model_name='MNIST Digit Accuracy', 
                      label_fontsize=8, annot_fontsize=8,
                      cell_fontsize=8, table_fontsize=8, 
                      cmap='Greys',
                      proportions_color='grey')