In [None]:
import time

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE, MDS, LocallyLinearEmbedding
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

from utils import open_mnist_or_download_if_missing, shuffle_split

%matplotlib inline

RS = 33  # shared constant handed to shuffle_split below; presumably a random-state seed — confirm in utils

In [None]:
# Load MNIST through the project helper; later cells read its "data" and "target" entries.
# NOTE(review): per its name the helper downloads the dataset when it is missing — confirm in utils.
mnist = open_mnist_or_download_if_missing()

In [None]:
# Split features and targets into train and test parts using the project helper.
# NOTE(review): 60000 is presumably the training-set size and RS the shuffle seed —
# confirm against utils.shuffle_split.
X_train, y_train, X_test, y_test = shuffle_split(
    mnist["data"],
    mnist["target"],
    60000,
    RS,
)

In [None]:
def time_action(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return how long the call took.

    Parameters
    ----------
    func : callable
        The action to time.
    *args, **kwargs
        Forwarded unchanged to ``func``.

    Returns
    -------
    float
        Elapsed time of the call in seconds. The value returned by ``func``
        itself is discarded.

    Any exception raised by ``func`` propagates to the caller.
    """
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - start_time

def time_fit(clf, X, y):
    """Return the elapsed seconds of ``clf.fit(X, y)``.

    The measurement is delegated to ``time_action``, so whatever ``fit``
    returns is discarded and only the duration is handed back.
    """
    fit_method = clf.fit
    return time_action(fit_method, X, y)

In [None]:
# Baseline: a random forest on the raw features. The forest is seeded with RS so
# that the accuracy reported below is reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=RS)
time_fit(clf, X_train, y_train)

In [None]:
# Accuracy of the baseline forest on the test split.
baseline_predictions = clf.predict(X_test)
accuracy_score(y_test, baseline_predictions)

So the random forest classifier took about 25 seconds to fit, with a resulting accuracy of 0.9669.

Now let's see how it does when PCA first reduces the data while preserving 95% of the explained variance.

In [None]:
# PCA keeping 95% of the explained variance, followed by a random forest.
# The forest is seeded with RS so that the timing/accuracy comparison with the
# baseline is reproducible.
pipeline = make_pipeline(
    PCA(n_components=0.95),
    RandomForestClassifier(random_state=RS),
)
time_fit(pipeline, X_train, y_train)

In [None]:
# Accuracy of the PCA + forest pipeline on the test split.
pipeline_predictions = pipeline.predict(X_test)
accuracy_score(y_test, pipeline_predictions)

So it is way slower: it took more than 73 seconds, and the accuracy is a bit lower (expected, I guess) at 0.9446.

# 10

In [None]:
def reduce_and_plot(reductor, X=None, y=None):
    """Embed samples with ``reductor`` and scatter-plot the first two components.

    Parameters
    ----------
    reductor : object
        Any estimator exposing ``fit_transform`` whose output has at least
        two columns (only columns 0 and 1 are plotted).
    X : array-like, optional
        Samples to embed. Defaults to the notebook-level ``X_test``.
    y : array-like, optional
        Per-sample values used to colour the points. Defaults to the
        notebook-level ``y_test``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    X2d = reductor.fit_transform(X)

    fig, ax = plt.subplots(figsize=(15, 15))
    scatter = ax.scatter(
        X2d[:, 0],
        X2d[:, 1],
        c=y,
        norm=Normalize(),  # autoscaled to the range of ``y``
        cmap="gist_rainbow",
    )
    ax.legend(*scatter.legend_elements(), title="Class label")

    # Label the figure with the reducer's class name so that the plots produced
    # by different reducers can be told apart when skimming the notebook.
    ax.set_title(type(reductor).__name__)
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")

In [None]:
# t-SNE is stochastic; seed it with RS so the embedding is reproducible.
reduce_and_plot(TSNE(random_state=RS))

In [None]:
# Only the first two components are plotted, so there is no need to fit them all.
# random_state keeps the result reproducible if the randomized SVD solver is selected.
reduce_and_plot(PCA(n_components=2, random_state=RS))

In [None]:
# MDS starts from a random initialisation; seed it with RS for a reproducible embedding.
# NOTE(review): MDS works on a full pairwise-dissimilarity matrix, so this cell is
# expected to be slow and memory-hungry on the whole test split.
reduce_and_plot(MDS(random_state=RS))

In [None]:
# The ARPACK eigen-solver used by LLE is randomly initialised; seed it with RS.
reduce_and_plot(LocallyLinearEmbedding(random_state=RS))