In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

## Fake datasets

In [36]:
X = np.zeros((100, 5))  # initialize 3D dataset
np.random.seed(42)
angles = (np.random.rand(100) ** 3 + 0.5) * 2 * np.pi  # uneven distribution
X[:, 0], X[:, 1] = np.cos(angles), np.sin(angles) * 0.5  # oval
X += 0.28 * np.random.randn(100, 5)  # add more noise
X += [0.3, 0.2, 0, 0.2, 0.3]  # shift a bit

In [12]:
X

array([[-9.46002697e-01, -1.62079377e-01,  0.00000000e+00],
       [-6.34079662e-01,  3.86633865e-01,  0.00000000e+00],
       [ 7.79303347e-01, -3.13323432e-01,  0.00000000e+00],
       [-2.20874743e-01, -4.87651091e-01,  0.00000000e+00],
       [-9.99715313e-01, -1.19299363e-02,  0.00000000e+00],
       [-9.99715577e-01, -1.19244052e-02,  0.00000000e+00],
       [-9.99999242e-01, -6.15617014e-04,  0.00000000e+00],
       [ 5.88505404e-01,  4.04246642e-01,  0.00000000e+00],
       [-2.04593045e-01, -4.89423560e-01,  0.00000000e+00],
       [ 6.12927235e-01, -3.95069679e-01,  0.00000000e+00],
       [-9.99999998e-01, -2.74012565e-05,  0.00000000e+00],
       [-8.52372700e-01,  2.61467388e-01,  0.00000000e+00],
       [ 8.85669088e-01,  2.32158494e-01,  0.00000000e+00],
       [-9.98191249e-01, -3.00592334e-02,  0.00000000e+00],
       [-9.99286820e-01, -1.88802250e-02,  0.00000000e+00],
       [-9.99248835e-01, -1.93763131e-02,  0.00000000e+00],
       [-9.84386028e-01, -8.80115733e-02

In [37]:
X.shape

(100, 5)

# PCA

In [40]:
X_centered = X - X.mean(axis=0)

In [41]:
U, sigma, Vt = np.linalg.svd(X_centered)

In [47]:
X_centered @ Vt[:3].T

array([[-4.53072199e-01, -1.71836815e-01, -5.64722849e-01],
       [-7.05798496e-02,  8.34514096e-01, -2.69022768e-01],
       [ 1.43711614e+00, -1.58990712e-01,  3.06259823e-01],
       [ 4.60426226e-01, -5.00877946e-01, -7.77772789e-02],
       [-4.79769879e-01,  2.50588239e-01, -1.67609776e-01],
       [-6.92418120e-01, -5.62114235e-02, -2.54856930e-02],
       [-3.82560858e-02,  2.77543075e-01, -1.25011498e-01],
       [ 1.05875291e+00,  5.91013847e-01, -6.99911727e-02],
       [ 1.95151764e-01, -6.28622899e-01,  2.77525751e-01],
       [ 7.60822392e-01, -1.06696503e-01,  4.04245794e-01],
       [-8.38545379e-01,  5.52381464e-02, -2.66229382e-01],
       [-3.71430535e-01, -2.01234794e-02, -2.04400647e-01],
       [ 1.13443881e+00,  1.97119789e-01, -2.64502233e-01],
       [-2.14195310e-01, -3.71740769e-01,  1.29222263e-01],
       [-9.19403012e-01, -2.14774540e-01,  3.61608274e-02],
       [-4.79080614e-01, -1.55588675e-02,  2.03393018e-02],
       [-6.09577169e-02,  1.62614828e-01

## Using Scikit-Learn

In [48]:
from sklearn.decomposition import PCA

In [49]:
pca = PCA(n_components=3)

In [50]:
X_transformed = pca.fit_transform(X)

In [52]:
X_transformed.shape

(100, 3)

In [53]:
pca.explained_variance_

array([0.56495765, 0.14848473, 0.08020008])

In [54]:
pca.explained_variance_ratio_

array([0.61595292, 0.16188754, 0.08743925])

In [55]:
from sklearn.datasets import fetch_openml

In [56]:
mnist = fetch_openml('mnist_784', as_frame=False)

  warn(


In [58]:
X_train, y_train = mnist.data[:60000], mnist.target[:60000]

In [59]:
X_test, y_test = mnist.data[60000:], mnist.target[60000:]

In [60]:
pca = PCA()

In [61]:
pca.fit(X_train)

# Sklearn Pipeline

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

In [70]:
clf = make_pipeline(PCA(random_state=42), RandomForestClassifier(random_state=42))

In [71]:
param_dist = {
    'pca__n_components': np.arange(10, 80),
    'randomforestclassifier__n_estimators': np.arange(50, 5000)
}

In [72]:
rnd_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=10, cv=3, random_state=42)

In [None]:
rnd_search.fit(X_train[:1000], y_train[:1000])

In [None]:
print(rnd_search.best_params_)