# Singular Value Decomposition / PCA

In [1]:
import numpy as np

In [2]:
# Build a small, noisy 3-feature toy dataset; the third feature is a weighted
# mix of the first two plus noise.
np.random.seed(4)   # fixed seed so every run draws the same samples
m = 60              # number of samples
w1, w2 = 0.1, 0.3   # weights tying the third feature to the first two
noise = 0.1         # scale of the Gaussian jitter

# Random angles; every feature below is derived from them.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# The jitter vectors are drawn in feature order so the random stream is
# consumed in a fixed sequence.
x0 = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
x1 = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

X = np.column_stack((x0, x1, x2))  # shape (m, 3)

In [3]:
# Center the data, factor it with SVD and read off the principal components.
X_centered = X - X.mean(axis=0)  # subtract the per-feature mean
U, s, Vt = np.linalg.svd(X_centered)
# Row i of Vt is column i of Vt.T, i.e. the i-th principal direction.
c1, c2 = Vt[0], Vt[1]  # PC1, PC2

In [4]:
# Project the centered data onto the plane spanned by the first two principal components.
W2 = Vt[:2].T  # first two principal directions as columns
X2D = X_centered @ W2

In [5]:
# Same 2-D projection using scikit-learn; PCA centers the data itself, so the raw X is passed in.
# NOTE: this overwrites the X2D computed manually from the SVD.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [6]:
# Fraction of the dataset's variance captured by each of the kept principal components
pca.explained_variance_ratio_

array([0.84248607, 0.14631839])

In [7]:
# Fetch the MNIST dataset from OpenML as arrays (as_frame=False) rather than a DataFrame
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Cast the labels to uint8 — presumably they arrive as strings; verify
mnist.target = mnist.target.astype(np.uint8)

In [8]:
# Split MNIST into train and test sets (train_test_split's default test_size is used).
# NOTE: from here on `X` is the MNIST pixel matrix, no longer the toy dataset.
from sklearn.model_selection import train_test_split
X = mnist["data"]
y = mnist["target"]
# A fixed random_state makes the shuffle, and everything fitted on this split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# Fit a full PCA, then count how many leading components are needed to reach
# 95% cumulative explained variance.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()
# argmax on a boolean array gives the index of the first True; +1 turns it into a count
d = (cumsum >= 0.95).argmax() + 1

In [10]:
# Alternatively, pass a float between 0 and 1 as n_components: PCA then keeps as many
# components as needed to explain that ratio of the variance
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [11]:
# PCA can be used to compress and then decompress a dataset. The compression is lossy:
# the variance along the dropped components is discarded, so X_recovered only approximates X_train.
# NOTE(review): 154 is presumably the component count found for 95% variance — confirm for this split.
pca = PCA(n_components=154)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

# Incremental PCA

In [12]:
from sklearn.decomposition import IncrementalPCA

In [13]:
# Incremental PCA fed one mini-batch at a time. Here the whole dataset is still held
# in memory; it is only split into chunks to demonstrate partial_fit.
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)

# Project the full training set with the incrementally fitted model
X_reduced = inc_pca.transform(X_train)

In [14]:
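# Incremental PCA reading batches straight from disk: the array must first be saved as a raw binary file.
# Before enabling this cell, define `filename` and set m, n to the stored array's (rows, columns);
# `m` is still the toy-data size (60) at this point and `n` is undefined.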
# X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=(m, n))
# batch_size = m // n_batches
# inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
# inc_pca.fit(X_mm)

# Kernel PCA

In [15]:
from sklearn.decomposition import KernelPCA

In [16]:
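# This cell crashes the Jupyter kernel, most likely by running out of memory: X is the full
# MNIST matrix at this point, and kernel PCA builds an n_samples x n_samples kernel matrix.
# Run it on a small dataset or a subsample of X instead.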
# rbf_pca = KernelPCA(n_components=2, kernel='rbf', gamma=0.04)
# X_reduced = rbf_pca.fit_transform(X)

# Exploring Best Hyperparameters

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [19]:
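# Make a pipeline that applies kPCA and then classifies with logistic regression.
# Disabled: fitting kPCA on the full MNIST X crashes the kernel (most likely out of memory,
# since kernel PCA needs an n_samples x n_samples kernel matrix); try a small dataset or a subsample.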
# clf = Pipeline([
#     ('kpca', KernelPCA(n_components=2)),
#     ('log_reg', LogisticRegression())
#     ])

# param_grid = [{
#     'kpca__gamma': np.linspace(0.03, 0.05, 10),
#     'kpca__kernel': ['rbf', 'sigmoid']
# }]

# grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3)
# grid_search.fit(X, y)
# print(grid_search.best_params_)

: 

: 

# LLE

In [2]:
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.datasets import make_swiss_roll

In [3]:
# Generate a noisy swiss-roll dataset with a fixed seed; X holds the points and
# t the second array returned by make_swiss_roll (presumably each point's position along the roll — verify)
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)

In [4]:
# Reduce the swiss roll to 2-D with Locally Linear Embedding, using 10 neighbors per point.
# random_state pins the eigen-solver's random initialisation so repeated runs give the same embedding.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)
X_reduced = lle.fit_transform(X)