**Redução de Dimensionalidade**

**PCA**

Construção de um dataset 3D

In [1]:
import numpy as np

# Build a small, noisy 3D dataset whose points lie close to a 2D plane.
np.random.seed(4)  # fixed seed so the sample is reproducible
m = 60             # number of samples
w1, w2 = 0.1, 0.3  # weights tying the third feature to the first two
noise = 0.1        # scale of the Gaussian noise

# NOTE: the random draws below must stay in this exact order (angles first,
# then one noise vector per feature) so the generated values do not change.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
x0 = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
x1 = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
# Third feature: linear combination of the first two, plus noise.
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

X = np.column_stack((x0, x1, x2))  # shape (m, 3)


**Componentes Principais**

In [2]:
# PCA assumes data centered on the origin, so subtract the per-feature mean.
X_centered = X - X.mean(axis=0)

# SVD of the centered data: the rows of Vt are the principal axes.
U, s, Vt = np.linalg.svd(X_centered)

# Row i of Vt is column i of Vt.T, i.e. the (i+1)-th principal component.
c1, c2 = Vt[0], Vt[1]

**c1 e c2** correspondem às duas primeiras componentes principais: c1 é o eixo (vetor unitário) ao longo do qual os dados apresentam a maior variância, e c2 é o eixo ortogonal a c1 que captura a segunda maior variância.

In [3]:
# Display the first two principal components as a tuple of 3-element vectors.
c1, c2

(array([0.93636116, 0.29854881, 0.18465208]),
 array([-0.34027485,  0.90119108,  0.2684542 ]))

**Projetando para d dimensões**

In [4]:
# Projection matrix: the first two principal axes as columns, shape (3, 2).
W2 = Vt[:2].T

# Project the centered samples onto the plane spanned by those two axes.
X2D = np.dot(X_centered, W2)

In [5]:
# Inspect the 2D projection obtained from the SVD (one row per sample).
X2D

array([[-1.26203346, -0.42067648],
       [ 0.08001485,  0.35272239],
       [-1.17545763, -0.36085729],
       [-0.89305601,  0.30862856],
       [-0.73016287,  0.25404049],
       [ 1.10436914, -0.20204953],
       [-1.27265808, -0.46781247],
       [ 0.44933007, -0.67736663],
       [ 1.09356195,  0.04467792],
       [ 0.66177325,  0.28651264],
       [-1.04466138,  0.11244353],
       [ 1.05932502, -0.31189109],
       [-1.13761426, -0.14576655],
       [-1.16044117, -0.36481599],
       [ 1.00167625, -0.39422008],
       [-0.2750406 ,  0.34391089],
       [ 0.45624787, -0.69707573],
       [ 0.79706574,  0.26870969],
       [ 0.66924929, -0.65520024],
       [-1.30679728, -0.37671343],
       [ 0.6626586 ,  0.32706423],
       [-1.25387588, -0.56043928],
       [-1.04046987,  0.08727672],
       [-1.26047729, -0.1571074 ],
       [ 1.09786649, -0.38643428],
       [ 0.7130973 , -0.64941523],
       [-0.17786909,  0.43609071],
       [ 1.02975735, -0.33747452],
       [-0.94552283,

**Usando Scikit-Learn**

In [6]:
from sklearn.decomposition import PCA

# Same 2D reduction using scikit-learn. PCA centers the data internally,
# so the raw X is passed instead of X_centered.
# NOTE: this rebinds X2D, replacing the manual SVD projection computed earlier.
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

In [7]:
# Principal axes, one per row. The signs can be flipped relative to c1/c2 from
# the manual SVD: a principal axis is only defined up to its sign.
pca.components_

array([[-0.93636116, -0.29854881, -0.18465208],
       [ 0.34027485, -0.90119108, -0.2684542 ]])

In [8]:
# First principal component: row 0 of components_ (same as column 0 of its transpose).
pca.components_[0]

array([-0.93636116, -0.29854881, -0.18465208])

In [9]:
# Inspect the 2D projection produced by scikit-learn's PCA (one row per sample).
X2D

array([[ 1.26203346,  0.42067648],
       [-0.08001485, -0.35272239],
       [ 1.17545763,  0.36085729],
       [ 0.89305601, -0.30862856],
       [ 0.73016287, -0.25404049],
       [-1.10436914,  0.20204953],
       [ 1.27265808,  0.46781247],
       [-0.44933007,  0.67736663],
       [-1.09356195, -0.04467792],
       [-0.66177325, -0.28651264],
       [ 1.04466138, -0.11244353],
       [-1.05932502,  0.31189109],
       [ 1.13761426,  0.14576655],
       [ 1.16044117,  0.36481599],
       [-1.00167625,  0.39422008],
       [ 0.2750406 , -0.34391089],
       [-0.45624787,  0.69707573],
       [-0.79706574, -0.26870969],
       [-0.66924929,  0.65520024],
       [ 1.30679728,  0.37671343],
       [-0.6626586 , -0.32706423],
       [ 1.25387588,  0.56043928],
       [ 1.04046987, -0.08727672],
       [ 1.26047729,  0.1571074 ],
       [-1.09786649,  0.38643428],
       [-0.7130973 ,  0.64941523],
       [ 0.17786909, -0.43609071],
       [-1.02975735,  0.33747452],
       [ 0.94552283,

**Razão da variância explicada**

In [10]:
# Fraction of the dataset's variance that lies along each retained principal axis.
pca.explained_variance_ratio_

array([0.84248607, 0.14631839])

**Escolhendo o melhor número de dimensões**

In [11]:
from sklearn.datasets import fetch_openml

# Download MNIST from OpenML as NumPy arrays rather than a DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# Cast the labels to small unsigned integers.
mnist["target"] = mnist["target"].astype(np.uint8)


In [12]:
from sklearn.model_selection import train_test_split

# Features and labels of the MNIST dataset loaded above.
# NOTE: X is rebound here; it previously held the small 3D toy dataset.
X = mnist["data"]
y = mnist["target"]

# Fix the shuffle seed so the split -- and every PCA result computed from
# X_train -- is reproducible across kernel restarts. With no sizes given,
# scikit-learn's default split is 75% train / 25% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [13]:
# Fit a PCA that keeps every component, so the full variance spectrum is available.
pca = PCA().fit(X_train)

# Running total of explained variance as components are added one by one.
cumsum = pca.explained_variance_ratio_.cumsum()

# Smallest number of dimensions that preserves at least 95% of the variance:
# argmax gives the index of the first True, and +1 turns that index into a count.
d = (cumsum >= 0.95).argmax() + 1

In [14]:
# Inspect the cumulative explained variance (entry i covers the first i+1 components).
cumsum

array([0.09719832, 0.16875148, 0.23046024, 0.28447767, 0.33353621,
       0.37656401, 0.40934646, 0.43819275, 0.46567853, 0.48924485,
       0.51032629, 0.5307285 , 0.54778858, 0.56465048, 0.58041792,
       0.59534958, 0.60862878, 0.62147783, 0.63334578, 0.64479193,
       0.65545804, 0.66555448, 0.67514241, 0.68416896, 0.69296211,
       0.70131513, 0.70939893, 0.71727437, 0.72468736, 0.73157212,
       0.73812949, 0.74459959, 0.75058197, 0.75643475, 0.76210811,
       0.7675608 , 0.77261474, 0.77750626, 0.78230885, 0.78696883,
       0.79152081, 0.79597374, 0.80014325, 0.80411726, 0.80795962,
       0.81171266, 0.81533146, 0.81882001, 0.8221978 , 0.82541301,
       0.82859763, 0.83168676, 0.83465474, 0.83752128, 0.84034773,
       0.84303834, 0.84571815, 0.84828932, 0.85083357, 0.85329543,
       0.85569464, 0.8580644 , 0.86035289, 0.86256245, 0.86468703,
       0.86673787, 0.86875875, 0.87071732, 0.87263867, 0.87451472,
       0.87637902, 0.87817437, 0.87993644, 0.88167564, 0.88332

**OU**, de forma mais direta, informando em `n_components` a fração de variância que se deseja preservar:

In [15]:
# A float between 0 and 1 asks PCA to keep the fewest components whose
# cumulative explained variance reaches that fraction (here 95%).
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)