# 降维

## 什么是PCA里的可解释方差

参考 https://ro-che.info/articles/2017-12-11-pca-explained-variance

### 准备一组数据, 每一行是一个样本, 每一列是一个变量

In [2]:
import numpy as np

# Toy data set for the explained-variance walkthrough:
# 4 rows (samples) by 3 columns (variables).
a = np.array(
    [
        [-0.42298398, 1.7352837, 0.4119150],
        [-1.54987816, -0.2647112, -0.6524724],
        [-0.06442932, 2.0994707, 0.7603068],
        [0.27088135, 0.8633512, 0.1551048],
    ],
    dtype=np.float64,
)

# Bare expression so the notebook echoes the array as the cell output.
a

array([[-0.42298398,  1.7352837 ,  0.411915  ],
       [-1.54987816, -0.2647112 , -0.6524724 ],
       [-0.06442932,  2.0994707 ,  0.7603068 ],
       [ 0.27088135,  0.8633512 ,  0.1551048 ]])

In [40]:
# Per-variable sample variance, reduced along axis 0 (down each column).
# ndarray.var defaults to ddof=0 (divide by N, the population variance);
# ddof=1 divides by N - 1, which gives the sample variance.
sample_vars = a.var(axis=0, ddof=1)

# The total variance is the sum of the per-variable sample variances.
total_var = sample_vars.sum()

# Each variable's share of the total variance (its explained-variance ratio).
explained_vars = sample_vars / total_var

# NOTE(review): the first label reads "样本偏差" although the values printed
# are sample variances ("样本方差"); the text is kept unchanged so it still
# matches the recorded cell output.
print("每个变量的样本偏差: ", sample_vars)
print("样本总方差: ", total_var)
print("变量的可解释方差: ", explained_vars)

每个变量的样本偏差:  [0.62617148 1.10689586 0.36122036]
样本总方差:  2.0942876968141357
变量的可解释方差:  [0.29899019 0.52853095 0.17247886]


### 测试下sklearn里的PCA计算结果

In [54]:

from sklearn.decomposition import PCA
# Fit PCA with default arguments, i.e. without asking for fewer components,
# so the data is only re-expressed on the principal axes.
pca = PCA()

b = pca.fit_transform(a)
# In the recorded output the first two ratios add up to more than 99% of the
# total variance.
print("方差解释率: ", pca.explained_variance_ratio_)

# Sample variance (ddof=1) of every column of the transformed data.
b_vars = np.var(b, axis=0, ddof=1)
print("转换后的样本变量方差: ", b_vars)

# Keep the total under its own name instead of rebinding b_vars, so b_vars
# still holds the per-component array after this cell has run.
# In the recorded output this total equals the total variance of the
# original data up to floating-point rounding.
b_total_var = np.sum(b_vars)
print("转换后的总方差: ", b_total_var)


方差解释率:  [0.88235567 0.11674697 0.00089736]
转换后的样本变量方差:  [1.84790662 0.24450174 0.00187933]
转换后的总方差:  2.0942876968141375


## 核PCA Kernel PCA

In [62]:
# 准备瑞士卷数据集
from sklearn.datasets import make_swiss_roll
from sklearn.model_selection import train_test_split
import numpy as np

# Draw 1000 noisy swiss-roll points with a fixed seed for reproducibility.
# X holds the 3-D coordinates (shape printed below); t is the second array
# returned by make_swiss_roll.
X, t = make_swiss_roll(
    n_samples=1000,
    noise=0.2,
    random_state=42,
)

# Boolean labels for a two-class problem: True where t exceeds 6.9.
y = np.greater(t, 6.9)

print("X shape: ", X.shape)

X shape:  (1000, 3)


### 使用网格搜索(GridSearchCV)寻找逻辑回归分类任务下的最佳核PCA超参数

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import KernelPCA

# Two-stage model: project to 2 dimensions with kernel PCA, then classify
# the projected points with logistic regression.
pipeline = Pipeline(
    steps=[
        ("kpca", KernelPCA(n_components=2)),
        ("log_req", LogisticRegression(solver="lbfgs")),
    ]
)

# Keys use the "<step name>__<parameter>" form: "kpca__kernel" is the
# `kernel` argument of the pipeline step named "kpca", and "kpca__gamma"
# is that same step's `gamma` argument.
param_grid = [
    {
        "kpca__kernel": ["rbf", "sigmoid"],
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
    }
]

# Search over both kernels and 10 evenly spaced gamma values using
# 3-fold cross-validation.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3)
grid_search.fit(X, y)

# Bare expression so the notebook shows the best parameter combination.
grid_search.best_params_

{'kpca__gamma': 0.043333333333333335, 'kpca__kernel': 'rbf'}

### <font color=red>使用交叉验证的网格搜索来寻找可以最小化重建原像(pre-image)误差的核函数和超参数</font>

### LLE局部线性嵌入

In [81]:
from sklearn.manifold import LocallyLinearEmbedding

# Locally Linear Embedding configured with 10 neighbours per point and a
# 2-dimensional output.
lle = LocallyLinearEmbedding(
    n_neighbors=10,
    n_components=2,
)

# Fit on the swiss-roll points and keep the resulting embedding.
X_reduced = lle.fit_transform(X)

# Bare expression so the notebook shows the embedding's shape.
X_reduced.shape

(1000, 2)