In [1]:
import numpy as np
import pandas as pd  

#### 特征值分解

In [5]:
A = np.mat([
    [1., -3.,  3.],
    [3., -5.,  3.],
    [6., -6.,  4.]
])

In [6]:
np.linalg.eig(A)

(array([ 4.+0.00000000e+00j, -2.+1.10465796e-15j, -2.-1.10465796e-15j]),
 matrix([[-0.40824829+0.j        ,  0.24400118-0.40702229j,
           0.24400118+0.40702229j],
         [-0.40824829+0.j        , -0.41621909-0.40702229j,
          -0.41621909+0.40702229j],
         [-0.81649658+0.j        , -0.66022027+0.j        ,
          -0.66022027-0.j        ]]))

In [7]:
Q = np.mat([
    [1, -1, 1],
    [1, 0, 1],
    [0, 1, 2]
])

Sg = np.mat([
    [-2, 0, 0],
    [0, -2, 0],
    [0, 0, 4]
])

In [8]:
(Q @ Sg) @ np.linalg.inv(Q)

matrix([[ 1., -3.,  3.],
        [ 3., -5.,  3.],
        [ 6., -6.,  4.]])

#### SVD分解

In [165]:
A = np.mat([
    [0, 1],
    [1, 1],
    [1, 0]
])


### 手动计算U，SIGMA和V，再输入
U = np.mat([ 
    [1/np.sqrt(6),   1/np.sqrt(2),      1/np.sqrt(3)],
    [2/np.sqrt(6),       0,              -1/np.sqrt(3)],
    [1/np.sqrt(6),     -1/np.sqrt(2),      1/np.sqrt(3)]  
]) 

SIGMA = np.mat([
    [np.sqrt(3), 0],
    [0,          1],
    [0,          0]
]) 

V = np.mat([ 
    [1/np.sqrt(2), -1/np.sqrt(2)], 
    [1/np.sqrt(2), 1/np.sqrt(2)] 
])

In [166]:
(U @ SIGMA) @ V.T

matrix([[8.86511593e-17, 1.00000000e+00],
        [1.00000000e+00, 1.00000000e+00],
        [1.00000000e+00, 8.86511593e-17]])

In [168]:
### 软件计算的结果np.linalg.svd()
a, b, c = np.linalg.svd(A)
np.linalg.svd(A)

(matrix([[-4.08248290e-01,  7.07106781e-01,  5.77350269e-01],
         [-8.16496581e-01,  2.64811510e-17, -5.77350269e-01],
         [-4.08248290e-01, -7.07106781e-01,  5.77350269e-01]]),
 array([1.73205081, 1.        ]),
 matrix([[-0.70710678, -0.70710678],
         [-0.70710678,  0.70710678]]))

In [169]:
np.mat(a) @ np.mat([
    [1.73205081, 0],
    [0,          1],
    [0,          0]
]) @ np.mat(c)

matrix([[7.01804828e-10, 1.00000000e+00],
        [1.00000000e+00, 1.00000000e+00],
        [1.00000000e+00, 7.01804435e-10]])

In [152]:
# A = np.mat([
#     [4, 11, 14],
#     [8, 7, -2]
# ])

# a, b, c = np.linalg.svd(A)
# np.linalg.svd(A)

# np.mat(a) @ np.mat([
#     [18.97366596, 0, 0],
#     [0,          9.48683298, 0]
# ]) @ np.mat(c)

#### 协方差阵计算

In [2]:
np.random.seed(0)
X = np.random.randint(0, 100, size = (4, 3))
X

array([[44, 47, 64],
       [67, 67,  9],
       [83, 21, 36],
       [87, 70, 88]])

In [10]:
# 用列表示向量，计算协方差阵
np.cov(X, rowvar=False)

array([[ 380.91666667,   -3.75      ,   74.58333333],
       [  -3.75      ,  510.91666667,  143.58333333],
       [  74.58333333,  143.58333333, 1171.58333333]])

In [11]:
# 对数据进行去中心化，利用np.dot(X.T, X) / (len(X)-1) 也可以得到协方差阵
X = X - X.mean(axis = 0)  # axis = 0表示对每一列求均值，去中心化

In [12]:
np.dot(X.T, X) / (len(X)-1)

array([[ 380.91666667,   -3.75      ,   74.58333333],
       [  -3.75      ,  510.91666667,  143.58333333],
       [  74.58333333,  143.58333333, 1171.58333333]])

In [13]:
# 此时的X已经去中心化
np.cov(X, rowvar=False)

array([[ 380.91666667,   -3.75      ,   74.58333333],
       [  -3.75      ,  510.91666667,  143.58333333],
       [  74.58333333,  143.58333333, 1171.58333333]])

#### 手动计算PCA(特征值分解)

In [3]:
from sklearn.datasets import load_iris

df = pd.DataFrame(load_iris().data, columns = load_iris().feature_names)
# df = df.assign(target = load_iris().target)

# df_mean = df.apply(lambda x: (x - x.mean(axis = 0)) / x.std()) 
df_mean = df.apply(lambda x: x - x.mean(axis = 0))   # 按列减去均值

In [4]:
df_mean_cov_matrix = np.dot(df_mean.T, df_mean)/(len(df_mean)-1)  ## 算出协方差矩阵
df_mean_cov_matrix

array([[ 0.68569351, -0.042434  ,  1.27431544,  0.51627069],
       [-0.042434  ,  0.18997942, -0.32965638, -0.12163937],
       [ 1.27431544, -0.32965638,  3.11627785,  1.2956094 ],
       [ 0.51627069, -0.12163937,  1.2956094 ,  0.58100626]])

In [11]:
eig_value, eig_vectors = np.linalg.eig(df_mean_cov_matrix)   # 协方差矩阵的特征值和特征向量
print(eig_value)
print(eig_vectors)

[4.22824171 0.24267075 0.0782095  0.02383509]
[[ 0.36138659 -0.65658877 -0.58202985  0.31548719]
 [-0.08452251 -0.73016143  0.59791083 -0.3197231 ]
 [ 0.85667061  0.17337266  0.07623608 -0.47983899]
 [ 0.3582892   0.07548102  0.54583143  0.75365743]]


In [3]:
# 0.58202985**2 + 0.59791083**2 + 0.07623608**2 + 0.54583143**2

In [36]:
k = 2
explained_variance = eig_value[:k]  # 解释性方差，即特征值
components = eig_vectors[:,:k]      # 特征值对应的特征向量
explained_variance_ratio = [eig_value[i] / sum(eig_value) for i in range(k)]   # 方差贡献率

print("降维后的各主成分的方差值：", explained_variance)
print("降维后的各主成分方差的贡献率：", explained_variance_ratio)

降维后的各主成分的方差值： [4.22824171 0.24267075]
降维后的各主成分方差的贡献率： [0.924618723201727, 0.05306648311706795]


In [66]:
np.dot(df_mean, components)  # 降维后的数据

array([[-2.68412563, -0.31939725],
       [-2.71414169,  0.17700123],
       [-2.88899057,  0.14494943],
       [-2.74534286,  0.31829898],
       [-2.72871654, -0.32675451],
       [-2.28085963, -0.74133045],
       [-2.82053775,  0.08946138],
       [-2.62614497, -0.16338496],
       [-2.88638273,  0.57831175],
       [-2.6727558 ,  0.11377425],
       [-2.50694709, -0.6450689 ],
       [-2.61275523, -0.01472994],
       [-2.78610927,  0.235112  ],
       [-3.22380374,  0.51139459],
       [-2.64475039, -1.17876464],
       [-2.38603903, -1.33806233],
       [-2.62352788, -0.81067951],
       [-2.64829671, -0.31184914],
       [-2.19982032, -0.87283904],
       [-2.5879864 , -0.51356031],
       [-2.31025622, -0.39134594],
       [-2.54370523, -0.43299606],
       [-3.21593942, -0.13346807],
       [-2.30273318, -0.09870885],
       [-2.35575405,  0.03728186],
       [-2.50666891,  0.14601688],
       [-2.46882007, -0.13095149],
       [-2.56231991, -0.36771886],
       [-2.63953472,

#### PCA(SVD分解)

In [8]:
u,sigma,v = np.linalg.svd(df_mean_cov_matrix)  # 对协方差矩阵SVD分解

np.dot(df_mean, u[:, :2]) # 奇异值分解，降维后的数据

array([[ 2.68412563, -0.31939725],
       [ 2.71414169,  0.17700123],
       [ 2.88899057,  0.14494943],
       [ 2.74534286,  0.31829898],
       [ 2.72871654, -0.32675451],
       [ 2.28085963, -0.74133045],
       [ 2.82053775,  0.08946138],
       [ 2.62614497, -0.16338496],
       [ 2.88638273,  0.57831175],
       [ 2.6727558 ,  0.11377425],
       [ 2.50694709, -0.6450689 ],
       [ 2.61275523, -0.01472994],
       [ 2.78610927,  0.235112  ],
       [ 3.22380374,  0.51139459],
       [ 2.64475039, -1.17876464],
       [ 2.38603903, -1.33806233],
       [ 2.62352788, -0.81067951],
       [ 2.64829671, -0.31184914],
       [ 2.19982032, -0.87283904],
       [ 2.5879864 , -0.51356031],
       [ 2.31025622, -0.39134594],
       [ 2.54370523, -0.43299606],
       [ 3.21593942, -0.13346807],
       [ 2.30273318, -0.09870885],
       [ 2.35575405,  0.03728186],
       [ 2.50666891,  0.14601688],
       [ 2.46882007, -0.13095149],
       [ 2.56231991, -0.36771886],
       [ 2.63953472,

In [6]:
u

array([[-0.36138659, -0.65658877,  0.58202985,  0.31548719],
       [ 0.08452251, -0.73016143, -0.59791083, -0.3197231 ],
       [-0.85667061,  0.17337266, -0.07623608, -0.47983899],
       [-0.3582892 ,  0.07548102, -0.54583143,  0.75365743]])

In [7]:
v

array([[-0.36138659,  0.08452251, -0.85667061, -0.3582892 ],
       [-0.65658877, -0.73016143,  0.17337266,  0.07548102],
       [ 0.58202985, -0.59791083, -0.07623608, -0.54583143],
       [ 0.31548719, -0.3197231 , -0.47983899,  0.75365743]])

In [9]:
sigma

array([4.22824171, 0.24267075, 0.0782095 , 0.02383509])

#### 调用sklearn接口计算PCA

In [5]:
import numpy as np
import pandas as pd  
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
df = pd.DataFrame(load_iris().data, columns = load_iris().feature_names)

In [6]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [7]:
pca = PCA(n_components=2)  
pca.fit(df) 

print("降维后的各主成分的方差值：", pca.explained_variance_)
print("降维后的各主成分方差的贡献率：", pca.explained_variance_ratio_)

降维后的各主成分的方差值： [4.22824171 0.24267075]
降维后的各主成分方差的贡献率： [0.92461872 0.05306648]


In [21]:
pca.components_

array([[ 0.36138659, -0.08452251,  0.85667061,  0.3582892 ],
       [ 0.65658877,  0.73016143, -0.17337266, -0.07548102]])

In [27]:
# 利用主成分模型将原始数据transform为降维后的数据
data_new = pca.transform(df)
data_new

array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898],
       [-2.72871654,  0.32675451],
       [-2.28085963,  0.74133045],
       [-2.82053775, -0.08946138],
       [-2.62614497,  0.16338496],
       [-2.88638273, -0.57831175],
       [-2.6727558 , -0.11377425],
       [-2.50694709,  0.6450689 ],
       [-2.61275523,  0.01472994],
       [-2.78610927, -0.235112  ],
       [-3.22380374, -0.51139459],
       [-2.64475039,  1.17876464],
       [-2.38603903,  1.33806233],
       [-2.62352788,  0.81067951],
       [-2.64829671,  0.31184914],
       [-2.19982032,  0.87283904],
       [-2.5879864 ,  0.51356031],
       [-2.31025622,  0.39134594],
       [-2.54370523,  0.43299606],
       [-3.21593942,  0.13346807],
       [-2.30273318,  0.09870885],
       [-2.35575405, -0.03728186],
       [-2.50666891, -0.14601688],
       [-2.46882007,  0.13095149],
       [-2.56231991,  0.36771886],
       [-2.63953472,

In [26]:
# 手动计算降维后的数据：方式一
((df - pca.mean_) @ pca.components_.T).values

array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898],
       [-2.72871654,  0.32675451],
       [-2.28085963,  0.74133045],
       [-2.82053775, -0.08946138],
       [-2.62614497,  0.16338496],
       [-2.88638273, -0.57831175],
       [-2.6727558 , -0.11377425],
       [-2.50694709,  0.6450689 ],
       [-2.61275523,  0.01472994],
       [-2.78610927, -0.235112  ],
       [-3.22380374, -0.51139459],
       [-2.64475039,  1.17876464],
       [-2.38603903,  1.33806233],
       [-2.62352788,  0.81067951],
       [-2.64829671,  0.31184914],
       [-2.19982032,  0.87283904],
       [-2.5879864 ,  0.51356031],
       [-2.31025622,  0.39134594],
       [-2.54370523,  0.43299606],
       [-3.21593942,  0.13346807],
       [-2.30273318,  0.09870885],
       [-2.35575405, -0.03728186],
       [-2.50666891, -0.14601688],
       [-2.46882007,  0.13095149],
       [-2.56231991,  0.36771886],
       [-2.63953472,

In [30]:
# 手动计算降维后的数据：方式二
(       (pca.components_ @ (df - pca.mean_).T).T     ).values

array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898],
       [-2.72871654,  0.32675451],
       [-2.28085963,  0.74133045],
       [-2.82053775, -0.08946138],
       [-2.62614497,  0.16338496],
       [-2.88638273, -0.57831175],
       [-2.6727558 , -0.11377425],
       [-2.50694709,  0.6450689 ],
       [-2.61275523,  0.01472994],
       [-2.78610927, -0.235112  ],
       [-3.22380374, -0.51139459],
       [-2.64475039,  1.17876464],
       [-2.38603903,  1.33806233],
       [-2.62352788,  0.81067951],
       [-2.64829671,  0.31184914],
       [-2.19982032,  0.87283904],
       [-2.5879864 ,  0.51356031],
       [-2.31025622,  0.39134594],
       [-2.54370523,  0.43299606],
       [-3.21593942,  0.13346807],
       [-2.30273318,  0.09870885],
       [-2.35575405, -0.03728186],
       [-2.50666891, -0.14601688],
       [-2.46882007,  0.13095149],
       [-2.56231991,  0.36771886],
       [-2.63953472,