In [138]:
import numpy as np

X = np.random.normal(0,1,8000).reshape(400,20)
X.shape

(400, 20)

In [139]:
X

array([[ 1.72937893,  0.49118789, -1.09216487, ...,  1.00044471,
         0.11658195, -0.70302656],
       [-0.99666307,  0.6509971 ,  0.19703353, ...,  0.72478987,
        -0.39096209,  1.63472503],
       [-1.9887786 ,  0.72914371, -0.13573506, ..., -0.13642947,
         0.42360556,  1.04582646],
       ...,
       [-0.493075  , -0.27626806,  0.03835059, ..., -0.88889574,
         2.14386985, -0.77115473],
       [-0.71987622, -1.50538083,  0.58524275, ..., -0.4838165 ,
         0.03953724,  1.39031354],
       [ 0.66957058, -0.82601234, -0.01203275, ...,  0.68105058,
        -0.38622869, -0.02631979]])

In [140]:
#from factor_analyzer import factor_analyzer
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer import FactorAnalyzer, calculate_kmo, calculate_bartlett_sphericity

#### 是否适合因子分析检验

* #### 其中，R和Q分别是所有特征皮尔逊相关系数和偏相关系数的平方和。一般情况下，当M（也被称为KMO值）大于0.5时，数据适合进行因子分析
![jupyter](./images/是否适合因子分析.jpg)
* #### 根据自由度和统计值，通过查询卡方分布表，可以近似地得到巴特利特检验的伴生概率。根据伴生概率p与显著性水平α之间的关系，可以确定特征D1, D2, ..., DN是否适合进行因子分析。当p<α时，应拒绝原假设，则相关矩阵C不是一个单位矩阵，特征之间存在明显的相关性。本论文的显著性水平设为0.05，这也是当前最常用的显著性水平。

In [141]:
kmo = calculate_kmo(X)
bartlett = calculate_bartlett_sphericity(X)
print("\n因子分析适用性检验:")
print('kmo:{},bartlett:{}'.format(kmo[1], bartlett[1]))


因子分析适用性检验:
kmo:0.4741006549428256,bartlett:0.2768371034395654


#### 各个因子的特征值以及方差贡献率

In [142]:
fa = FactorAnalyzer(rotation=None, n_factors=38, method='principal')
fa.fit(X)
fa_sd = fa.get_factor_variance()
fa_df = pd.DataFrame(
    {'特征值': fa_sd[0], '方差贡献率': fa_sd[1], '方差累计贡献率': fa_sd[2]})
fa_df

Unnamed: 0,特征值,方差贡献率,方差累计贡献率
0,1.370755,0.068538,0.068538
1,1.350549,0.067527,0.136065
2,1.279556,0.063978,0.200043
3,1.262715,0.063136,0.263179
4,1.212939,0.060647,0.323826
5,1.163046,0.058152,0.381978
6,1.125786,0.056289,0.438267
7,1.090793,0.05454,0.492807
8,1.028939,0.051447,0.544254
9,1.020325,0.051016,0.59527


#### 设置公因子个数，重新拟合

In [143]:
factors = 19
fa = FactorAnalyzer(rotation=None, n_factors=factors, method='principal')
fa.fit(X)

FactorAnalyzer(method='principal', n_factors=19, rotation=None,
               rotation_kwargs={})

#### 查看公因子提取度

In [144]:

print("\n公因子提取度:\n", fa.get_communalities())


公因子提取度:
 [0.99729989 0.99997369 0.98934164 0.99289846 0.98406998 0.95107525
 0.99163049 0.97378168 0.967725   0.91596735 0.97047339 0.97151143
 0.98383446 0.99180406 0.9863695  0.93353828 0.94076445 0.98087178
 0.83999126 0.99334123]


#### 查看因子载荷

In [147]:

print("\n因子载荷矩阵:\n", fa.loadings_)



因子载荷矩阵:
 [[ 0.25101437  0.04704944 -0.19987097 -0.16529887 -0.6013224  -0.10425556
   0.28588445 -0.04931833 -0.09170763 -0.26733269 -0.07620526  0.14487418
   0.31231645 -0.03221765  0.11328649  0.01257102  0.11769657  0.01687367
   0.41929036]
 [-0.09338032 -0.060374   -0.20930236  0.54143431  0.17775231 -0.02813583
  -0.18441515  0.02429193 -0.19015033  0.1921072  -0.30857667 -0.00669187
   0.26305335  0.35594936  0.23532849  0.36133191  0.106148   -0.09437144
   0.11543978]
 [ 0.32344245  0.35495133  0.30674278 -0.46805379  0.04005022  0.0615229
   0.03659241 -0.018929   -0.01625903  0.08904284  0.00869202 -0.02700148
   0.35037136  0.15761016  0.2310563   0.09390636  0.25993528 -0.1827936
  -0.34452918]
 [-0.37340597  0.04338557  0.36115036  0.23622629 -0.23781837  0.05471662
  -0.11730911 -0.04104094  0.10159439 -0.50777703 -0.12880384 -0.06218354
  -0.00236158 -0.16401177  0.37150939  0.13127498  0.03559787  0.28816133
  -0.1878199 ]
 [-0.18492506  0.21059403 -0.4683577  -0.157

In [148]:
fa.transform(X).shape

(400, 19)

#### 使用最大方差法旋转因子载荷矩阵

In [149]:

fa_rotate = FactorAnalyzer(rotation='varimax', n_factors=factors, method='principal')
fa_rotate.fit(X)

FactorAnalyzer(method='principal', n_factors=19, rotation='varimax',
               rotation_kwargs={})

#### 查看旋转后的因子载荷

In [150]:
print("\n旋转后的因子载荷矩阵:\n", fa_rotate.loadings_)


旋转后的因子载荷矩阵:
 [[-1.52622553e-02 -1.37131696e-02 -7.12117049e-02 -4.18139869e-02
   9.90172491e-01 -2.55277796e-02  1.97728013e-02  5.12334637e-02
  -9.40152281e-03  8.19278676e-03 -1.29439789e-02 -4.45683566e-02
   4.48959389e-02 -1.31008906e-02 -4.83942036e-03 -2.87096206e-02
  -1.05948934e-02  1.30126977e-02 -1.80113394e-02]
 [-4.98097761e-03 -4.53794228e-04 -1.50570136e-02  9.95502408e-01
  -4.13438622e-02  6.24396823e-03  2.30614090e-03  2.39244344e-02
  -2.30550574e-02  8.24259185e-03  1.01332763e-02 -5.63994128e-03
  -5.28899914e-02  2.86331199e-02  1.55417128e-02 -3.78460910e-02
   4.71697806e-03  5.91175857e-04  1.79797189e-02]
 [-8.50717090e-03  1.19766835e-02  9.38013239e-02 -5.38006874e-02
   4.53023494e-02 -2.22446678e-03  2.44808425e-03 -4.77949801e-02
   1.72780329e-02 -3.52158494e-02  1.29448291e-02 -7.24729440e-03
   9.83674735e-01 -2.63726273e-02  1.14150167e-02  1.35994783e-02
  -1.33544863e-02  4.98458432e-02 -6.04523257e-03]
 [-9.18702597e-03 -3.07408271e-02  1.3503

#### 因子得分系数矩阵（回归方法）（系数矩阵的逆乘以因子载荷矩阵）

In [151]:
import numpy.linalg as nlg
data_corr = pd.DataFrame(X).corr()
X1 = np.mat(data_corr)
X1 = nlg.inv(X1)
factor_score = np.dot(X1, fa_rotate.loadings_)
factor_score = pd.DataFrame(factor_score)
factor_score

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0392,0.004331,0.071889,0.042267,1.021563,0.043112,-0.004385,-0.034689,0.026789,-0.007848,-0.000715,0.036114,-0.043223,0.03037,0.010001,0.020951,-0.001415,-0.025178,-0.000451
1,0.004304,-0.002024,0.014041,1.013575,0.043423,-0.00332,-0.003183,-0.026923,0.024759,-0.009316,-0.010186,0.006447,0.04989,-0.027879,-0.015366,0.041213,-0.003725,-0.001706,-0.017629
2,-0.050297,0.003398,-0.079006,0.050721,-0.039555,-0.033411,-0.037851,0.001251,-0.049629,0.01867,0.013821,0.029133,1.014392,-0.014539,-0.025788,0.011225,0.0428,-0.013074,0.052667
3,-0.03208,0.051555,0.001954,-0.007804,-0.004147,-0.000245,-0.008985,0.012685,-0.023755,1.009666,0.04011,0.016744,0.016928,-0.065904,-0.019104,-0.034846,0.02527,0.082521,0.016196
4,-0.007982,-0.006971,-0.004369,-0.00563,-0.004709,0.005702,0.021958,0.066908,0.038794,0.021503,-0.039392,-0.010714,0.041491,0.045299,0.000665,0.01629,0.995385,-0.042811,0.027736
5,-0.088488,0.034823,0.023413,-0.023474,0.040476,-0.025619,-0.088937,-0.053987,-0.03998,-0.065894,-0.005338,0.060852,-0.02074,0.946141,-0.021024,0.028342,0.055875,0.018684,0.074751
6,0.062272,-0.004161,1.015708,0.012497,0.069761,0.04265,0.03912,0.021936,-0.000209,-0.001608,-0.015451,0.00984,-0.081507,0.013057,-0.020174,-0.012636,-0.003035,-0.008461,-0.009732
7,-0.006008,0.017927,0.004704,0.02731,0.032996,-0.04056,-0.022379,-0.06385,0.97648,-0.021938,0.030258,0.030195,-0.051655,-0.033886,0.004733,0.068076,0.04329,0.041173,0.023264
8,-0.012138,0.026264,0.046343,-0.000126,0.003121,-0.048517,0.966942,-0.02166,-0.02491,-0.008249,0.019553,0.050887,-0.041552,-0.086333,-0.02558,0.011493,0.028883,0.052682,0.08056
9,0.902147,0.054846,0.075573,0.009259,0.052328,-0.060538,-0.022489,-0.060258,-0.019708,-0.033939,0.081875,0.046931,-0.059203,-0.091265,-0.018716,0.010083,0.011265,0.076141,0.116834


#### 因子得分

In [152]:
factor_score.columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8','F9', 'F10', 'F11','F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19']
factor_score.index = data_corr.columns
factor_score

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19
0,0.0392,0.004331,0.071889,0.042267,1.021563,0.043112,-0.004385,-0.034689,0.026789,-0.007848,-0.000715,0.036114,-0.043223,0.03037,0.010001,0.020951,-0.001415,-0.025178,-0.000451
1,0.004304,-0.002024,0.014041,1.013575,0.043423,-0.00332,-0.003183,-0.026923,0.024759,-0.009316,-0.010186,0.006447,0.04989,-0.027879,-0.015366,0.041213,-0.003725,-0.001706,-0.017629
2,-0.050297,0.003398,-0.079006,0.050721,-0.039555,-0.033411,-0.037851,0.001251,-0.049629,0.01867,0.013821,0.029133,1.014392,-0.014539,-0.025788,0.011225,0.0428,-0.013074,0.052667
3,-0.03208,0.051555,0.001954,-0.007804,-0.004147,-0.000245,-0.008985,0.012685,-0.023755,1.009666,0.04011,0.016744,0.016928,-0.065904,-0.019104,-0.034846,0.02527,0.082521,0.016196
4,-0.007982,-0.006971,-0.004369,-0.00563,-0.004709,0.005702,0.021958,0.066908,0.038794,0.021503,-0.039392,-0.010714,0.041491,0.045299,0.000665,0.01629,0.995385,-0.042811,0.027736
5,-0.088488,0.034823,0.023413,-0.023474,0.040476,-0.025619,-0.088937,-0.053987,-0.03998,-0.065894,-0.005338,0.060852,-0.02074,0.946141,-0.021024,0.028342,0.055875,0.018684,0.074751
6,0.062272,-0.004161,1.015708,0.012497,0.069761,0.04265,0.03912,0.021936,-0.000209,-0.001608,-0.015451,0.00984,-0.081507,0.013057,-0.020174,-0.012636,-0.003035,-0.008461,-0.009732
7,-0.006008,0.017927,0.004704,0.02731,0.032996,-0.04056,-0.022379,-0.06385,0.97648,-0.021938,0.030258,0.030195,-0.051655,-0.033886,0.004733,0.068076,0.04329,0.041173,0.023264
8,-0.012138,0.026264,0.046343,-0.000126,0.003121,-0.048517,0.966942,-0.02166,-0.02491,-0.008249,0.019553,0.050887,-0.041552,-0.086333,-0.02558,0.011493,0.028883,0.052682,0.08056
9,0.902147,0.054846,0.075573,0.009259,0.052328,-0.060538,-0.022489,-0.060258,-0.019708,-0.033939,0.081875,0.046931,-0.059203,-0.091265,-0.018716,0.010083,0.011265,0.076141,0.116834


In [127]:
fa_t_score = np.dot(np.mat(X), np.mat(factor_score))
F_factors = pd.DataFrame(fa_t_score,columns=factor_score.columns)
F_factors

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19
0,1.966442,-0.781925,-1.001932,0.237488,-0.679117,0.205779,-0.501536,-0.413470,1.273523,1.204803,1.640599,1.406951,-0.049024,0.577568,-0.457655,-0.039107,0.142317,1.371910,-2.293096
1,-2.312696,1.160875,1.590837,-0.936827,0.216236,-2.170603,1.107468,-0.074979,-0.051203,0.424307,-1.716327,-0.898038,-1.303240,-1.455782,-0.816375,-0.387331,0.093053,-0.420900,-2.070036
2,0.350762,1.602506,1.763578,0.673131,0.149583,0.624532,0.254475,0.133674,1.236484,-0.969596,-0.036881,-0.782023,-0.951162,-0.902434,-1.319634,1.507121,-0.039473,0.360071,0.743462
3,-0.912258,0.516650,-0.756864,0.435018,-2.067303,1.743916,0.228434,0.065096,-1.229425,1.128219,1.764262,-0.166315,0.360249,-0.260914,-1.122921,1.532733,0.985550,-1.532641,-1.208909
4,-1.360127,0.580457,-0.747916,0.420657,0.140287,-0.151758,-0.163570,-0.645126,-1.049301,1.933078,-1.624141,-1.332897,-2.024421,-1.275262,1.727452,1.668128,-0.271720,1.058525,1.044577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,-1.117295,-0.165702,-0.617644,0.617462,0.684152,-0.309996,0.299201,-0.640928,-0.536773,0.417371,1.022802,-1.221435,1.226766,-0.742905,0.011968,-0.882381,0.588628,-0.330319,-0.758893
396,-0.383354,0.714908,0.213844,0.398711,0.540778,0.019278,-1.027768,0.179623,0.114938,-0.493236,0.408443,-0.767093,-1.547367,0.835054,-1.082327,0.554991,-0.066951,0.265157,0.358608
397,-0.373192,-1.601794,0.791302,0.112089,0.571566,-2.329558,0.828313,1.204900,-0.998646,-0.338523,0.116594,1.087295,1.343995,-1.144388,0.329565,-0.761981,-1.564318,-0.260572,0.740435
398,-0.466674,0.885831,0.077015,0.378512,0.574737,-0.153949,-0.169178,0.009999,0.989270,-1.889683,1.428152,0.592251,-0.055617,-0.593312,-0.028467,0.606780,0.157557,-0.321664,-0.588835


#### 选择适合的因子个数 

In [128]:
fa = FactorAnalyzer(n_factors=1
                    ,rotation="varimax"
                   ,method="principal"
                   ,use_smc=True)
fa.fit(X)

  "No rotation will be performed when "


FactorAnalyzer(method='principal', n_factors=1, rotation='varimax',
               rotation_kwargs={})

#### 因子载荷矩阵

In [129]:
fa.loadings_

array([[-0.68165989],
       [ 0.37692989],
       [ 0.30957138],
       [-0.03160229],
       [ 0.2020439 ],
       [-0.08688794],
       [ 0.1910542 ],
       [ 0.30283512],
       [-0.03193117],
       [ 0.3707459 ],
       [-0.16801542],
       [ 0.3587255 ],
       [ 0.24698223],
       [-0.00670426],
       [ 0.26426783],
       [ 0.18052656],
       [-0.05187077],
       [-0.09203126],
       [ 0.11987198],
       [ 0.3090591 ]])

#### 因各变量的共同度

In [130]:
fa.get_communalities()

array([4.64660209e-01, 1.42076146e-01, 9.58344414e-02, 9.98704837e-04,
       4.08217376e-02, 7.54951485e-03, 3.65017077e-02, 9.17091079e-02,
       1.01959977e-03, 1.37452521e-01, 2.82291825e-02, 1.28683984e-01,
       6.10002241e-02, 4.49471169e-05, 6.98374843e-02, 3.25898388e-02,
       2.69057713e-03, 8.46975264e-03, 1.43692904e-02, 9.55175272e-02])

#### 因子得分

In [131]:

data_newf = fa.transform(X)
scores_f = pd.DataFrame(data_newf,columns=[["因子"]])
scores_f

Unnamed: 0,因子
0,-1.475829
1,0.333635
2,0.377000
3,0.008845
4,1.873208
...,...
395,-0.210742
396,0.257845
397,-0.622075
398,0.634246
