In [283]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import numpy.linalg as LA
import math
import sys
from scipy.stats import chi2, norm, zscore
from matplotlib import pyplot

"""
・主成分分析に適切な固有ベクトルに変換する関数
    pca関数の補助
"""
def verify_eig_vecs(eig_vecs):
    """Return a copy of ``eig_vecs`` whose columns all have a non-negative component sum.

    Each column is treated as one eigenvector. A column whose components sum
    to a negative number is multiplied by -1; every other column is kept
    as it is.

    Parameters
    ----------
    eig_vecs : numpy.ndarray
        2-D array whose columns are the vectors to orient. The sign test
        takes a dot product with a ones-vector of length ``shape[1]``, so the
        array has to be square for the call to succeed.

    Returns
    -------
    numpy.ndarray
        New array of the same shape with the re-oriented columns.
    """
    n_cols = eig_vecs.shape[1]
    ones = np.ones(n_cols)

    # Seed the stack with an empty float block so the result dtype is promoted
    # against float64 even when no column (or an integer column) is supplied.
    stacked_rows = [np.empty((0, n_cols))]
    for column in eig_vecs.T:
        oriented = -1 * column if ones.dot(column) < 0 else column
        stacked_rows.append(np.array([oriented]))

    # Rows of the stack are the oriented vectors; transpose back to columns.
    return np.concatenate(stacked_rows, axis=0).T

def eigen(Matrix):
    """Eigen-decompose ``Matrix`` and return the result sorted by descending eigenvalue.

    The eigenvectors returned by ``numpy.linalg.eig`` are first passed through
    ``verify_eig_vecs`` and then reordered together with their eigenvalues.

    Parameters
    ----------
    Matrix : numpy.ndarray
        Square matrix to decompose.

    Returns
    -------
    tuple
        ``(eig_vals, eig_vecs)``: eigenvalues from largest to smallest and the
        matrix whose column ``k`` is the eigenvector for ``eig_vals[k]``.
    """
    raw_vals, raw_vecs = LA.eig(Matrix)
    oriented_vecs = verify_eig_vecs(raw_vecs)

    # argsort is ascending; reverse it to put the largest eigenvalue first.
    descending = np.argsort(raw_vals)[::-1]

    return raw_vals[descending], oriented_vecs[:, descending]

def standardize(designMatrix):
    """Z-score every column of ``designMatrix`` independently.

    Parameters
    ----------
    designMatrix : numpy.ndarray
        2-D array; each column is passed to ``scipy.stats.zscore`` on its own.

    Returns
    -------
    numpy.ndarray
        Array of the same shape holding the standardized columns.
    """
    n_rows = designMatrix.shape[0]
    n_cols = designMatrix.shape[1]
    by_variable = designMatrix.T

    # One (1, n_rows) slab per variable, stacked after an empty float block so
    # the result dtype is promoted against float64.
    slabs = [np.empty((0, n_rows))]
    slabs.extend(np.array([zscore(by_variable[k])]) for k in range(n_cols))

    return np.concatenate(slabs, axis=0).T

"""
・主成分分析に用いる計画行列の固有ベクトル及び固有値の標準誤差を求める関数
    C: NumPy配列の共分散行列か相関行列
"""
def vce(C, n=None):
    """Compute standard-error style quantities for the eigenvectors and eigenvalues of ``C``.

    Parameters
    ----------
    C : numpy.ndarray
        Square covariance or correlation matrix.
    n : int, optional
        Number of observations the matrix was estimated from. When omitted,
        ``C.shape[0]`` is used, which reproduces the previous behaviour. Note
        that for a covariance/correlation matrix ``C.shape[0]`` is the number
        of variables, not the sample size, so callers that know the sample
        size should pass it explicitly.

    Returns
    -------
    tuple
        ``(v, r)`` where ``v[:, i]`` holds the values computed for eigenvector
        ``i`` and ``r[i]`` the value computed for eigenvalue ``i``. The order
        is the one returned by ``numpy.linalg.eig`` (not sorted).

    Notes
    -----
    NOTE(review): every term is divided by ``(n - 1)`` and the total is divided
    by ``n`` again before the square root is taken. This is kept exactly as it
    was; confirm it against the intended formula.
    Two equal eigenvalues make the ``(eig_vals[i] - eig_vals[j]) ** 2``
    denominator zero.
    """
    eig_vals, eig_vecs = LA.eig(C)
    eig_vecs = verify_eig_vecs(eig_vecs)
    p = C.shape[1]    # number of parameters
    if n is None:
        # Backward-compatible fallback: this is the matrix dimension.
        n = C.shape[0]

    # Column j holds the squared components of eigenvector j; computed once
    # instead of being rebuilt inside the double loop.
    squared_vecs = np.square(eig_vecs)

    v = np.zeros((p, p))
    for i in range(p):
        for j in range(p):
            if j != i:
                u_ij = eig_vals[i] * eig_vals[j] / ((eig_vals[i] - eig_vals[j]) ** 2) / (n - 1)
                v[i] += u_ij * squared_vecs[:, j]

    r = 2 * eig_vals ** 2 / (n - 1)

    v = np.sqrt(v.T / n)
    r = np.sqrt(r / n)
    return v, r

def pcf(Matrix, target):
    """Run a principal-component factor analysis on the correlation matrix and print a report.

    Parameters
    ----------
    Matrix : pandas.DataFrame
        Data set; the columns named in ``target`` are analysed.
    target : list of str
        Names of the columns to use as variables.

    Returns
    -------
    None
        The eigenvalue table, the LR test line and the loading/uniqueness
        table are printed, nothing is returned.

    Notes
    -----
    The number of retained factors is the count of leading eigenvalues of the
    correlation matrix that are >= 1.
    """
    designMatrix = Matrix[target].values
    FactorLoading = DataFrame({'Variables': target})

    n = designMatrix.shape[0]
    p = designMatrix.shape[1]

    Factor = DataFrame({'Factor': ['Factor' + str(i + 1) for i in range(p)]})

    R = np.corrcoef(standardize(designMatrix), rowvar=False)
    eigenvalue, eigenvectors = eigen(R)
    sum_eigenvalue = sum(eigenvalue)

    # Eigenvalues arrive in descending order: keep the leading run that is >= 1.
    q = 0
    for i in range(p):
        if eigenvalue[i] >= 1:
            q = i + 1
        if eigenvalue[i] < 1:
            break

    # Loading of factor i = sqrt(eigenvalue i) * eigenvector i.
    loadings = [math.sqrt(eigenvalue[i]) * eigenvectors.T[i] for i in range(q)]
    for i, loading_i in enumerate(loadings):
        FactorLoading['Factor ' + str(i + 1)] = loading_i
    L = np.array(loadings).reshape(q, p)    # (q, p), also valid when q == 0

    # Uniqueness = diagonal of R minus the part reproduced by the loadings.
    LL = (L.T).dot(L)
    DD = R - LL
    FactorLoading['Uniqueness'] = np.diag(DD).copy()

    proportion = eigenvalue / sum_eigenvalue
    Factor['Eigenvalue'] = np.array(eigenvalue)
    Factor['Proportion'] = proportion
    Factor['Cumulative'] = np.cumsum(proportion)

    print('Factor analysis/correlation')
    print('\tMethod: principal-component factors')
    print('\tRotation: (unrotated)\n')
    print(Factor)
    # NOTE(review): Bartlett's statistic is usually written with
    # (n - 1 - (2p + 5) / 6); this uses n. Confirm which n is intended.
    independence = - ( n - (2 * p + 5) / 6) * math.log(LA.det(R))
    df = p * (p-1) / 2
    print( 'LR test for independence: chi' + '('+ str(round(df)) + ') = ' + str(round(independence, 2)) 
            + '  Prob > chi2 = ' +str(round(chi2.sf(independence, df, loc=0, scale=1), 4)))
    print('\nFactor loadings (pattern matrix) and unique variances\n')
    print(FactorLoading)
    
def factor(Matrix, target):
    """Run a principal-factor analysis on the correlation matrix, print a report, return the loadings.

    The diagonal of the correlation matrix ``R`` is replaced by
    ``1 - 1 / inv(R)[i][i]`` before the eigen-decomposition that yields the
    loadings.

    Parameters
    ----------
    Matrix : pandas.DataFrame
        Data set; the columns named in ``target`` are analysed.
    target : list of str
        Names of the columns to use as variables.

    Returns
    -------
    numpy.ndarray
        Loading matrix of shape ``(p, q)``: one row per variable, one column
        per retained factor.

    Raises
    ------
    ValueError
        From ``math.sqrt`` if one of the first ``q`` eigenvalues of the
        reduced matrix is negative.

    Notes
    -----
    ``q`` is the count of leading eigenvalues of the unreduced ``R`` that are
    >= 1; the loadings themselves use the eigen-decomposition of the reduced
    matrix.
    """
    designMatrix = Matrix[target].values
    FactorLoading = DataFrame({'Variables': target})

    n = designMatrix.shape[0]
    p = designMatrix.shape[1]

    Factor = DataFrame({'Factor': ['Factor' + str(i + 1) for i in range(p)]})

    R = np.corrcoef(standardize(designMatrix), rowvar=False)
    eigenvalue = eigen(R)[0]
    R_0 = R - np.identity(p)    # R with its diagonal reduced by one
    invR = LA.inv(R)

    # Number of factors: leading eigenvalues of R that are >= 1.
    q = 0
    for i in range(p):
        if eigenvalue[i] >= 1:
            q = i + 1
        if eigenvalue[i] < 1:
            break

    # Reduced correlation matrix: diagonal becomes 1 - 1 / invR[i][i].
    R_1 = R - np.diag([1 / invR[i][i] for i in range(p)])
    eigenvalue, eigenvectors = eigen(R_1)
    sum_eigenvalue = sum(eigenvalue)

    loadings = [math.sqrt(eigenvalue[i]) * eigenvectors.T[i] for i in range(q)]
    for i, loading_i in enumerate(loadings):
        FactorLoading['Factor ' + str(i + 1)] = loading_i
    L = np.array(loadings).reshape(q, p)    # (q, p), also valid when q == 0

    # Keep only the diagonal of L'L (the communalities) on top of R_0, so the
    # diagonal of R - LL is the uniqueness of each variable.
    LL = (L.T).dot(L)
    LL = R_0 + np.diag([LL[i][i] for i in range(p)])
    DD = R - LL
    FactorLoading['Uniqueness'] = np.diag(DD).copy()

    proportion = eigenvalue / sum_eigenvalue
    Factor['Eigenvalue'] = np.array(eigenvalue)
    Factor['Proportion'] = proportion
    Factor['Cumulative'] = np.cumsum(proportion)

    print('Factor analysis/correlation')
    print('\tMethod: principal factors')
    print('\tRotation: (unrotated)\n')
    print(Factor)
    # NOTE(review): Bartlett's statistic is usually written with
    # (n - 1 - (2p + 5) / 6); this uses n. Confirm which n is intended.
    independence = - ( n - (2 * p + 5) / 6) * math.log(LA.det(R))
    df = p * (p-1) / 2
    print( 'LR test for independence: chi' + '('+ str(round(df)) + ') = ' + str(round(independence, 2)) 
            + '  Prob > chi2 = ' +str(round(chi2.sf(independence, df, loc=0, scale=1), 4)))
    # This header used to be printed twice in a row; print it once.
    print('\nFactor loadings (pattern matrix) and unique variances\n')
    print(FactorLoading)
    
    return L.T
    
def ipf(Matrix, target, tol=1e-7, max_iter=1000):
    """Run an iterated principal-factor analysis on the correlation matrix and print a report.

    Starting from the reduced correlation matrix (diagonal replaced by
    ``1 - 1 / inv(R)[i][i]``), the loadings of the first ``q`` factors are
    recomputed and their communalities written back onto the diagonal until
    the squared change of the trace drops below ``tol``.

    Parameters
    ----------
    Matrix : pandas.DataFrame
        Data set; the columns named in ``target`` are analysed.
    target : list of str
        Names of the columns to use as variables.
    tol : float, optional
        Convergence threshold on ``(previous trace - new trace) ** 2``.
        Defaults to the previously hard-coded 0.0000001.
    max_iter : int, optional
        Exclusive upper bound of the iteration counter, where count 1 is the
        initial estimate. The default of 1000 gives at most 998 refinement
        passes, as before.

    Returns
    -------
    None
        The eigenvalue table, the LR test line and the loading/uniqueness
        table are printed, nothing is returned.

    Raises
    ------
    ValueError
        From ``math.sqrt`` if one of the first ``q`` eigenvalues of a reduced
        matrix is negative.

    Warns
    -----
    RuntimeWarning
        If the iteration stops without meeting ``tol``. The last iterate is
        then reported; previously the tables were printed without any
        eigenvalue, loading or uniqueness columns in that case.
    """
    import warnings

    designMatrix = Matrix[target].values
    FactorLoading = DataFrame({'Variables': target})

    n = designMatrix.shape[0]
    p = designMatrix.shape[1]

    Factor = DataFrame({'Factor': ['Factor' + str(i + 1) for i in range(p)]})

    R = np.corrcoef(standardize(designMatrix), rowvar=False)
    eigenvalue = eigen(R)[0]
    R_0 = R - np.identity(p)    # R with its diagonal reduced by one
    invR = LA.inv(R)

    # Number of factors: leading eigenvalues of R that are >= 1.
    q = 0
    for i in range(p):
        if eigenvalue[i] >= 1:
            q = i + 1
        if eigenvalue[i] < 1:
            break

    def _loadings(values, vectors):
        # Loading of factor j = sqrt(eigenvalue j) * eigenvector j, first q factors.
        return [math.sqrt(values[j]) * vectors.T[j] for j in range(q)]

    # Initial estimate (iteration count 1).
    R_i = R - np.diag([1 / invR[k][k] for k in range(p)])
    preEigenvalue, preEigenvectors = eigen(R_i)
    preTrace = np.trace(R_i)

    loadings = None
    converged = False
    for _ in range(2, max_iter):
        loadings = _loadings(preEigenvalue, preEigenvectors)
        L_i = np.array(loadings).reshape(q, p)
        communality = (L_i.T).dot(L_i)
        R_i = R_0 + np.diag([communality[k][k] for k in range(p)])
        postTrace = np.trace(R_i)

        if (preTrace - postTrace) ** 2 < tol:
            converged = True
            break

        preEigenvalue, preEigenvectors = eigen(R_i)
        preTrace = postTrace

    if loadings is None:
        # No refinement pass ran (max_iter <= 2): report the initial estimate.
        loadings = _loadings(preEigenvalue, preEigenvectors)
    if not converged:
        warnings.warn('ipf: iteration did not converge; reporting the last iterate',
                      RuntimeWarning)

    # `loadings` are the ones that produced the current R_i.
    DD = R - R_i
    for j, loading_j in enumerate(loadings):
        FactorLoading['Factor ' + str(j + 1)] = loading_j
    FactorLoading['Uniqueness'] = np.diag(DD).copy()

    eigenvalue, eigenvectors = eigen(R_i)
    sum_eigenvalue = sum(eigenvalue)
    proportion = eigenvalue / sum_eigenvalue
    Factor['Eigenvalue'] = np.array(eigenvalue)
    Factor['Proportion'] = proportion
    Factor['Cumulative'] = np.cumsum(proportion)

    print('Factor analysis/correlation')
    print('\tMethod: iterated principal factors')
    print('\tRotation: (unrotated)\n')
    print(Factor)
    # NOTE(review): Bartlett's statistic is usually written with
    # (n - 1 - (2p + 5) / 6); this uses n. Confirm which n is intended.
    independence = - ( n - (2 * p + 5) / 6) * math.log(LA.det(R))
    df = p * (p-1) / 2
    print( 'LR test for independence: chi' + '('+ str(round(df)) + ') = ' + str(round(independence, 2)) 
            + '  Prob > chi2 = ' +str(round(chi2.sf(independence, df, loc=0, scale=1), 4)))
    print('\nFactor loadings (pattern matrix) and unique variances\n')
    print(FactorLoading)

'''
test
'''
def standardizeL(L):
    """Scale every row of ``L`` to unit Euclidean length.

    Parameters
    ----------
    L : numpy.ndarray
        2-D array; each row is divided by the square root of the sum of its
        squared entries.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with the rescaled rows.
    """
    n_factors = L.shape[1]
    row_lengths = np.sqrt(np.sum(np.square(L), axis=1))

    # Divide row i by row_lengths[i] for all rows at once.
    rescaled = L / row_lengths[:, None]

    # Stack after an empty float block so the result dtype is promoted
    # against float64.
    return np.concatenate([np.empty((0, n_factors)), rescaled], axis=0)

if __name__ == '__main__':
    # Demo run: principal-factor analysis of five score columns from the CSV,
    # followed by the loadings rescaled to unit length per variable.
    target = ['Japanese', 'Math', 'English', 'Science', 'Sociology']
    data = pd.read_csv("testdata2.csv")
    Matrix = data

    # The other two methods can be run on the same input:
    #   pcf(Matrix, target)
    #   ipf(Matrix, target)
    L = factor(Matrix, target)

    print('\n')
    print(standardizeL(L))


Factor analysis/correlation
	Method: principal factors
	Rotation: (unrotated)

    Factor  Eigenvalue  Proportion  Cumulative
0  Factor1    2.444886    0.878825    0.878825
1  Factor2    0.725566    0.260808    1.139633
2  Factor3   -0.039338   -0.014140    1.125493
3  Factor4   -0.134474   -0.048337    1.077156
4  Factor5   -0.214646   -0.077156    1.000000
LR test for independence: chi(10) = 41.58  Prob > chi2 = 0.0

Factor loadings (pattern matrix) and unique variances


Factor loadings (pattern matrix) and unique variances

   Variables  Factor 1  Factor 2  Uniqueness
0   Japanese  0.734395 -0.328857    0.352517
1       Math  0.545892  0.548157    0.401526
2    English  0.809888 -0.032515    0.343024
3    Science  0.609014  0.418246    0.454172
4  Sociology  0.762060 -0.375440    0.278309


[[ 0.91267378 -0.40868885]
 [ 0.70564088  0.70856964]
 [ 0.99919505 -0.04011541]
 [ 0.82432659  0.56611454]
 [ 0.89704398 -0.44194128]]


In [252]:
# Scratch cell: quotient of two hard-coded literals; where the two values come
# from is not shown in this notebook.
0.20197952 / 1.4931936

0.13526679996485386