In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global generator so the synthetic data is reproducible.
np.random.seed(1234)

# 100 samples x 10 features drawn from a standard normal distribution.
data = np.random.randn(100, 10)
# Derive the column labels from the array width so the names cannot fall out
# of sync with the shape passed to randn above.
df = pd.DataFrame(data, columns=[f"var{i+1}" for i in range(data.shape[1])])

df.head()

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10
0,0.471435,-1.190976,1.432707,-0.312652,-0.720589,0.887163,0.859588,-0.636524,0.015696,-2.242685
1,1.150036,0.991946,0.953324,-2.021255,-0.334077,0.002118,0.405453,0.289092,1.321158,-1.546906
2,-0.202646,-0.655969,0.193421,0.553439,1.318152,-0.469305,0.675554,-1.817027,-0.183109,1.058969
3,-0.39784,0.337438,1.047579,1.045938,0.863717,-0.122092,0.124713,-0.322795,0.841675,2.390961
4,0.0762,-0.566446,0.036142,-2.074978,0.247792,-0.897157,-0.136795,0.018289,0.755414,0.215269


In [9]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of `df` with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split done in a later cell, so the held-out rows contribute to the scaling
# statistics. Fit on the training rows only if this feeds a model evaluation.
ss = StandardScaler()
scaled_data = ss.fit_transform(df)

# Reuse the labels of `df` instead of rebuilding them from a hard-coded column
# count, so the scaled frame always lines up with its source.
df_s = pd.DataFrame(scaled_data, columns=df.columns, index=df.index)

df_s.head()

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10
0,0.396759,-1.169771,1.509972,-0.242212,-0.542748,1.019232,0.761109,-0.744043,-0.102699,-2.106934
1,1.199705,1.076854,0.958638,-1.823511,-0.169623,0.045435,0.307314,0.234588,1.24502,-1.43036
2,-0.40084,-0.619152,0.084682,0.559349,1.425383,-0.473264,0.577213,-1.992162,-0.307939,1.103584
3,-0.6318,0.403246,1.06704,1.015153,0.986687,-0.091231,0.026784,-0.412345,0.750016,2.398808
4,-0.070899,-0.527016,-0.096204,-1.873232,0.392094,-0.94402,-0.234528,-0.051725,0.660963,0.283172


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the standardized rows as a test set; the fixed random_state
# keeps the split reproducible.
train, test = train_test_split(
    df_s,
    test_size=0.2,
    random_state=1234,
)
(train.shape, test.shape)

((80, 10), (20, 10))

In [17]:
from sklearn.decomposition import PCA

# svd_solver selects the SVD algorithm:
#   "auto"       - default; picks one of the solvers below based on the data
#                  size and n_components.
#   "full"       - classic exact SVD; suited to small data or when accuracy
#                  matters, at a higher computational cost.
#   "arpack"     - sparse/truncated SVD; suited to sparse data or to
#                  n_components < min(n_samples, n_features).
#   "randomized" - approximate SVD for large data sets; fast, with possibly
#                  slightly lower accuracy.
# No n_components is given here; the cell only inspects the explained-variance
# ratios of the fitted model.
pca = PCA(svd_solver='full').fit(train)
var_list = pca.explained_variance_ratio_

var_list

array([0.15736462, 0.14131353, 0.12313415, 0.10998224, 0.10494287,
       0.09509089, 0.09022798, 0.07331753, 0.06028985, 0.04433633])

In [29]:
# Pull the per-component explained variance and the component vectors out of
# the fitted PCA, then display both.
eigenvalues, eigenvectors = pca.explained_variance_, pca.components_

(eigenvalues, eigenvectors)

(array([1.66869861, 1.49849241, 1.30571776, 1.16625463, 1.11281695,
        1.00834634, 0.95677988, 0.77746108, 0.63931521, 0.47014364]),
 array([[ 0.22319146, -0.30790823, -0.56563344, -0.05256223,  0.52669645,
         -0.43814766, -0.12246097, -0.113729  , -0.07417794,  0.1727954 ],
        [-0.4323397 ,  0.01417735, -0.33773311, -0.17708182, -0.18709533,
         -0.04302926, -0.44670501,  0.35281449, -0.40817447, -0.37413067],
        [ 0.19841493,  0.09055921, -0.07049008,  0.28713364, -0.03659912,
          0.16862827, -0.36123419,  0.62576973,  0.17465679,  0.53165599],
        [-0.23067862,  0.51447626,  0.18734595, -0.59079668,  0.41274831,
         -0.11435047, -0.14367523,  0.06240294,  0.25932748,  0.1507902 ],
        [ 0.47396074,  0.54201885, -0.16837044,  0.00589565,  0.1806307 ,
          0.33020282,  0.11039871, -0.03986343, -0.53849773, -0.08817765],
        [ 0.33641472, -0.22494061, -0.1791467 , -0.66632958, -0.37912524,
          0.0354911 ,  0.34425125,  0.29623

In [18]:
# Report every component count whose cumulative explained-variance ratio is at
# least 60%; the first line printed is the minimum number of dimensions (=> 5).
# The loop runs from 1 to len(var_list) inclusive so that the empty prefix is
# skipped and the full set of components is also evaluated.
for i in range(1, len(var_list) + 1):
    # var_list[:i] holds the ratios of the first i components.
    s = np.sum(var_list[:i])
    if s >= 0.6:
        print(i, s)

5 0.6367374081720558
6 0.7318283000849632
7 0.8220562793741056
8 0.8953738143649844
9 0.9556636682091915


In [20]:
# Running total of the explained-variance ratios: entry k is the share of
# variance captured by the first k+1 components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
cumulative_variance

array([0.15736462, 0.29867815, 0.4218123 , 0.53179454, 0.63673741,
       0.7318283 , 0.82205628, 0.89537381, 0.95566367, 1.        ])

In [22]:
# argmax over a boolean array gives the index of the first True, i.e. the
# first position where the cumulative ratio reaches 60%; adding 1 converts
# that zero-based index into a component count.
reaches_threshold = cumulative_variance >= 0.60
min_components = np.argmax(reaches_threshold) + 1
min_components

5

In [26]:
# Dimensionality reduction: project the training rows with the full PCA and
# keep only the first `min_components` columns.
# Column labels are generated from `min_components` so they always match the
# width of the slice instead of assuming exactly five components.
train_reduced = pd.DataFrame(
    pca.transform(train)[:, :min_components],
    columns=[f"v{i+1}" for i in range(min_components)],
)
train_reduced.head()

Unnamed: 0,v1,v2,v3,v4,v5
0,1.364746,1.380137,-0.094587,-0.006229,0.720995
1,-1.875474,-0.766516,0.450379,-0.797257,-1.441772
2,-0.190794,0.582614,0.107553,-0.960593,2.221013
3,2.360462,0.792242,-0.882309,-0.115069,-1.478089
4,-1.266641,0.493597,-1.97391,0.613511,-1.357879


In [28]:
# Project the held-out rows with the PCA fitted on the training rows and keep
# only the first `min_components` columns.
# Column labels are generated from `min_components` so they always match the
# width of the slice instead of assuming exactly five components.
test_reduced = pd.DataFrame(
    pca.transform(test)[:, :min_components],
    columns=[f"v{i+1}" for i in range(min_components)],
)
test_reduced.head()

Unnamed: 0,v1,v2,v3,v4,v5
0,-0.019794,-0.224283,0.42331,0.699655,-1.499431
1,-1.306574,0.564127,1.472364,1.029241,-0.30632
2,-0.426331,-0.875099,-0.407616,-0.331106,-0.489932
3,-0.189382,1.536476,0.391513,-2.171239,0.157084
4,-0.642996,0.140483,-1.724839,0.532127,0.494353


In [30]:
# --------------------------------------------------------- #

In [36]:
# Alternative route: fit a PCA that keeps only `min_components` components and
# transform the training rows in a single fit_transform call.
component_names = [f"v{i+1}" for i in range(min_components)]
pca_reduced = PCA(n_components=min_components)
reduced_data = pca_reduced.fit_transform(train)
df_reduced = pd.DataFrame(reduced_data, columns=component_names)
df_reduced

Unnamed: 0,v1,v2,v3,v4,v5
0,1.364746,1.380137,-0.094587,-0.006229,0.720995
1,-1.875474,-0.766516,0.450379,-0.797257,-1.441772
2,-0.190794,0.582614,0.107553,-0.960593,2.221013
3,2.360462,0.792242,-0.882309,-0.115069,-1.478089
4,-1.266641,0.493597,-1.973910,0.613511,-1.357879
...,...,...,...,...,...
75,0.750611,2.213803,2.296645,0.583381,-0.958052
76,1.537186,1.775188,-0.888037,-0.084382,-0.812130
77,-0.317917,-1.864794,-2.070302,-0.288442,1.012760
78,-0.086512,-1.092315,0.748343,-0.734778,-1.293337


In [41]:
# Exact, element-wise comparison of the two reduced frames. It evaluates to
# False here even though the displayed values match.
# NOTE(review): presumably floating-point round-off between the two
# computation paths (transform-then-slice vs. fit_transform) - confirm.
df_reduced.equals(train_reduced) # The values differ in their trailing decimal places.

False

In [52]:
# Count, per column, how many rows of the two reduced frames agree to within
# 10**-num.
# An absolute-tolerance test is used instead of comparing rounded values,
# because two nearly identical floats can still round to different digits when
# they fall on opposite sides of a rounding boundary.
num = 10
tolerance = 10.0 ** -num
((train_reduced - df_reduced).abs() <= tolerance).sum()

v1    80
v2    80
v3    80
v4    80
v5    80
dtype: int64