## Test PCA with scikit-learn

In [18]:
# PCA with scikit-learn on a small 2-D toy dataset.
import numpy as np
from sklearn.decomposition import PCA

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  # shape (n_samples, n_features)

pca = PCA(n_components=2)
# Fit once and keep the projection; the prints below reuse it instead of
# refitting the model for every call.
X_reduced = pca.fit_transform(X)
pca1 = pca  # the former `pca.fit(X)` returned this same estimator; alias kept for compatibility

# Fraction of the total variance captured by each principal component.
# (explained_variance_, without the ratio, holds the corresponding eigenvalues
# of the covariance matrix of X.)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
print('----------------------------')
# n_features_ was deprecated and then removed from scikit-learn; n_features_in_
# is the supported attribute for the number of features seen during fit.
print('number of features/samples in training set', pca.n_features_in_, pca.n_samples_)
print("Reduced data with n_components features: ", X_reduced.shape)
print(X_reduced)
print('--------------------')
print("Principal axes in feature space,array, shape (n_components, n_features):", pca.components_.shape)
print("pca.components_", pca.components_)

[0.99244289 0.00755711]
[6.30061232 0.54980396]
----------------------------
number of features/samples in training set 2 6
Reduced data with n_components features:  (6, 2)
[[ 1.38340578  0.2935787 ]
 [ 2.22189802 -0.25133484]
 [ 3.6053038   0.04224385]
 [-1.38340578 -0.2935787 ]
 [-2.22189802  0.25133484]
 [-3.6053038  -0.04224385]]
--------------------
Principal axes in feature space,array, shape (n_components, n_features): (2, 2)
pca.components_ [[-0.83849224 -0.54491354]
 [ 0.54491354 -0.83849224]]


## Test PCA with my NumPy implementation

In [31]:
# PCA via eigendecomposition of the covariance matrix (NumPy only).
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  # shape (n_samples, n_features)

# PCA is defined on mean-centred data. np.cov centres internally, but the
# projection further down does not, so centre explicitly and use the same
# centred matrix for both steps.
X_centered = X - X.mean(axis=0)
cov = np.cov(X_centered.T)  # for centred data: X_centered.T @ X_centered / (n_samples - 1)

print("Covariance matrix: ", cov)
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# and orthonormal eigenvectors (as columns of w).
v, w = np.linalg.eigh(cov)  # v: shape (n_features,), w: shape (n_features, n_features)

# eigh returns eigenvalues in ascending order; PCA wants the largest first.
idx = v.argsort()[::-1]
v = v[idx]
w = w[:, idx]
print("Eigenvalue: ", v)
print("Eigenvector: ", w)

# Keep the first k principal axes.
k = 1
# Share of the total variance kept by those k axes (the information lost by
# the compression is 1 minus this value).
print("fraction of variance retained by the first k components:", v[:k].sum() / v.sum())
# Note: the sign of each eigenvector is arbitrary, so Z is defined up to a
# sign flip per column.
Z = X_centered.dot(w[:, :k])
print("Sonuc: ", Z)

Covariance matrix:  [[5.6 3.6]
 [3.6 2.4]]
Eigenvalue:  [7.93954312 0.06045688]
Eigenvector:  [[ 0.83849224 -0.54491354]
 [ 0.54491354  0.83849224]]
prcentage of Eigenvalue, loss of pca compression: 0.9924428900898052
Sonuc:  [[-1.38340578]
 [-2.22189802]
 [-3.6053038 ]
 [ 1.38340578]
 [ 2.22189802]
 [ 3.6053038 ]]


## Test PCA with my NumPy + SVD implementation

In [29]:
# PCA via SVD of the data matrix (NumPy only).
# ref: "Statistics, Data Mining, and Machine Learning in Astronomy"
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  # shape (n_samples, n_features)

# The SVD route only yields principal components when the data is
# mean-centred, so centre explicitly rather than relying on the input.
X_centered = X - X.mean(axis=0)
cov = np.cov(X_centered.T)  # for centred data: X_centered.T @ X_centered / (n_samples - 1)
print("Covariance matrix: ", cov)

# method 1: SVD of the transposed, centred data matrix.
# np.linalg.svd returns (U, S, Vh): the columns of U are the principal axes in
# feature space, S holds the singular values in descending order, and V here is
# Vh (already transposed). S**2 / (n_samples - 1) gives the eigenvalues of cov.
# full_matrices=False skips building the unused n_samples x n_samples factor.
U, S, V = np.linalg.svd(X_centered.T, full_matrices=False)
k = 1  # number of principal axes to keep
Z = np.dot(X_centered, U[:, :k])  # projection, shape (n_samples, k)
print(Z.shape, Z)
print("singular value", S)

Covariance matrix:  [[5.6 3.6]
 [3.6 2.4]]
(6, 1) [[ 1.38340578]
 [ 2.22189802]
 [ 3.6053038 ]
 [-1.38340578]
 [-2.22189802]
 [-3.6053038 ]]
singular value [6.30061232 0.54980396]


In [30]:
# PCA via SVD of the covariance matrix (NumPy only).
# ref: "scikit-learn Machine Learning in Practice" (Chinese-language book; title translated)
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  # shape (n_samples, n_features)

# np.cov centres internally, but the projection below does not, so centre
# explicitly and use the same centred matrix for both steps.
X_centered = X - X.mean(axis=0)
cov = np.cov(X_centered.T)  # for centred data: X_centered.T @ X_centered / (n_samples - 1)
print("Covariance matrix: ", cov)

# method 2: SVD of the covariance matrix.
# np.linalg.svd returns (U, S, Vh). cov is symmetric positive semi-definite, so
# the columns of U are its eigenvectors (the principal axes) and S holds its
# eigenvalues in descending order; V here is Vh.
U, S, V = np.linalg.svd(cov)
k = 1  # number of principal axes to keep
Z = np.dot(X_centered, U[:, :k])  # projection, shape (n_samples, k)
print(Z.shape, Z)
print("eigen value", S)

Covariance matrix:  [[5.6 3.6]
 [3.6 2.4]]
(6, 1) [[ 1.38340578]
 [ 2.22189802]
 [ 3.6053038 ]
 [-1.38340578]
 [-2.22189802]
 [-3.6053038 ]]
eigen value [7.93954312 0.06045688]


## Test with cleaned data (recenter + rescale)

In [128]:
# PCA via eigendecomposition on cleaned data (NumPy only).
# Cleaning = recentre each feature to zero mean, then rescale by its range.
X_raw = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  # shape (n_samples, n_features)
X_centered = X_raw - X_raw.mean(axis=0)
# Per-feature range (max - min). A constant feature would make this zero and
# the division below would produce inf/nan.
X = X_centered / (X_centered.max(axis=0) - X_centered.min(axis=0))

cov = np.cov(X.T)
print("Covariance matrix: ", cov)
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# and orthonormal eigenvectors (as columns of w).
v, w = np.linalg.eigh(cov)  # v: shape (n_features,), w: shape (n_features, n_features)

# eigh returns eigenvalues in ascending order; PCA wants the largest first.
idx = v.argsort()[::-1]
v = v[idx]
w = w[:, idx]
print("Eigenvalue vector: ", v)
print("Eigenvector: ", w)

# Keep the first k principal axes. The sign of each eigenvector is arbitrary,
# so Z is defined up to a sign flip per column.
k = 1
Z = X.dot(w[:, :k])
print("Sonuc: ", Z)

Covariance matrix:  [[0.15555556 0.15      ]
 [0.15       0.15      ]]
Eigenvalue vector:  [0.3028035  0.00275206]
Eigenvector:  [[ 0.71362292 -0.70053003]
 [ 0.70053003  0.71362292]]
Sonuc:  [[-0.29406966]
 [-0.41300682]
 [-0.70707648]
 [ 0.29406966]
 [ 0.41300682]
 [ 0.70707648]]


In [131]:
# PCA via SVD of the covariance matrix on cleaned data (NumPy only).
# Cleaning = recentre each feature to zero mean, then rescale by its range.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  # shape (n_samples, n_features)
X = X - X.mean(axis=0)
X = X / (X.max(axis=0) - X.min(axis=0))

# Compute the covariance of *this* cell's cleaned data. Without this line the
# cell silently depends on a `cov` left in the kernel by an earlier cell and
# fails with NameError (or uses a stale matrix) after a kernel restart.
cov = np.cov(X.T)

# np.linalg.svd returns (U, S, Vh). cov is symmetric positive semi-definite, so
# the columns of U are its eigenvectors (the principal axes) and S holds its
# eigenvalues in descending order; V here is Vh.
U, S, V = np.linalg.svd(cov)
k = 1  # number of principal axes to keep
Z = np.dot(X, U[:, :k])  # projection, shape (n_samples, k)
print(Z.shape, Z)

(6, 1) [[ 0.29406966]
 [ 0.41300682]
 [ 0.70707648]
 [-0.29406966]
 [-0.41300682]
 [-0.70707648]]


In [132]:
# scikit-learn PCA on the cleaned data (recentred, then scaled by feature range).
import numpy as np
from sklearn.decomposition import PCA

samples = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  # shape (n_samples, n_features)
centered = samples - samples.mean(axis=0)
feature_range = centered.max(axis=0) - centered.min(axis=0)
X = centered / feature_range

# Keep a single principal component; fit() returns the estimator itself, so
# pca1 and pca refer to the same fitted model.
pca = PCA(n_components=1)
pca1 = pca.fit(X)

projected = pca1.transform(X)  # shape (n_samples, 1)
print(projected)

[[ 0.29406966]
 [ 0.41300682]
 [ 0.70707648]
 [-0.29406966]
 [-0.41300682]
 [-0.70707648]]
