In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
# Step 1: build the sample dataset -- 10 records, each with 3 numeric features.
records = [
    (2.5, 2.4, 1.2),
    (0.5, 0.7, 0.3),
    (2.2, 2.9, 1.4),
    (1.9, 2.2, 1.1),
    (3.1, 3.0, 1.7),
    (2.3, 2.7, 1.5),
    (2.0, 1.6, 0.9),
    (1.0, 1.1, 0.4),
    (1.5, 1.6, 0.8),
    (1.1, 0.9, 0.2),
]
# Keep the raw values as a 2-D array (one row per record, one column per feature).
data = np.array(records)
# Wrap the array in a labelled frame so later steps can work column-wise.
df = pd.DataFrame(data=data, columns=["X", "Y", "Z"])
df

Unnamed: 0,X,Y,Z
0,2.5,2.4,1.2
1,0.5,0.7,0.3
2,2.2,2.9,1.4
3,1.9,2.2,1.1
4,3.1,3.0,1.7
5,2.3,2.7,1.5
6,2.0,1.6,0.9
7,1.0,1.1,0.4
8,1.5,1.6,0.8
9,1.1,0.9,0.2


In [13]:
# Step 2: standardize each column (subtract its mean, divide by its standard
# deviation) using pandas operations; the original author notes that this
# pandas form avoids NaN issues.
column_means = df.mean()
column_stds = df.std()
# axis="columns" aligns the per-column statistics with the frame's column labels.
standardized_data = df.sub(column_means, axis="columns").div(column_stds, axis="columns")

In [14]:
# Step 3: covariance matrix of the standardized features.
# rowvar=False tells np.cov that each column is a variable and each row an
# observation, which is how standardized_data is laid out.
cov_matrix = np.cov(standardized_data, rowvar=False)
cov_matrix

array([[1.        , 0.92592927, 0.93688744],
       [0.92592927, 1.        , 0.98192228],
       [0.93688744, 0.98192228, 1.        ]])

In [15]:
# Step 4: eigen-decomposition of the covariance matrix.
# A covariance matrix is symmetric, so use eigh (the symmetric/Hermitian solver)
# rather than the general-purpose eig: eigh always returns real eigenvalues and
# orthonormal eigenvectors, whereas eig can yield complex dtypes or
# non-orthogonal vectors under floating-point round-off.
# Column eig_vecs[:, i] is the eigenvector paired with eig_vals[i].
# Note: eigh returns eigenvalues in ascending order and eigenvector signs are
# arbitrary, so the eigenpairs still need sorting and a component may come out
# negated compared with a run that used eig.
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)

In [17]:
# Step 5: reorder the eigenpairs from largest to smallest eigenvalue.
# argsort gives ascending positions; reversing them yields descending order.
ascending_indices = np.argsort(eig_vals)
sorted_indices = ascending_indices[::-1]
# Apply the same permutation to the values and to the eigenvector columns so
# that eig_vecs[:, i] stays paired with eig_vals[i].
eig_vals, eig_vecs = eig_vals[sorted_indices], eig_vecs[:, sorted_indices]

In [18]:
# Step 6: keep the leading eigenvectors (the first columns of eig_vecs).
n_components = 2  # number of eigenvector columns retained for the projection
eig_vecs_subset = eig_vecs[:, :n_components]

In [19]:
# Step 7: project the standardized data onto the selected eigenvectors.
standardized_matrix = standardized_data.to_numpy()
# (records x features) . (features x kept components) -> (records x kept components)
reduced_data = np.dot(standardized_matrix, eig_vecs_subset)
# Label the projected coordinates as principal components.
reduced_df = pd.DataFrame(data=reduced_data, columns=["PC1", "PC2"])
reduced_df

Unnamed: 0,PC1,PC2
0,1.114772,0.287461
1,-2.502976,-0.278755
2,1.461615,-0.429478
3,0.430754,-0.162708
4,2.517452,0.26013
5,1.508478,-0.278933
6,-0.129861,0.401224
7,-1.754639,-0.043244
8,-0.604269,-0.055335
9,-2.041325,0.29964


In [20]:
# Step 8: print the reduced data as plain text.
# The heading and the frame are printed with separate calls: passing both to a
# single print() inserts the default " " separator in front of the frame's
# first line, which shifts the column header one character out of alignment
# with the rows beneath it.
print("\nReduced Data (after PCA):")
print(reduced_df)


Reduced Data (after PCA):
         PC1       PC2
0  1.114772  0.287461
1 -2.502976 -0.278755
2  1.461615 -0.429478
3  0.430754 -0.162708
4  2.517452  0.260130
5  1.508478 -0.278933
6 -0.129861  0.401224
7 -1.754639 -0.043244
8 -0.604269 -0.055335
9 -2.041325  0.299640
