# PCA implemention step by step

In [1]:
# imports
import numpy as np
import pandas as pd

In [21]:
# creating sample data frame
np.random.seed(42)

mu_vector1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vector1, cov_mat1, 20)

df = pd.DataFrame(class1_sample, columns=['f1','f2','f3'])
df['target']=1

mu_vector2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vector2, cov_mat2, 20)

df1 = pd.DataFrame(class2_sample, columns=['f1','f2','f3'])
df1['target'] = 0

df = pd.concat([df,df1], ignore_index=True).sort_index()
df.sample(40)

Unnamed: 0,f1,f2,f3,target
32,1.29612,1.261055,1.005113,0
36,1.25755,0.925554,-0.918771,0
35,1.404051,2.886186,1.174578,0
22,0.92799,2.003533,1.361636,0
11,-1.057711,0.822545,-1.220844,1
25,1.821903,1.087047,0.700993,0
13,0.196861,0.738467,0.171368,1
12,0.208864,-1.95967,-1.328186,1
9,0.375698,-0.600639,-0.291694,1
39,-0.168678,2.142823,1.751933,0


In [23]:
## Now let's visualize our dataframe in 3d graph using plotly
import plotly.express as px

fig = px.scatter_3d(df,
                    x=df['f1'],
                    y=df['f2'],
                    z=df['f3'],
                    color = df['target'].astype('str'))

fig.update_traces(marker=dict(size=12,line=dict(width=2,color='DarkSlateGrey')),selector=dict(mode='markers'))

fig.show()

In [24]:
## Now let's work on converting our 3-d features into 2-d features using pca

### Step-01 Apply standard scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df.iloc[:,0:3] = scaler.fit_transform(df.iloc[:,0:3])
df.describe()

Unnamed: 0,f1,f2,f3,target
count,40.0,40.0,40.0,40.0
mean,4.4408920000000007e-17,0.0,4.4408920000000007e-17,0.5
std,1.012739,1.012739,1.012739,0.50637
min,-2.033271,-1.977095,-1.712738,0.0
25%,-0.8482332,-0.728686,-0.6683381,0.0
50%,0.07106243,0.069194,0.05344254,0.5
75%,0.793006,0.653999,0.5640908,1.0
max,1.835061,1.969207,2.593645,1.0


In [29]:
# step-02 Find covariance matrix
covariance_matrix = np.cov(df.iloc[:, 0:3].values, rowvar=False)
print(f"Covariance Matrix: \n{covariance_matrix}")

Covariance Matrix: 
[[1.02564103 0.35175    0.07987862]
 [0.35175    1.02564103 0.28328008]
 [0.07987862 0.28328008 1.02564103]]


In [30]:
# step-03 Find ev and evs
eigen_value, eigen_vectors = np.linalg.eig(covariance_matrix)
eigen_value, eigen_vectors

(array([1.51825759, 0.94771609, 0.6109494 ]),
 array([[-0.56014874, -0.62132881,  0.5478904 ],
        [-0.67561935, -0.04005698, -0.73616162],
        [-0.47934526,  0.78252536,  0.3973439 ]]))

In [31]:
# top 2 pc's
pc = eigen_vectors[0:2]
pc

array([[-0.56014874, -0.62132881,  0.5478904 ],
       [-0.67561935, -0.04005698, -0.73616162]])

In [32]:
tranformed_df = np.dot(df.iloc[:,0:3], pc.T)
new_df = pd.DataFrame(tranformed_df, columns=['pc1', 'pc2'])
new_df['target'] = df['target'].values

In [33]:
new_df.head()

Unnamed: 0,pc1,pc2,target
0,0.416681,-0.206204,1
1,-0.708108,-0.59572,1
2,-1.364202,-0.535582,1
3,0.040356,0.443141,1
4,0.427521,1.532677,1
