# Principal Component Analysis

In [None]:
import pandas as pd
import numpy as np

### Steps
1. Standardize the features
2. Calculate the covariance matrix
3. Calculate the eigenvalues and eigenvectors of the covariance matrix
4. Create a feature vector to decide which principal components to keep
5. Recast the data along the principal component axes

### 1. Standardization

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# as_frame=True asks the loader for pandas objects, so the data is available as a DataFrame.
cancer = load_breast_cancer(as_frame=True)

In [None]:
# Pull the DataFrame out of the loaded dataset and display it.
df = cancer.frame
df

In [None]:
# Dimensions of the full frame as (n_rows, n_columns).
df.shape

In [None]:
# Keep only the feature columns; these are the inputs to PCA.
x = df.loc[:, cancer.feature_names]
x.shape

In [None]:
# Per-feature mean and sample standard deviation (pandas' std() defaults to ddof=1).
x_mean = x.mean()
x_std = x.std()

# Standardized data: centre every feature on zero and scale it to unit variance.
z = x.sub(x_mean, axis="columns").div(x_std, axis="columns")
z

### Covariance matrix

In [None]:
# Pairwise covariance between the standardized feature columns of z
# (a square, symmetric features-by-features DataFrame).
c = z.cov()
c

In [None]:
# plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Visualise the covariance matrix `c` as a heatmap.
sns.heatmap(c)
plt.show()

In [None]:
# Eigenvalues and eigenvectors of the covariance matrix.
#
# `c` is real and symmetric, so use `eigh` rather than the general-purpose
# `eig`: it guarantees real eigenvalues and orthonormal eigenvectors, whereas
# `eig` can return complex values or non-orthogonal vectors due to rounding.
# Column k of `eigvect` is the eigenvector belonging to `eigval[k]`.
# NOTE: `eigh` returns the eigenvalues in ascending order; they still need to
# be sorted into descending order before selecting components.
eigval, eigvect = np.linalg.eigh(np.asarray(c))

In [None]:
# Inspect the eigenvalues of the covariance matrix.
eigval

In [None]:
# Shape of the eigenvalue array.
eigval.shape

In [None]:
# Shape of the eigenvector matrix (column k pairs with eigval[k]).
eigvect.shape

Sort the eigenvalues, together with their corresponding eigenvectors, in descending order of eigenvalue.


In [None]:
# Toy example: argsort returns the indices that would sort the array in
# ascending order; reversing them gives the descending order.
a = np.array([40, 12, 32, 60])
i = np.argsort(a)[::-1]
i

In [None]:
# Indices that order the eigenvalues from largest to smallest.
i = np.argsort(eigval)[::-1]

# Apply the same ordering to the eigenvalues and to the eigenvector columns
# so each eigenvector stays paired with its eigenvalue.
eigval, eigvect = eigval[i], eigvect[:, i]

In [None]:
# explained variance
# Toy example: running total of the entries versus their grand total.
a = np.array([1, 4, 7, 9])
print(a.cumsum())
print(a.sum())

In [None]:
# Cumulative fraction of the total variance captured by the first k
# eigenvalues (entry k-1 covers components 1..k).
explained_variance = eigval.cumsum() / eigval.sum()
explained_variance

In [None]:
# argmax on a boolean array returns the first True position, i.e. the first
# index whose cumulative explained variance exceeds 0.5; +1 turns that
# zero-based index into a component count.
n = (explained_variance > 0.5).argmax() + 1
n

In [None]:
# projecting the data into selected principal components

# Keep the first n eigenvectors (columns of eigvect) as the projection basis.
u = eigvect[:, :n]

# Label rows by original feature and columns by component. The column labels
# are derived from the number of selected components instead of being
# hard-coded to two, so this keeps working when n is not 2.
pca_component = pd.DataFrame(
    u,
    index=cancer['feature_names'],
    columns=[f"pc{k}" for k in range(1, u.shape[1] + 1)],
)
pca_component

In [None]:
# Heatmap of the component matrix: rows are the original features, columns
# are the selected principal components.
sns.heatmap(pca_component)

In [None]:
# Project the standardized data onto the selected principal components.
z_pca = z @ pca_component

# Rebuild the frame with pca1..pcaK labels. K is taken from the projected
# width rather than hard-coded to two, so any number of components works.
z_pca = pd.DataFrame(
    z_pca.values,
    columns=[f"pca{k}" for k in range(1, z_pca.shape[1] + 1)],
)

z_pca