# Importing Necessary Libraries

In [1]:
# Arrays and DataFrames
import numpy as np
import pandas as pd

# NumPy Linear Algebra Routines
from numpy.linalg import svd
from numpy.linalg import eig

# Scikit Learn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Initializing Parameters

In [2]:
# Number of principal components to keep in every PCA variant below
pca_features = 3

# Root folder of the project files; the dataset is read from its
# 'DATASETS/' sub-folder.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
path = 'C:/Users/prash/Downloads/ML ALGORITHMS/'

# Importing and Cleaning Dataset

In [3]:
# Read the Breast Cancer Wisconsin CSV from the configured folder
data = pd.read_csv(
    path + 'DATASETS/' + 'breast_cancer_wisconsin.csv'
)

# Remove the last and the first column by position (per the original note:
# an extra trailing column created on import, and the id column)
data = data.drop(columns=data.columns[[-1, 0]])

# Encode the diagnosis labels numerically: 'M' -> 1.0, 'B' -> -1.0
# (any other label becomes NaN, as with Series.map)
diag_map = {'M': 1.0, 'B': -1.0}
data = data.assign(diagnosis=data['diagnosis'].map(diag_map))

# Separate the target from the features; the features are every column
# after the first one
Y = data['diagnosis']
X = data.iloc[:, 1:]

# Standardise the features; X becomes a NumPy array from here on
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Per-feature mean of the scaled data
mean_vec = X.mean(axis=0)

# Implementation from Scratch - Eigen Value Method

In [4]:
# PCA via eigendecomposition of the sample covariance matrix.

# Centre the data once so that the covariance matrix and the final
# projection are computed from the same matrix
X_centered = X - mean_vec
num_features = X.shape[1]

# Reject an invalid component count up front. The previous loop-based
# construction silently returned one component for pca_features < 1 and
# raised an IndexError for pca_features > num_features.
if not 1 <= pca_features <= num_features:
    raise ValueError(
        'pca_features must be between 1 and %d, got %r'
        % (num_features, pca_features)
    )

# Compute the covariance matrix (unbiased, divides by n - 1)
cov_mat = X_centered.T.dot(X_centered) / (X.shape[0] - 1)

# The covariance matrix is symmetric, so use the symmetric solver: eigh
# always returns real eigenvalues/eigenvectors, whereas the general eig
# routine may return complex values caused by round-off.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# Make a list of (eigenvalue, eigenvector) tuples; eigenvectors are the
# columns of eig_vecs
eig_pairs = [(eig_vals[i], eig_vecs[:, i]) for i in range(len(eig_vals))]

# Sort the tuples from largest to smallest eigenvalue. The key is required:
# comparing whole tuples would fall through to comparing arrays on ties.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Projection matrix: the top eigenvectors side by side as columns,
# shape (num_features, pca_features)
proj_mat = np.column_stack([vec for _, vec in eig_pairs[:pca_features]])

# Project the centred data onto the principal axes.
# Note: the sign of each component is arbitrary and may differ between
# solvers; only the axis direction up to sign is determined.
X_pca_eigen = X_centered.dot(proj_mat)

print('The New Features obtained from Eigen Value Method are:\n\n',X_pca_eigen)

The New Features obtained from Eigen Value Method are:

 [[ 9.19283683  1.94858307 -1.12316616]
 [ 2.3878018  -3.76817174 -0.52929269]
 [ 5.73389628 -1.0751738  -0.55174759]
 ...
 [ 1.25617928 -1.90229671  0.56273053]
 [10.37479406  1.67201011 -1.87702933]
 [-5.4752433  -0.67063679  1.49044308]]


# Implementation from Scratch - Singular Value Decomposition (SVD) Method

In [5]:
# Thin SVD of the centred data: (X - mean_vec) = U @ diag(s) @ vt
U, s, vt = svd(X - mean_vec, full_matrices=False)

# Square diagonal matrix holding the singular values
S = np.diag(s)

# Keep the first `pca_features` left singular vectors and the matching
# top-left block of S; their product gives the new features
leading_U = U[:, :pca_features]
leading_S = S[:pca_features, :pca_features]
X_pca_svd = leading_U @ leading_S

print('The New Features obtained from SVD Method are:\n\n',X_pca_svd)

The New Features obtained from SVD Method are:

 [[ -9.19283683  -1.94858307  -1.12316616]
 [ -2.3878018    3.76817174  -0.52929269]
 [ -5.73389628   1.0751738   -0.55174759]
 ...
 [ -1.25617928   1.90229671   0.56273053]
 [-10.37479406  -1.67201011  -1.87702933]
 [  5.4752433    0.67063679   1.49044308]]


# Scikit Learn Implementation

In [6]:
# Initializing the scikit-learn model with the requested number of components
pca = PCA(n_components=pca_features)

# Fit on the same matrix that is transformed below. PCA subtracts the
# per-feature mean learned during fit, so no manual centring is needed;
# fitting on centred data but transforming uncentred data (as before) would
# shift the result whenever the feature means are not zero.
pca.fit(X)

# Transforming the dataset to get new features
X_pca_sklearn = pca.transform(X)

# Print the scikit-learn result (previously this printed X_pca_eigen, so the
# scikit-learn output was never actually shown). Component signs may differ
# from the from-scratch cells because the sign of a principal axis is arbitrary.
print('The New Features obtained from Scikit Learn Method are:\n\n',X_pca_sklearn)

The New Features obtained from Scikit Learn Method are:

 [[ 9.19283683  1.94858307 -1.12316616]
 [ 2.3878018  -3.76817174 -0.52929269]
 [ 5.73389628 -1.0751738  -0.55174759]
 ...
 [ 1.25617928 -1.90229671  0.56273053]
 [10.37479406  1.67201011 -1.87702933]
 [-5.4752433  -0.67063679  1.49044308]]


### References:
https://towardsdatascience.com/principal-component-analysis-your-tutorial-and-code-9719d3d3f376 <br>
https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21 <br>
https://www.analyticsvidhya.com/blog/2019/08/5-applications-singular-value-decomposition-svd-data-science/