### Implementation of the Principal Component Analysis (PCA) algorithm

In [1]:
import numpy as np

In [2]:
class PCA_my:
    """
    Principal Component Analysis via eigendecomposition of the covariance matrix.

    Parameters:
    n_components - int, float or None. If int (>= 1), the first n_components components, ordered by
            decreasing eigenvalue, are used. If float (0 < n_components <= 1), the smallest number of
            leading components whose cumulative explained-variance ratio reaches n_components
            is used. If None, all components are used.

    Attributes:
    principal_components_ - (n_selected, n_features) Principal axes in feature space, one per row,
            representing the directions of maximum variance in the data.
    eigenvalues_ - all eigenvalues of the covariance matrix of X, in decreasing order.
    explain_variance_ratio_ - Fraction of the total variance explained by each component
            (all components, in the same order as eigenvalues_).

    Raises:
    ValueError - if n_components has an unsupported type or is out of range.
    """

    def __init__(self, n_components=None):
        # bool is a subclass of int, so it has to be excluded explicitly.
        is_int = (isinstance(n_components, (int, np.integer))
                  and not isinstance(n_components, bool))
        is_float = isinstance(n_components, (float, np.floating))

        if n_components is not None and not (is_int or is_float):
            raise ValueError('n_components. Wrong value')
        if is_int and n_components < 1:
            raise ValueError('n_components. Wrong value')
        # The chained comparison is False for NaN as well, so NaN is rejected here too.
        if is_float and not 0.0 < n_components <= 1.0:
            raise ValueError('n_components. Wrong value')

        self.n_components = n_components
        self.principal_components_ = None
        self.eigenvalues_ = None
        self.explain_variance_ratio_ = None

    def __standardization(self, X: np.ndarray) -> tuple:
        """
        Center X column-wise (no scaling is applied).

        Returns a (centered_X, column_means) tuple.
        """
        mean = np.mean(X, axis=0)
        return X - mean, mean

    def fit(self, X):
        """
        Fit the model on X and project X onto the selected principal components.

        Input
            X - (n_samples, n_features) Training data, where n_samples is the number of samples
            and n_features is the number of features.
        Output
            (n_samples, n_selected) Training data projected onto the selected components.
        Raises
            ValueError if X is not 2-D or has fewer than 2 samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array of shape (n_samples, n_features)')
        if X.shape[0] < 2:
            raise ValueError('X must contain at least 2 samples')

        X_adjust, _ = self.__standardization(X)

        # np.cov collapses to a 0-d array for a single feature; eig needs a square 2-D matrix.
        covariance_matrix = np.atleast_2d(np.cov(X_adjust.T))
        eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
        # The covariance matrix is symmetric, so any imaginary part returned by the
        # general-purpose solver is numerical noise.
        eigenvalues, eigenvectors = np.real(eigenvalues), np.real(eigenvectors)

        # Sort by decreasing eigenvalue using indices only: sorting (value, vector) pairs
        # would fall back to comparing arrays when two eigenvalues tie, which raises.
        order = np.argsort(-eigenvalues, kind="stable")
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order].T  # one eigenvector per row

        magnitudes = np.abs(eigenvalues)
        total = np.sum(magnitudes)
        self.eigenvalues_ = eigenvalues
        if total > 0:
            self.explain_variance_ratio_ = magnitudes / total
        else:
            # Constant data: there is no variance to attribute to any component.
            self.explain_variance_ratio_ = np.zeros_like(magnitudes)

        # The resolved count stays local so that self.n_components keeps the user's setting
        # (None / float) and a later fit on data of another width resolves it afresh.
        n_features = eigenvectors.shape[0]
        if self.n_components is None:
            n_selected = n_features
        elif isinstance(self.n_components, (float, np.floating)):
            cumulative = np.cumsum(self.explain_variance_ratio_)
            # First index whose cumulative ratio reaches the threshold; capped so that
            # round-off just below 1.0 still selects every component instead of none.
            reached = int(np.searchsorted(cumulative, self.n_components))
            n_selected = min(reached + 1, n_features)
        else:
            n_selected = int(self.n_components)

        self.principal_components_ = eigenvectors[:n_selected]

        X_new = np.dot(self.principal_components_, X_adjust.T).T

        return X_new

In [3]:
# Toy data set: 10 samples (rows) with 2 features (columns).
X = np.array(
    [
        (2.5, 2.4),
        (0.5, 0.7),
        (2.2, 2.9),
        (1.9, 2.2),
        (3.1, 3.0),
        (2.3, 2.7),
        (2.0, 1.6),
        (1.0, 1.1),
        (1.5, 1.6),
        (1.1, 0.9),
    ],
    dtype=np.float64,
)

In [4]:
# Fit the custom PCA on X with both components kept, then print everything it learned.
pca = PCA_my(n_components=2)
X_new = pca.fit(X)

report = (
    ("X_new:\n", X_new),
    ("Selected principal components:\n", pca.principal_components_),
    ("Eigenvalues:\n", pca.eigenvalues_),
    ("Percentage of variance explained:\n", pca.explain_variance_ratio_),
)
for heading, values in report:
    print(heading, values)

X_new:
 [[-0.82797019 -0.17511531]
 [ 1.77758033  0.14285723]
 [-0.99219749  0.38437499]
 [-0.27421042  0.13041721]
 [-1.67580142 -0.20949846]
 [-0.9129491   0.17528244]
 [ 0.09910944 -0.3498247 ]
 [ 1.14457216  0.04641726]
 [ 0.43804614  0.01776463]
 [ 1.22382056 -0.16267529]]
Selected principal components:
 [[-0.6778734  -0.73517866]
 [-0.73517866  0.6778734 ]]
Eigenvalues:
 [1.28402771 0.0490834 ]
Percentage of variance explained:
 [0.96318131 0.03681869]


In [5]:
# Reference run: scikit-learn's PCA with 2 components on the same X.
from sklearn.decomposition import PCA
pca_sklearn = PCA(n_components=2)
# Fit and project in a single call; kept for comparison with the custom X_new.
X_new_sklearn = pca_sklearn.fit_transform(X)
# Fitted components_ of the reference model, kept for comparison with
# pca.principal_components_.
pc_sklearn = pca_sklearn.components_

In [7]:
# A principal axis is only defined up to sign (v and -v span the same direction), so two
# correct implementations may differ by a factor of -1 per component. Align the signs of
# our components with the reference ones before comparing; a zero sign (orthogonal axes)
# makes the comparison fail, as it should.
component_signs = np.sign(np.sum(pca.principal_components_ * pc_sklearn, axis=1))
components_match = np.allclose(pca.principal_components_ * component_signs[:, np.newaxis], pc_sklearn)
# Flipping an axis flips the matching column of the projected data.
projection_match = np.allclose(X_new * component_signs, X_new_sklearn)
print("Data transformation. Current implementation equals implementation from sklearn: ", projection_match)
print("Principal components. Current implementation equals implementation from sklearn: ", components_match)

Data transformation. Current implementation equals implementation from sklearn:  True
Principal components. Current implementation equals implementation from sklearn:  True


### References

<ul>
    <li><a href='https://builtin.com/data-science/step-step-explanation-principal-component-analysis'>https://builtin.com/data-science/step-step-explanation-principal-component-analysis</a></li>
    <li><a href='http://www.cs.otago.ac.nz/cosc453/student_tutorials/principal_components.pdf'>http://www.cs.otago.ac.nz/cosc453/student_tutorials/principal_components.pdf</a></li>
</ul>