In [1]:
import numpy as np

In [37]:
# Toy dataset stored as an (n, d) matrix: each of the n = 6 rows is one
# observation and each of the d = 5 columns is one feature.
data = np.array(
    [
        [1, 2, -1, 4, 10],
        [3, -3, -3, 12, -15],
        [2, 1, -2, 4, 5],
        [5, 1, -5, 10, 5],
        [2, 3, -3, 5, 12],
        [4, 0, -3, 16, 2],
    ]
)

In [57]:
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    k : int, default 2
        Number of leading principal components kept by `transform`.
    """

    def __init__(self, k=2):
        # Honour the caller's choice; this used to be hard-coded to 2.
        self.k = k

    def transform(self, data):
        """Standardize `data` and project it onto its top `self.k` principal components.

        Parameters
        ----------
        data : array-like of shape (n, d)
            One observation per row, one feature per column.

        Returns
        -------
        numpy.ndarray of shape (n, min(self.k, d))
            The standardized data expressed in the basis of the leading
            principal components, ordered by decreasing explained variance.
            The sign of each component is arbitrary.

        Side effects
        ------------
        Prints the fraction of total variance captured by the kept components.
        """
        data = np.asarray(data, dtype=float)

        column_std = data.std(axis=0)
        # A constant column has zero spread; divide by 1 so it stays at 0
        # after centering instead of turning the whole result into NaN.
        column_std = np.where(column_std == 0, 1.0, column_std)
        standardized_data = (data - data.mean(axis=0)) / column_std

        # use `ddof = 1` if using sample data (default assumption) and use `ddof = 0` if using population data
        covariance_matrix = np.cov(standardized_data, ddof=1, rowvar=False)

        # The covariance matrix is symmetric, so `eigh` applies: it returns real
        # eigenvalues and orthonormal eigenvectors.
        # The column eigen_vectors[:, i] is the eigenvector for eigen_values[i].
        eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix)

        # Sort eigenvalues (and the matching eigenvector columns) in descending order.
        sort_order = np.argsort(eigen_values)[::-1]
        sorted_eigen_values = eigen_values[sort_order]
        sorted_eigen_vectors = eigen_vectors[:, sort_order]  # sort the columns

        explained_variance = sorted_eigen_values / np.sum(eigen_values)
        print(f"Explained variance: {np.sum(explained_variance[:self.k])}")

        # Project onto the first `self.k` components (not a stray global `k`).
        return np.matmul(standardized_data, sorted_eigen_vectors[:, :self.k])

In [58]:
# Build a PCA instance using the constructor's default argument (k=2).
pca = PCA()

In [59]:
# Project `data` onto its leading principal components; `transform` prints the
# explained-variance fraction and the returned array is shown as the cell output.
pca.transform(data)

Explained variance: 0.9244022929534579


array([[ 2.3577116 , -0.75728867],
       [-2.27171739, -1.81970663],
       [ 1.21259114, -0.50390931],
       [-1.41935914,  1.9229856 ],
       [ 1.61562536,  0.87541857],
       [-1.49485157,  0.28250044]])