In [1]:
import numpy as np
####################################################################################################
               # HELPER FUNCTIONS FOR THE NOTEBOOK
####################################################################################################

def frobenius_norm_difference(A, B):
    """
        Relative Frobenius-norm distance between two matrices, in percent.

        Evaluates ``||A - B||_F / ||A||_F * 100``; ``A`` is the reference
        matrix the difference is scaled by.
    """
    difference_norm = np.linalg.norm(A - B, 'fro')
    reference_norm = np.linalg.norm(A, 'fro')
    return difference_norm / reference_norm * 100

def sample_covariance_estimator(X):
    """
        Uncentered sample covariance of a 2-D data matrix.

        Returns ``X @ X.T`` divided by the number of columns of ``X``
        (``X.shape[1]``). No mean is subtracted before the product is formed.
    """
    num_columns = X.shape[1]
    return np.dot(X, X.T) / num_columns

def generate_low_rank_data(n_features, n_samples, rank):
    """
    Draws a random ``(n_features, n_samples)`` matrix of rank at most ``rank``.

    The result is the product of two standard-normal factors drawn from
    NumPy's global random state. The requested rank is clipped to
    ``min(rank, n_features, n_samples)`` before the factors are drawn.
    """
    inner_dim = min(rank, n_features, n_samples)
    # Draw the left factor first so the random stream is consumed in a fixed order.
    left_factor = np.random.randn(n_features, inner_dim)
    right_factor = np.random.randn(inner_dim, n_samples)
    return left_factor @ right_factor



def compare_eigenvalues(eigvals_sample, eigvals_nystrom):
    """
        Relative error between two arrays of eigenvalues.

        Returns the norm of ``eigvals_sample - eigvals_nystrom`` divided by
        the norm of ``eigvals_sample`` (``np.linalg.norm`` with its default
        order), so ``eigvals_sample`` is the reference.
    """
    gap = np.linalg.norm(eigvals_sample - eigvals_nystrom)
    scale = np.linalg.norm(eigvals_sample)
    return gap / scale

def compare_eigenvectors(eigvecs_sample, eigvecs_nystrom, k):
    """
        Relative Frobenius error between the first ``k`` columns of two
        eigenvector matrices, after aligning column signs.

        Eigenvectors are only defined up to sign, so each column of the
        second matrix is negated when its dot product with the matching
        reference column is negative. Neither input array is modified.

        Parameters:
        eigvecs_sample (numpy.ndarray): Reference eigenvectors, one per column.
        eigvecs_nystrom (numpy.ndarray): Eigenvectors to compare, one per column.
        k (int): Number of leading columns to compare.

        Returns:
        float: ``||S_k - N_k||_F / ||S_k||_F`` with ``N_k`` sign-aligned to ``S_k``.
    """
    reference = eigvecs_sample[:, :k]
    # Basic slicing returns a view; copy so the in-place sign flips below
    # cannot leak into the caller's array.
    aligned = eigvecs_nystrom[:, :k].copy()

    # Adjust signs column by column
    for col in range(reference.shape[1]):
        if np.dot(reference[:, col], aligned[:, col]) < 0:
            aligned[:, col] *= -1

    return np.linalg.norm(reference - aligned, 'fro') / np.linalg.norm(reference, 'fro')

####################################################################################################
                # NYSTROM PRINCIPAL COMPONENT ANALYSIS 
####################################################################################################
def nystrom_pca(X, num_landmarks):
    """
        Nyström-style low-rank estimate of the covariance ``X @ X.T / n``.

        ``num_landmarks`` rows of ``X`` are drawn uniformly at random without
        replacement (using NumPy's global random state). A factor ``W`` is
        built from the thin SVD of those rows and the estimate ``W @ W.T`` is
        returned.

        Parameters:
        X (numpy.ndarray): Data matrix of shape (p, n); rows are features and
            columns are samples.
        num_landmarks (int): Number of landmark rows to draw; may exceed n but
            not p.

        Returns:
        numpy.ndarray: The (p, p) matrix ``W @ W.T``.

        Raises:
        ValueError: From ``np.random.choice`` when ``num_landmarks > p``.
    """
    p, n = X.shape  # p: number of features, n: number of samples

    # Step 1: Select landmark features (randomly select num_landmarks rows)
    indices_I = np.random.choice(p, num_landmarks, replace=False)
    Y = X[indices_I, :]  # Y.shape = (num_landmarks, n)

    # Step 2: The remaining rows form the complement block
    indices_J = np.setdiff1d(np.arange(p), indices_I)
    Z = X[indices_J, :]  # Z.shape = (p - num_landmarks, n)

    # Step 3: Compute thin SVD of the landmark block
    U_Y, D_Y, V_Y_T = np.linalg.svd(Y, full_matrices=False)
    # The thin SVD yields min(num_landmarks, n) components, which is fewer
    # than num_landmarks when more landmarks than samples are requested.
    num_components = D_Y.shape[0]
    # NOTE(review): zero singular values are not truncated here; confirm this
    # is intended when the landmark block is rank-deficient.

    # Step 4: Construct the landmark and complement parts of the factor
    W_Y = (1 / np.sqrt(n)) * U_Y @ np.diag(D_Y)
    W_Z = (1 / np.sqrt(n)) * Z @ V_Y_T.T

    # Scatter both parts back into the original row order of X. Sizing W from
    # the SVD output (not from num_landmarks) keeps the assignments valid when
    # num_landmarks > n.
    W = np.zeros((p, num_components))
    W[indices_I, :] = W_Y
    W[indices_J, :] = W_Z

    return W @ W.T

####################################################################################################
                # EXAMPLE USAGE
####################################################################################################



In [57]:
def mse(Sigma, Sigma_hat, X, k):
    """
    Evaluates a closed-form mean-square-error expression for the Nyström
    covariance estimator.

    Parameters:
    Sigma (numpy.ndarray): Square matrix used in the sample-covariance term.
    Sigma_hat (numpy.ndarray): Square matrix used in the Schur-complement term
        and in the Frobenius-norm term.
    X (numpy.ndarray): 2-D data matrix; only its shape is read.
    k (int): The number of landmarks used in the Nyström method.

    Returns:
    float: ``MSE_Sigma + ((n - k)^2 / n^2) * (||Sigma_hat||_F - MSE_Schur)``,
        where ``MSE_M = (tr(M @ M) + tr(M)^2) / m`` with ``m = n`` for
        ``Sigma`` and ``m = n - k`` for ``Sigma_hat``.
    """
    # NOTE(review): n is read from the FIRST axis of X here, whereas
    # sample_covariance_estimator and nystrom_pca take the sample count from
    # the second axis -- confirm which orientation is intended.
    n, _ = X.shape
    landmark_cov = Sigma_hat

    # Schur-complement term, scaled by the n - k remaining degrees of freedom
    schur_trace_sum = np.trace(landmark_cov @ landmark_cov) + np.trace(landmark_cov) ** 2
    schur_term = (1 / (n - k)) * schur_trace_sum

    # Sample-covariance term
    sample_trace_sum = np.trace(Sigma @ Sigma) + np.trace(Sigma) ** 2
    sample_term = (1 / n) * sample_trace_sum

    # NOTE(review): the Frobenius norm enters unsquared while the other terms
    # are quadratic in the matrix entries -- verify against the source formula.
    weight = ((n - k) ** 2) / n ** 2
    correction = np.linalg.norm(landmark_cov, 'fro') - schur_term

    return sample_term + weight * correction

In [59]:
import pandas as pd
# Landmark count, shared by the Nyström estimator and its MSE so the two
# calls below cannot drift apart.
NUM_LANDMARKS = 10

# NOTE(security): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
X = pd.read_pickle('russell_minute_data.pkl').values
# NOTE(review): sample_covariance_estimator and nystrom_pca treat rows as
# features and columns as samples, while mse reads n from the first axis --
# confirm the orientation of the pickled data.

# Sample covariance matrix
Sigma_simple = sample_covariance_estimator(X)
# nystrom_pca draws its landmarks from NumPy's global random state, so the
# printed numbers vary between runs unless that state is seeded beforehand.
Sigma_nystrom = nystrom_pca(X, NUM_LANDMARKS)

print(frobenius_norm_difference(Sigma_simple, Sigma_nystrom))
print(mse(Sigma_simple, Sigma_nystrom, X, NUM_LANDMARKS))

45.413007866823236
0.003276069134803515
